Spaces:
Sleeping
Sleeping
File size: 3,304 Bytes
265ae36 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.
import argparse
import logging
import os
from pathlib import Path
from typing import List, Optional
import submitit
from dinov2.utils.cluster import (
get_slurm_executor_parameters,
get_slurm_partition,
get_user_checkpoint_path,
)
# Module-level logger. It is registered under the fixed name "dinov2" (not
# this module's own name), so messages emitted here go through whatever
# handlers/levels are configured for the "dinov2" logger.
logger = logging.getLogger("dinov2")
def get_args_parser(
    description: Optional[str] = None,
    parents: Optional[List[argparse.ArgumentParser]] = None,
    add_help: bool = True,
) -> argparse.ArgumentParser:
    """Build the argument parser holding the job-submission options.

    Args:
        description: Text forwarded to ``argparse.ArgumentParser`` as its
            description.
        parents: Parent parsers whose arguments are inherited; ``None`` (or
            any falsy value) is treated as an empty list.
        add_help: Whether the parser registers the ``-h/--help`` option.

    Returns:
        A parser exposing ``--ngpus`` (aliases ``--gpus``,
        ``--gpus-per-node``), ``--nodes`` (alias ``--nnodes``),
        ``--timeout``, ``--partition``, ``--use-volta32``, ``--comment`` and
        ``--exclude``.
    """
    # Resolved up front: it is the default value of --partition.
    default_partition = get_slurm_partition()

    parser = argparse.ArgumentParser(
        description=description,
        parents=parents or [],
        add_help=add_help,
    )

    # (option strings, add_argument keyword arguments), in registration order.
    option_table = (
        (
            ("--ngpus", "--gpus", "--gpus-per-node"),
            {"default": 8, "type": int, "help": "Number of GPUs to request on each node"},
        ),
        (
            ("--nodes", "--nnodes"),
            {"default": 1, "type": int, "help": "Number of nodes to request"},
        ),
        (
            ("--timeout",),
            {"default": 2800, "type": int, "help": "Duration of the job"},
        ),
        (
            ("--partition",),
            {"default": default_partition, "type": str, "help": "Partition where to submit"},
        ),
        (
            ("--use-volta32",),
            {"action": "store_true", "help": "Request V100-32GB GPUs"},
        ),
        (
            ("--comment",),
            {"default": "", "type": str, "help": "Comment to pass to scheduler, e.g. priority message"},
        ),
        (
            ("--exclude",),
            {"default": "", "type": str, "help": "Nodes to exclude"},
        ),
    )
    for option_strings, settings in option_table:
        parser.add_argument(*option_strings, **settings)

    return parser
def get_shared_folder() -> Path:
    """Return the ``experiments`` directory under the user checkpoint path.

    The directory is created if it does not exist yet. Only that last path
    component is created: ``mkdir`` is called without ``parents=True``.

    Returns:
        ``get_user_checkpoint_path() / "experiments"``.

    Raises:
        RuntimeError: If ``get_user_checkpoint_path()`` returns ``None``.
    """
    checkpoint_root = get_user_checkpoint_path()
    if checkpoint_root is not None:
        experiments_dir = checkpoint_root / "experiments"
        experiments_dir.mkdir(exist_ok=True)
        return experiments_dir
    raise RuntimeError("Path to user checkpoint cannot be determined")
def submit_jobs(task_class, args, name: str):
    """Instantiate ``task_class(args)`` and submit it through submitit.

    Args:
        task_class: Callable invoked as ``task_class(args)``; its result is
            what gets passed to ``executor.submit``.
        args: Namespace-like object. Reads ``output_dir``, ``use_volta32``,
            ``comment``, ``exclude``, ``nodes``, ``ngpus``, ``timeout`` and
            ``partition``. ``output_dir`` is overwritten in place when it is
            empty.
        name: Job name forwarded to ``executor.update_parameters``.

    Returns:
        None. The job id and the resolved output directory are logged.
    """
    # Fall back to "<shared folder>/%j" when no output directory was given.
    # NOTE(review): "%j" is presumably the job-id placeholder understood by
    # submitit; this function itself only substitutes it in the final log line.
    if not args.output_dir:
        args.output_dir = str(get_shared_folder() / "%j")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    executor = submitit.AutoExecutor(folder=args.output_dir, slurm_max_num_timeout=30)

    # Optional scheduler settings: only the ones whose value is truthy are kept.
    optional_settings = {
        key: value
        for key, value in (
            ("slurm_constraint", "volta32gb" if args.use_volta32 else None),
            ("slurm_comment", args.comment),
            ("slurm_exclude", args.exclude),
        )
        if value
    }

    executor.update_parameters(
        name=name,
        **get_slurm_executor_parameters(
            nodes=args.nodes,
            num_gpus_per_node=args.ngpus,
            timeout_min=args.timeout,  # max is 60 * 72
            slurm_signal_delay_s=120,
            slurm_partition=args.partition,
            **optional_settings,
        ),
    )

    job = executor.submit(task_class(args))
    logger.info(f"Submitted job_id: {job.job_id}")

    # Show the concrete location by substituting the job id for "%j".
    absolute_output_dir = os.path.abspath(args.output_dir)
    str_output_dir = absolute_output_dir.replace("%j", str(job.job_id))
    logger.info(f"Logs and checkpoints will be saved at: {str_output_dir}")
|