# Spaces: Running / Running  (web-page residue from extraction; not part of the script)
import sys | |
from pathlib import Path | |
sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) | |
import argparse | |
import os | |
from jean_zay.launch import JeanZayExperiment | |
def parse_mode(argv=None):
    """Parse the command-line options for this script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default), argparse falls back to ``sys.argv[1:]``.

    Returns:
        argparse.Namespace: carries the attributes ``launch``,
        ``src_csv_dir``, ``src_images_dir``, ``dest``,
        ``num_samples_per_tar`` and ``batch_size``.

    Raises:
        SystemExit: raised by argparse on invalid arguments, on ``--help``,
            or when ``--src_csv_dir`` is missing.
    """
    parser = argparse.ArgumentParser(
        description="Extract embeddings from YFCC dataset using DINOv2"
    )
    parser.add_argument(
        "--launch",
        action="store_true",
        help="Launch the experiment",
    )
    # Required: the script globs this directory for CSV files right after
    # parsing, so a missing value would otherwise surface as a TypeError
    # from Path(None) instead of a usage message.
    parser.add_argument(
        "--src_csv_dir",
        required=True,
        help="path to source csv directory",
    )
    parser.add_argument("--src_images_dir", help="path to source images directory")
    parser.add_argument("--dest", help="path to destination")
    parser.add_argument(
        "--num_samples_per_tar",
        help="number of samples per tar",
        type=int,
        default=10000,
    )
    parser.add_argument("--batch_size", help="batch size", type=int, default=256)
    return parser.parse_args(argv)
# Command-line options are parsed at module level because the experiment
# definition below depends on them.
args = parse_mode()

# The number of CSV files in the source directory is passed to the experiment
# as the SLURM array size.
number_of_jobs = len(list(Path(args.src_csv_dir).glob("*.csv")))
if number_of_jobs == 0:
    # Nothing to process: fail early with a clear message rather than
    # requesting an empty job array.
    raise SystemExit(
        f"No CSV files found in {args.src_csv_dir!r}; nothing to launch."
    )

cmd_modifiers = []
exps = []

exp_name = "yfcc_preprocessing"
job_name = "yfcc_preprocessing"
jz_exp = JeanZayExperiment(
    exp_name,
    job_name,
    slurm_array_nb_jobs=number_of_jobs,
    cmd_path="data/to_webdataset/yfcc_to_wds.py",
    num_nodes=1,
    num_gpus_per_node=1,
    qos="t3",
    account="syq",
    gpu_type="a100",
    time="1:30:00",
)
exps.append(jz_exp)

trainer_modifiers = {}
# Options forwarded to the command at ``cmd_path``. ``--job_offset`` is passed
# as the literal text ``${SLURM_ARRAY_TASK_ID}``; presumably it is expanded by
# the shell inside each array task -- confirm against JeanZayExperiment.
exp_modifier = {
    "--src_csv_dir": args.src_csv_dir,
    "--src_images_dir": args.src_images_dir,
    "--dest": args.dest,
    "--num_samples_per_tar": args.num_samples_per_tar,
    "--job_offset": "${SLURM_ARRAY_TASK_ID}",
    "--batch_size": args.batch_size,
}
# Later mapping wins on key collisions, same as dict(trainer_modifiers, **exp_modifier).
cmd_modifiers.append({**trainer_modifiers, **exp_modifier})

if __name__ == "__main__":
    for exp, cmd_modifier in zip(exps, cmd_modifiers):
        exp.build_cmd(cmd_modifier)
        # Without --launch the command is only built, not launched.
        if args.launch:
            exp.launch()