# -*- coding: utf-8 -*-
# :noTabs=true:

import os, sys, time, collections, math
import stat as stat_module

try:
    from .base import *
except ImportError:  # workaround for the B2 back-end
    import imp
    # A bit of Python magic: the effect is `from base import *`, but the path to base.py
    # is computed from the location of this source file.
    imp.load_source(__name__, '/'.join(__file__.split('/')[:-1]) + '/base.py')  # from base import HPC_Driver, execute, NT

_T_slurm_array_job_template_ = '''\
#!/bin/bash
#
#SBATCH --job-name={name}
#SBATCH --output={log_dir}/.hpc.%x.%a.output
#
#SBATCH --time={time}:00
#SBATCH --mem-per-cpu={memory}M
#SBATCH --chdir={working_dir}
#
#SBATCH --array=1-{jobs_to_queue}
srun {executable} {arguments}
'''
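
# For illustration, a sketch of what submit_serial_hpc_job (below) renders from this template.
# All values here -- job name, paths, task count, binary -- are hypothetical, not taken from this code:
#
#   #!/bin/bash
#   #
#   #SBATCH --job-name=relax
#   #SBATCH --output=/work/relax/logs/.hpc.%x.%a.output
#   #
#   #SBATCH --time=720:00
#   #SBATCH --mem-per-cpu=512M
#   #SBATCH --chdir=/work/relax
#   #
#   #SBATCH --array=1-100
#   srun /path/to/my_app --task ${SLURM_ARRAY_TASK_ID}
#
# SLURM expands %x to the job name and %a to the array-task index in the --output filename pattern.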

_T_slurm_mpi_job_template_ = '''\
#!/bin/bash
#
#SBATCH --job-name={name}
#SBATCH --output={log_dir}/.hpc.%x.output
#
#SBATCH --time={time}:00
#SBATCH --mem-per-cpu={memory}M
#SBATCH --chdir={working_dir}
#
#SBATCH --ntasks={ntasks}
mpirun {executable} {arguments}
'''
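
# Unlike the array template above, this one queues a single job that requests {ntasks} ranks and
# starts one mpirun; its --output pattern has no %a because there is no array index.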


class Slurm_HPC_Driver(HPC_Driver):

    def head_node_execute(self, message, command_line, *args, **kwargs):
        head_node = self.config['slurm'].get('head_node')

        # If a head node is configured, wrap the command in ssh and re-enter the current directory
        # on the remote side; otherwise run the command locally.
        command_line, host = (f"ssh {head_node} cd `pwd` '&& {command_line}'", head_node) if head_node else (command_line, 'localhost')

        return execute(f'Executing on {host}: {message}' if message else '', command_line, *args, **kwargs)
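
    # Illustration only (hypothetical host and command, assuming execute() runs the string through a
    # local shell): with head_node == 'login01' and command_line == 'sbatch job.slurm', the line built
    # above is
    #
    #     ssh login01 cd `pwd` '&& sbatch job.slurm'
    #
    # The backticks are expanded by the *local* shell, so the remote shell receives
    # `cd /same/working/dir && sbatch job.slurm`; this assumes the working directory lives on a
    # filesystem shared with the head node.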

    # NodeGroup = collections.namedtuple('NodeGroup', 'nodes cores')

    # @property
    # def mpi_topology(self):
    #     ''' return list of NodeGroup's
    #     '''
    #     pass

    # @property
    # def number_of_cpu_per_node(self): return int( self.config['condor']['mpi_cpu_per_node'] )

    # @property
    # def maximum_number_of_mpi_cpu(self):
    #     return self.number_of_cpu_per_node * int( self.config['condor']['mpi_maximum_number_of_nodes'] )

    # def complete(self, condor_job_id):
    #     ''' Return job completion status. Note that a single hpc_job may contain an inner list of individual HPC jobs; True should be returned only if they all ran to completion.
    #     '''
    #     execute('Releasing condor jobs...', 'condor_release $USER', return_='tuple')
    #     s = execute('', 'condor_q $USER | grep $USER | grep {}'.format(condor_job_id), return_='output', terminate_on_failure=False).replace(' ', '').replace('\n', '')
    #     if s: return False
    #     # #setDaemonStatusAndPing('[Job #%s] Running... %s condor job(s) in queue...' % (self.id, len(s.split('\n') ) ) )
    #     # n_jobs = len(s.split('\n'))
    #     # s, o = execute('', 'condor_userprio -all | grep $USER@', return_='tuple')
    #     # if s == 0:
    #     #     jobs_running = o.split()
    #     #     jobs_running = 'XX' if len(jobs_running) < 4 else jobs_running[4]
    #     #     self.set_daemon_message("Waiting for condor to finish HPC jobs... [{} jobs in HPC-Queue, {} CPU's used]".format(n_jobs, jobs_running) )
    #     #     print "{} condor jobs in queue... Sleeping 32s... \r".format(n_jobs),
    #     #     sys.stdout.flush()
    #     #     time.sleep(32)
    #     else:
    #         #self.tracer('Waiting for condor to finish the jobs... DONE')
    #         self.jobs.remove(condor_job_id)
    #         self.cpu_usage += self.get_condor_accumulated_usage()
    #         return True  # jobs already finished; remove them from tracking to prevent double counting of cpu_usage

    def complete(self, slurm_job_id):
        ''' Return True if the job with the given id is complete
        '''
        s = self.head_node_execute('', f'squeue -j {slurm_job_id} --noheader', return_='output', terminate_on_failure=False, silent=True)

        if s: return False
        else:
            #self.tracer('Waiting for SLURM to finish the jobs... DONE')
            self.jobs.remove(slurm_job_id)
            return True  # the job has already finished; drop it from self.jobs to prevent double counting of cpu_usage
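
    # Note on complete(): `squeue -j <id> --noheader` prints one line per job still in the queue and
    # nothing once the job has left it (finished, failed, or cancelled), so "complete" here means
    # "no longer queued or running", not "exited successfully".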

    def cancel_job(self, slurm_job_id):
        self.head_node_execute(f'Slurm_HPC_Driver.canceling job {slurm_job_id}...', f'scancel {slurm_job_id}', terminate_on_failure=False)

    # def submit_hpc_job(self, name, executable, arguments, working_dir, jobs_to_queue, log_dir, memory=512, time=12, block=True, shell_wrapper=False):
    #     print('submit_hpc_job is DEPRECATED and will be removed in the near future, please use submit_serial_hpc_job instead!')
    #     return self.submit_serial_hpc_job(name, executable, arguments, working_dir, jobs_to_queue, log_dir, memory, time, block, shell_wrapper)

    def submit_serial_hpc_job(self, name, executable, arguments, working_dir, jobs_to_queue, log_dir, memory=512, time=12, block=True, shell_wrapper=False):
        # Each array task gets its index through SLURM's environment variable. Note that %a is only
        # expanded in #SBATCH filename patterns (e.g. --output), not in the command line itself.
        arguments = arguments.format(process='${SLURM_ARRAY_TASK_ID}')
        time = int( math.ceil(time*60) )  # hours -> minutes, rendered by the template as --time={time}:00

        if shell_wrapper:
            shell_wrapper_sh = os.path.abspath(self.working_dir + f'/hpc.{name}.shell_wrapper.sh')

            with open(shell_wrapper_sh, 'w') as f:
                f.write('#!/bin/bash\n{} {}\n'.format(executable, arguments))
                os.fchmod(f.fileno(), stat_module.S_IEXEC | stat_module.S_IREAD | stat_module.S_IWRITE)

            executable, arguments = shell_wrapper_sh, ''

        slurm_file = working_dir + f'/.hpc.{name}.slurm'
        with open(slurm_file, 'w') as f: f.write( _T_slurm_array_job_template_.format( **vars() ) )

        slurm_job_id = self.head_node_execute('Submitting SLURM array job...', f'cd {self.working_dir} && sbatch {slurm_file}',
                                              tracer=self.tracer, return_='output'
        ).split()[-1]  # expecting output like `Submitted batch job 6122`

        self.jobs.append(slurm_job_id)

        if block:
            self.wait_until_complete( [slurm_job_id] )
            return None

        else: return slurm_job_id
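
    # Usage sketch (hypothetical values; assumes `driver` is an already configured Slurm_HPC_Driver,
    # whose constructor is inherited from HPC_Driver in base.py):
    #
    #     job_id = driver.submit_serial_hpc_job(
    #         name='relax', executable='/path/to/my_app',
    #         arguments='--task {process} --out out.{process}',  # {process} is substituted at submit time
    #         working_dir='/work/relax', jobs_to_queue=100, log_dir='/work/relax/logs',
    #         memory=2048, time=24, block=False)
    #     ...
    #     if driver.complete(job_id): print('array job finished')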

    def submit_mpi_hpc_job(self, name, executable, arguments, working_dir, log_dir, ntasks, memory=512, time=12, block=True, shell_wrapper=False):
        ''' Submit the job as a single MPI job
        '''
        arguments = arguments.format(process='0')  # MPI jobs have no array index, so {process} is fixed to 0
        time = int( math.ceil(time*60) )  # hours -> minutes, rendered by the template as --time={time}:00

        if shell_wrapper:
            shell_wrapper_sh = os.path.abspath(self.working_dir + f'/hpc.{name}.shell_wrapper.sh')

            with open(shell_wrapper_sh, 'w') as f:
                f.write('#!/bin/bash\n{} {}\n'.format(executable, arguments))
                os.fchmod(f.fileno(), stat_module.S_IEXEC | stat_module.S_IREAD | stat_module.S_IWRITE)

            executable, arguments = shell_wrapper_sh, ''

        slurm_file = working_dir + f'/.hpc.{name}.slurm'
        with open(slurm_file, 'w') as f: f.write( _T_slurm_mpi_job_template_.format( **vars() ) )

        slurm_job_id = self.head_node_execute('Submitting SLURM MPI job...', f'cd {self.working_dir} && sbatch {slurm_file}',
                                              tracer=self.tracer, return_='output'
        ).split()[-1]  # expecting output like `Submitted batch job 6122`

        self.jobs.append(slurm_job_id)

        if block:
            self.wait_until_complete( [slurm_job_id] )
            return None

        else: return slurm_job_id
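
    # Usage sketch for the MPI path (hypothetical values, same assumptions as the serial sketch above):
    #
    #     job_id = driver.submit_mpi_hpc_job(
    #         name='dock', executable='/path/to/my_mpi_app', arguments='--in input.{process}',
    #         working_dir='/work/dock', log_dir='/work/dock/logs', ntasks=64,
    #         memory=4096, time=48, block=True)  # block=True waits here until the job leaves the queue
    #
    # Since {process} is fixed to '0' before submission, every MPI rank receives the same argument string.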