File size: 7,390 Bytes
b14983e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# -*- coding: utf-8 -*-
# :noTabs=true:

import time as time_module
import codecs
import signal

import os, sys

try:
    from .base import *

except ImportError: # workaround for B2 back-end's
    # The relative import is unavailable when this file is not loaded as part of a package.
    # Emulate `from base import *` (HPC_Driver, execute, NT, ...) by executing base.py, located
    # next to this file, directly inside this module's namespace.
    # `importlib` is used instead of `imp.load_source` because `imp` was removed in Python 3.12.
    import importlib.util

    # dirname(abspath(...)) also works when __file__ is a bare file name without any '/' in it
    _base_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'base.py')
    exec(importlib.util.spec_from_file_location(__name__, _base_path).loader.get_code(__name__), globals())
    del _base_path


class MultiCore_HPC_Driver(HPC_Driver):

    class JobID:
        ''' Handle for one submitted job: the list of PIDs of its child processes that are still tracked.
            An instance is falsy (and has length zero) once no PIDs remain.
        '''
        def __init__(self, pids=None):
            ''' Start tracking `pids`; any falsy value (None, empty list) gives an empty job.
            '''
            self.pids = pids if pids else []


        def __bool__(self):
            ''' Return True while at least one PID is still tracked.
            '''
            return bool(self.pids)


        def __len__(self):
            ''' Return number of PIDs that are still tracked.
            '''
            return len(self.pids)


        def add_pid(self, pid):
            ''' Start tracking one more child process.
            '''
            self.pids.append(pid)


        def remove_completed_pids(self):
            ''' Poll every tracked PID without blocking and stop tracking the ones that have finished.
                A PID that can not be waited for (ChildProcessError) is dropped as well.
                Print an error message for each child that reported a non-zero wait status.
            '''
            for pid in self.pids[:]:  # iterate over a copy, the list is modified inside the loop
                try:
                    finished_pid, status = os.waitpid(pid, os.WNOHANG)
                except ChildProcessError:
                    self.pids.remove(pid)
                    continue

                if finished_pid != pid: continue  # waitpid reported nothing for this PID yet: keep tracking it

                self.pids.remove(pid)
                if status != 0:  # process ended but with error
                    print('ERROR: Some of the HPC jobs terminated abnormally! Please see HPC logs for details.')


        def cancel(self):
            ''' Send SIGKILL to the process group of every tracked PID, then forget all PIDs.
                PIDs whose process no longer exists are skipped silently.
            '''
            # NOTE(review): the signal goes to the whole process group of the child. Unless the child was
            # moved into a group of its own, that is also the group of the calling process - confirm this is intended.
            for pid in self.pids:
                try:
                    os.killpg(os.getpgid(pid), signal.SIGKILL)
                except (ProcessLookupError, ChildProcessError):
                    # ProcessLookupError (ESRCH) is what getpgid/killpg raise for a process that is already gone
                    pass

            self.pids = []



    def __init__(self, *args, **kwds):
        ''' Initialise the driver by passing all arguments unchanged to HPC_Driver.__init__.
        '''
        HPC_Driver.__init__(self, *args, **kwds)
        #print(f'MultiCore_HPC_Driver: cpu_count: {self.cpu_count}')


    def remove_completed_jobs(self):
        ''' Let every tracked job drop its finished PIDs and forget the jobs that have no PIDs left.
        '''
        snapshot = list(self.jobs)  # self.jobs is modified below, so walk over a copy
        for job in snapshot:
            job.remove_completed_pids()
            if job: continue  # job still has live processes
            self.jobs.remove(job)


    @property
    def process_count(self):
        ''' Return the number of processes currently tracked by this driver instance (summed over all jobs).
        '''
        return sum(len(job) for job in self.jobs)


    def submit_hpc_job(self, name, executable, arguments, working_dir, jobs_to_queue, log_dir, memory=512, time=12, block=True, shell_wrapper=False):
        ''' Deprecated alias of submit_serial_hpc_job: print a deprecation notice, then forward all arguments unchanged and return its result.
        '''
        print('submit_hpc_job is DEPRECATED and will be removed in near future, please use submit_serial_hpc_job  instead!')

        forwarded = (name, executable, arguments, working_dir, jobs_to_queue, log_dir, memory, time, block, shell_wrapper)
        return self.submit_serial_hpc_job(*forwarded)


    def submit_serial_hpc_job(self, name, executable, arguments, working_dir, jobs_to_queue, log_dir, memory=512, time=12, block=True, shell_wrapper=False):
        ''' Run `jobs_to_queue` copies of a command as forked child processes of this Python process.

            Each child runs `cd {working_dir} && {executable} {arguments}` through `execute` and writes the
            command line followed by the command output to `{log_dir}/.hpc.{name}.{i:02d}.log`.
            A new child is forked only while `self.process_count` is below `self.cpu_count`.

            name           - label used in log file names and progress messages
            executable     - program to run
            arguments      - argument string, formatted with `process=<index of the child>` for every child
            working_dir    - directory the command is run from
            jobs_to_queue  - number of child processes to start
            log_dir        - directory that receives the per-process log files
            memory, time   - accepted but not used by this driver
            block          - if true: wait until the job is complete and add elapsed-time * jobs_to_queue to `self.cpu_usage`
            shell_wrapper  - if true: write the command into an executable bash script inside `self.working_dir` and run that script instead

            Return the JobID that tracks the children; when `block` is true an empty JobID is returned.
        '''
        import stat  # imported locally: this file has no file-level `import stat`, so do not rely on the star-import providing it

        cpu_usage = -time_module.time()/60./60.

        if shell_wrapper:
            shell_wrapper_sh = os.path.abspath(self.working_dir + f'/hpc.{name}.shell_wrapper.sh')
            with open(shell_wrapper_sh, 'w') as f:
                f.write(f'#!/bin/bash\n{executable} {arguments}\n')
                os.fchmod(f.fileno(), stat.S_IEXEC | stat.S_IREAD | stat.S_IWRITE)  # owner-only rwx
            executable, arguments = shell_wrapper_sh, ''

        def mfork():
            ''' Wait until number of child processes is below cpu_count, then fork a new process and return its pid (0 inside the child).
            '''
            while self.process_count >= self.cpu_count:
                self.remove_completed_jobs()
                if self.process_count >= self.cpu_count: time_module.sleep(.5)

            sys.stdout.flush()  # flush before forking so buffered output is not emitted by both processes
            pid = os.fork()
            # the pid is recorded at caller level (see current_job.add_pid below), not here
            return pid

        current_job = self.JobID()
        for i in range(jobs_to_queue):

            pid = mfork()
            if not pid: # we are child process
                command_line = 'cd {} && {} {}'.format(working_dir, executable, arguments.format(process=i) )
                exit_code, log = execute(f'Running job {name}.{i}...', command_line, tracer=self.tracer, return_='tuple')
                with codecs.open(log_dir + f'/.hpc.{name}.{i:02d}.log', 'w', encoding='utf-8', errors='replace') as f:
                    f.write(command_line+'\n'+log)
                    if exit_code:
                        error_report = f'\n\n{command_line}\nERROR: PROCESS {name}.{i:02d} TERMINATED WITH NON-ZERO-EXIT-CODE {exit_code}!\n'
                        f.write(error_report)
                        print(log, error_report)

                # the child always exits with status 0: a failed command is reported through the log file and stdout above
                sys.exit(0)

            else: # we are parent!
                current_job.add_pid(pid)
                # Need to potentially re-add to list, as remove_completed_jobs() might trim it.
                if current_job not in self.jobs: self.jobs.append(current_job)

        if block:
            self.wait_until_complete(current_job)
            self.remove_completed_jobs()

            cpu_usage += time_module.time()/60./60.
            self.cpu_usage += cpu_usage * jobs_to_queue  # approximation...

            current_job = self.JobID()  # job is finished: hand back an empty handle

        return current_job


    @property
    def number_of_cpu_per_node(self):
        ''' Return `self.cpu_count`.
        '''
        return self.cpu_count


    @property
    def maximum_number_of_mpi_cpu(self):
        ''' Return `self.cpu_count`.
        '''
        return self.cpu_count


    def submit_mpi_hpc_job(self, name, executable, arguments, working_dir, log_dir, memory=512, time=12, block=True, process_coefficient="1", requested_nodes=1, requested_processes_per_node=1):
        ''' Run `cd {working_dir} && mpirun -np <N> {executable} {arguments}` through `execute` and write
            the command line followed by its output to `{log_dir}/.hpc.{name}.log`.

            N is `requested_processes_per_node` capped at `self.cpu_count`; a warning is printed when the request
            is capped or when more than one node is requested. `arguments` is formatted with `process=0`.
            `memory`, `time`, `block` and `process_coefficient` are accepted but not used here.
            elapsed-time * N is added to `self.cpu_usage`. Nothing is returned.
        '''
        if requested_nodes > 1:
            print( f"WARNING: {requested_nodes!s} nodes were requested, but we're running locally, so only 1 node will be used." )

        if requested_processes_per_node > self.cpu_count:
            print( f"WARNING: {requested_processes_per_node!s} processes were requested, but I only have {self.cpu_count!s} CPUs.  Will launch {self.cpu_count!s} processes." )
        launched = min( requested_processes_per_node, self.cpu_count )

        hours_before = time_module.time()/60./60.

        mpi_arguments = arguments.format(process=0)
        command_line = 'cd {} && mpirun -np {} {} {}'.format(working_dir, launched, executable, mpi_arguments)

        log = execute(f'Running job {name}...', command_line, tracer=self.tracer, return_='output')

        log_path = log_dir + f'/.hpc.{name}.log'
        with codecs.open(log_path, 'w', encoding='utf-8', errors='replace') as f:
            f.write(command_line+'\n'+log)

        hours_spent = -hours_before + time_module.time()/60./60.
        self.cpu_usage += hours_spent * launched  # approximation...

        # return None - we do not return anything from this version of submit which imply returning None which in turn will be treated as job-id for already finished job


    def complete(self, job_id):
        ''' Report job completion status: drop finished jobs first, then return True when
            `job_id` is no longer among the tracked jobs and False otherwise.
        '''
        self.remove_completed_jobs()
        still_tracked = job_id in self.jobs
        return not still_tracked


    def cancel_job(self, job):
        ''' Cancel `job` and stop tracking it if it is among the tracked jobs.
        '''
        job.cancel()
        if job not in self.jobs:
            return
        self.jobs.remove(job)


    def __repr__(self):
        ''' Return the fixed textual tag of this driver.
        '''
        tag = 'MultiCore_HPC_Driver<>'
        return tag