14 from __future__ import absolute_import
15 from __future__ import print_function
16 import subprocess
17 import logging
18 import os
19 import time
20 import re
21 import glob
22 import inspect
23 import sys
24 import six
25 from six.moves import range
26 from six.moves import input
27
28 logger = logging.getLogger('madgraph.cluster')
29
30 try:
31 from madgraph import MadGraph5Error
32 import madgraph.various.misc as misc
33 except Exception as error:
34 if __debug__:
35 print(str(error))
36 from internal import MadGraph5Error
37 import internal.misc as misc
38
39 pjoin = os.path.join
43
46
47
48 multiple_try = misc.multiple_try
49 pjoin = os.path.join
53
54 def deco_interupt(f):
55 def deco_f_interupt(self, *args, **opt):
56 try:
57 return f(self, *args, **opt)
58 except error:
59 try:
60 self.remove(*args, **opt)
61 except Exception:
62 pass
63 raise error
64 return deco_f_interupt
65 return deco_interupt
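# Note added for clarity (not in the original source): the interrupt-handling decorator
# built above (its outer def line is missing from this listing) is applied below as
# @check_interupt() on methods such as wait() and launch_and_wait(); when the selected
# exception is raised it calls self.remove(*args, **opt) to clean up the submitted jobs
# and then re-raises the exception.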
66
79 return deco_f_store
80 return deco_store
81
83 """ This function checks whether compression of input files are necessary
84 given the running options given. """
85
86 if options['run_mode'] != 1 and options['cluster_temp_path'] is None:
87 return False
88 else:
89 return True
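# Note added for clarity (not in the original source): the helper above (its def line is
# missing from this listing) expects an options mapping providing at least the
# 'run_mode' and 'cluster_temp_path' keys; it returns False only when run_mode != 1 and
# no cluster_temp_path is set, and True otherwise, i.e. when input files may need to be
# packed/transferred.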
90
92 """Basic Class for all cluster type submission"""
93 name = 'mother class'
94 identifier_length = 14
95
97 """Init the cluster"""
98
99 self.submitted = 0
100 self.submitted_ids = []
101 self.finish = 0
102 self.submitted_dirs = []
103 self.submitted_exes = []
104 self.submitted_args = []
105
106 if 'cluster_queue' in opts:
107 self.cluster_queue = opts['cluster_queue']
108 else:
109 self.cluster_queue = 'madgraph'
110 if 'cluster_temp_path' in opts:
111 self.temp_dir = opts['cluster_temp_path']
112 else:
113 self.temp_dir = None
114 self.options = {'cluster_status_update': (600, 30)}
115 for key,value in opts.items():
116 self.options[key] = value
117 self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0
118 self.cluster_retry_wait = float(opts['cluster_retry_wait']) if 'cluster_retry_wait' in opts else 300
119 self.options = dict(opts)
120 self.retry_args = {}
121
122 self.packet = {}
123 self.id_to_packet = {}
124
125 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
126 log=None, required_output=[], nb_submit=0):
127 """How to make one submission. Return status id on the cluster."""
128 raise NotImplementedError('No implementation of how to submit a job to cluster \'%s\'' % self.name)
129
130
131 @store_input()
132 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
133 log=None, input_files=[], output_files=[], required_output=[],
134 nb_submit=0):
135 """How to make one submission. Return status id on the cluster.
136 NO SHARE DISK"""
137
138 if cwd is None:
139 cwd = os.getcwd()
140 if not os.path.exists(prog):
141 prog = os.path.join(cwd, prog)
142
143 if not required_output and output_files:
144 required_output = output_files
145
146 if not hasattr(self, 'temp_dir') or not self.temp_dir or \
147 (input_files == [] == output_files):
148
149 return self.submit(prog, argument, cwd, stdout, stderr, log,
150 required_output=required_output, nb_submit=nb_submit)
151
152 if not input_files and not output_files:
153
154 return self.submit(prog, argument, cwd, stdout, stderr, log,
155 required_output=required_output, nb_submit=nb_submit)
156
157 if cwd is None:
158 cwd = os.getcwd()
159 if not os.path.exists(prog):
160 prog = os.path.join(cwd, prog)
161 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument)
162
163 text = """#!/bin/bash
164 MYTMP=%(tmpdir)s/run$%(job_id)s
165 MYPWD=%(cwd)s
166 mkdir -p $MYTMP
167 cd $MYPWD
168 input_files=( %(input_files)s )
169 for i in ${input_files[@]}
170 do
171 cp -R -L $i $MYTMP
172 done
173 cd $MYTMP
174 echo '%(arguments)s' > arguments
175 chmod +x ./%(script)s
176 %(program)s ./%(script)s %(arguments)s
177 exit=$?
178 output_files=( %(output_files)s )
179 for i in ${output_files[@]}
180 do
181 cp -r $MYTMP/$i $MYPWD
182 done
183 # if [ "$exit" -eq "0" ]
184 # then
185 rm -rf $MYTMP
186 # fi
187 """
188
189 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog),
190 'cwd': cwd, 'job_id': self.job_id,
191 'input_files': ' '.join(input_files + [prog]),
192 'output_files': ' '.join(output_files),
193 'arguments': ' '.join([str(a) for a in argument]),
194 'program': ' ' if '.py' in prog else 'bash'}
195
196
197 new_prog = pjoin(cwd, temp_file_name)
198 open(new_prog, 'w').write(text % dico)
199 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
200
201 return self.submit(new_prog, argument, cwd, stdout, stderr, log,
202 required_output=required_output, nb_submit=nb_submit)
203
204
205 - def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
206 log=None, input_files=[], output_files=[], required_output=[],
207 nb_submit=0, packet_member=None):
208 """This function wrap the cluster submition with cluster independant
209 method should not be overwritten (but for DAG type submission)"""
210
211 id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files,
212 output_files, required_output, nb_submit)
213
214
215 if not packet_member:
216 return id
217 else:
218 if isinstance(packet_member, Packet):
219 self.id_to_packet[id] = packet_member
220 packet_member.put(id)
221 if packet_member.tag not in self.packet:
222 self.packet[packet_member.tag] = packet_member
223 else:
224 if packet_member in self.packet:
225 packet = self.packet[packet_member]
226 packet.put(id)
227 self.id_to_packet[id] = packet
228 return id
229
231 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)"""
232 if not self.submitted_ids:
233 raise NotImplementedError('No implementation of how to control the job status for cluster \'%s\'' % self.name)
234 idle, run, fail = 0, 0, 0
235 for pid in self.submitted_ids[:]:
236 status = self.control_one_job(pid)
237 if status == 'I':
238 idle += 1
239 elif status == 'R':
240 run += 1
241 elif status == 'F':
242 self.finish +=1
243 self.submitted_ids.remove(pid)
244 else:
245 fail += 1
246
247 return idle, run, self.finish, fail
248
250 """ control the status of a single job with it's cluster id """
251 raise NotImplemented('No implementation of how to control the job status to cluster \'%s\'' % self.name)
252
254 """get a unique run_name for all the jobs helps to identify the runs
255 in the controller for some cluster."""
256
257 if second_path:
258 path = os.path.realpath(pjoin(path, second_path))
259 elif not os.path.exists(path):
260 return path
261
262 if 'SubProcesses' in path:
263 target = path.rsplit('/SubProcesses',1)[0]
264 elif 'MCatNLO' in path:
265 target = path.rsplit('/MCatNLO',1)[0]
266 elif 'PY8_parallelization' in path:
267 target = path.rsplit('/PY8_parallelization',1)[0]
268 elif second_path:
269 target=path
270 logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problem.")
271 else:
272 target = path
273
274 if target.endswith('/'):
275 target = target[:-1]
276
277 target = misc.digest(target)[-self.identifier_length:]
278 if not target[0].isalpha():
279 target = 'a' + target[1:]
280
281 return target
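# Added comment (not in the original source): the identifier returned above is the last
# 'identifier_length' characters of a digest of the run directory, forced to start with
# a letter; it is later passed as the job name to the schedulers (the qsub/bsub/sbatch
# -N/-J options used below).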
282
283
284 @check_interupt()
285 - def wait(self, me_dir, fct, minimal_job=0, update_first=None):
286 """Wait that all job are finish.
287 if minimal_job set, then return if idle + run is lower than that number"""
288
289
290 mode = 1
291 nb_iter = 0
292 nb_short = 0
293 change_at = 5
294
295 if update_first:
296 idle, run, finish, fail = self.control(me_dir)
297 update_first(idle, run, finish)
298
299
300 longtime, shorttime = self.options['cluster_status_update']
301
302 nb_job = 0
303
304 if self.options['cluster_type'] == 'htcaas2':
305 me_dir = self.metasubmit(self)
306
307 while 1:
308 old_mode = mode
309 nb_iter += 1
310 idle, run, finish, fail = self.control(me_dir)
311 if nb_job:
312 if idle + run + finish + fail != nb_job:
313 nb_job = idle + run + finish + fail
314 nb_iter = 1
315 else:
316 nb_job = idle + run + finish + fail
317 if fail:
318 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team')
319 if idle + run == 0:
320
321 logger.info('All jobs finished')
322 fct(idle, run, finish)
323 break
324 if idle + run < minimal_job:
325 return
326 fct(idle, run, finish)
327
328 if nb_iter < change_at:
329 mode = 1
330 elif idle < run:
331 if old_mode == 0:
332 if nb_short:
333 mode = 0
334
335 elif idle:
336 if nb_iter > change_at + int(longtime)//shorttime:
337 mode = 0
338 else:
339 mode = 1
340 nb_short =0
341 else:
342 mode = 1
343 nb_short = 0
344 elif old_mode == 1:
345 nb_short +=1
346 if nb_short > 3* max(change_at, int(longtime)//shorttime):
347 mode = 0
348 else:
349 mode = 0
350
351
352 if old_mode > mode:
353 logger.info('''Now waiting %ss between status checks.
354 Note that you can change this time in the configuration file.
355 Press ctrl-C to force an update.''' % self.options['cluster_status_update'][0])
356
357
358 if mode == 0:
359 try:
360 time.sleep(self.options['cluster_status_update'][0])
361 except KeyboardInterrupt:
362 logger.info('start to update the status')
363 nb_iter = min(0, change_at -2)
364 nb_short = 0
365 else:
366 time.sleep(self.options['cluster_status_update'][1])
367
368
369 self.submitted = 0
370 self.submitted_ids = []
371
373 """Check the termination of the jobs with job_id and relaunch it if needed."""
374
375
376 if job_id not in self.retry_args:
377 if job_id in self.id_to_packet:
378 nb_in_packet = self.id_to_packet[job_id].remove_one()
379 if nb_in_packet == 0:
380
381 packet = self.id_to_packet[job_id]
382
383 packet.queue.join()
384
385 packet.fct(*packet.args)
386 del self.id_to_packet[job_id]
387 return 'resubmit'
388 else:
389 return True
390
391 args = self.retry_args[job_id]
392 if 'time_check' in args:
393 time_check = args['time_check']
394 else:
395 time_check = 0
396
397 for path in args['required_output']:
398 if args['cwd']:
399 path = pjoin(args['cwd'], path)
400
401 if not (os.path.exists(path) and os.stat(path).st_size != 0) :
402 break
403 else:
404
405 if time_check > 0:
406 logger.info('Job %s: finally found the missing output.' % (job_id))
407 del self.retry_args[job_id]
408 self.submitted_ids.remove(job_id)
409
410 if job_id in self.id_to_packet:
411 nb_in_packet = self.id_to_packet[job_id].remove_one()
412 if nb_in_packet == 0:
413
414 packet = self.id_to_packet[job_id]
415
416 packet.queue.join()
417
418 packet.fct(*packet.args)
419 del self.id_to_packet[job_id]
420 return 'resubmit'
421
422 return 'done'
423
424 if time_check == 0:
425 logger.debug('''Job %s: missing output:%s''' % (job_id,path))
426 args['time_check'] = time.time()
427 return 'wait'
428 elif self.cluster_retry_wait > time.time() - time_check:
429 return 'wait'
430
431
432 if self.nb_retry < 0:
433 logger.critical('''Failed to run job %s correctly.
434 with option: %s
435 file missing: %s''' % (job_id, args, path))
436 input('press enter to continue.')
437 elif self.nb_retry == 0:
438 logger.critical('''Failed to run job %s correctly.
439 with option: %s
440 file missing: %s.
441 Stopping all runs.''' % (job_id, args, path))
442 self.remove()
443 elif args['nb_submit'] >= self.nb_retry:
444 logger.critical('''Failed to run job %s correctly.
445 with option: %s
446 file missing: %s
447 Failed %s times.
448 No resubmission. ''' % (job_id, args, path, args['nb_submit']))
449 self.remove()
450 else:
451 args['nb_submit'] += 1
452 logger.warning('resubmitting job (attempt %s)' % args['nb_submit'])
453 del self.retry_args[job_id]
454 self.submitted_ids.remove(job_id)
455 if 'time_check' in args:
456 del args['time_check']
457 if job_id in self.id_to_packet:
458 self.id_to_packet[job_id].remove_one()
459 args['packet_member'] = self.id_to_packet[job_id]
460 del self.id_to_packet[job_id]
461 self.cluster_submit(**args)
462 else:
463 self.submit2(**args)
464 return 'resubmit'
465 return 'done'
466
467 @check_interupt()
468 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
469 stderr=None, log=None, required_output=[], nb_submit=0,
470 input_files=[], output_files=[]):
471 """launch one job on the cluster and wait for it"""
472
473 special_output = False
474 if stderr == -2 and stdout:
475
476 special_output = True
477 stderr = stdout + '.err'
478
479 id = self.submit2(prog, argument, cwd, stdout, stderr, log,
480 required_output=required_output, input_files=input_files,
481 output_files=output_files)
482
483 if self.options['cluster_type']=='htcaas2':
484 if self.submitted == self.submitted_ids[-1]:
485 id = self.metasubmit(self)
486
487 frame = inspect.currentframe()
488 args, _, _, values = inspect.getargvalues(frame)
489 args = dict([(i, values[i]) for i in args if i != 'self'])
490 self.retry_args[id] = args
491
492 nb_wait=0
493 while 1:
494 nb_wait+=1
495 status = self.control_one_job(id)
496 if not status in ['R','I']:
497 status = self.check_termination(id)
498 if status in ['wait']:
499 time.sleep(30)
500 continue
501 elif status in ['resubmit']:
502 id = self.submitted_ids[0]
503 time.sleep(30)
504 continue
505
506 time.sleep(30)
507 break
508 time.sleep(self.options['cluster_status_update'][1])
509
510 if required_output:
511 status = self.check_termination(id)
512 if status == 'wait':
513 run += 1
514 elif status == 'resubmit':
515 idle += 1
516
517
518 if special_output:
519
520
521 for i in range(5):
522 if os.path.exists(stdout):
523 if not os.path.exists(stderr):
524 time.sleep(5)
525 if os.path.exists(stderr):
526 err_text = open(stderr).read()
527 if not err_text:
528 return
529 logger.warning(err_text)
530 text = open(stdout).read()
531 open(stdout,'w').write(text + err_text)
532 else:
533 return
534 time.sleep(10)
535
536 - def remove(self, *args, **opts):
537 """ """
538 logger.warning("""This cluster didn't support job removal,
539 the jobs are still running on the cluster.""")
540
541 @store_input()
545
547 """routine which allow to modify the run_card/mg5cmd object to change the
548 default behavior of the runs.
549 This is called at the time of the compilation of the run_card.
550 Note that this function can be called multiple times by run.
551 """
552
553 return
554
556 """ an object for handling packet of job, it is designed to be thread safe
557 """
558
559 - def __init__(self, name, fct, args, opts={}):
560 import six.moves.queue
561 import threading
562 self.queue = six.moves.queue.Queue()
563 self.tag = name
564 self.fct = fct
565 self.args = args
566 self.opts = opts
567 self.done = threading.Event()
568
569 - def put(self, *args, **opts):
571
572 append = put
573
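# Illustrative sketch (not part of the original module; all names below are hypothetical):
# a Packet groups several jobs so that a callback runs once the whole group has finished.
#
#   packet = Packet('combine_run', combine_results, (run_dir,))
#   for job_script in job_scripts:
#       cluster.cluster_submit(job_script, cwd=run_dir, packet_member=packet)
#
# When check_termination() sees the last member of the packet finish, it calls
# packet.fct(*packet.args), i.e. combine_results(run_dir) in this sketch.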
578
580 """class for dealing with the submission in multiple node"""
581
582 job_id = "$"
583
585 """Init the cluster """
586
587
588 super(MultiCore, self).__init__(*args, **opt)
589
590 import six.moves.queue
591 import threading
592 import six.moves._thread
593 self.queue = six.moves.queue.Queue()
594 self.done = six.moves.queue.Queue()
595 self.submitted = six.moves.queue.Queue()
596 self.stoprequest = threading.Event()
597 self.demons = []
598 self.nb_done =0
599 if 'nb_core' in opt:
600 self.nb_core = opt['nb_core']
601 elif isinstance(args[0],int):
602 self.nb_core = args[0]
603 else:
604 self.nb_core = 1
605 self.update_fct = None
606
607 self.lock = threading.Event()
608 self.pids = six.moves.queue.Queue()
609 self.done_pid = []
610 self.done_pid_queue = six.moves.queue.Queue()
611 self.fail_msg = None
612
613
614 for _ in range(self.nb_core):
615 self.start_demon()
616
617
619 import threading
620 t = threading.Thread(target=self.worker)
621 t.daemon = True
622 t.start()
623 self.demons.append(t)
624
625
627 import six.moves.queue
628 import six.moves._thread
629 while not self.stoprequest.isSet():
630 try:
631 args = self.queue.get()
632 tag, exe, arg, opt = args
633 try:
634
635 if isinstance(exe,str):
636 if os.path.exists(exe) and not exe.startswith('/'):
637 exe = './' + exe
638 if isinstance(opt['stdout'],str):
639 opt['stdout'] = open(opt['stdout'],'w')
640 if opt['stderr'] == None:
641 opt['stderr'] = subprocess.STDOUT
642 if arg:
643 proc = misc.Popen([exe] + arg, **opt)
644 else:
645 proc = misc.Popen(exe, **opt)
646 pid = proc.pid
647 self.pids.put(pid)
648 proc.wait()
649 if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet():
650 fail_msg = 'program %s ended with non-zero status: %s. Stopping all computation' % \
651 (' '.join([exe]+arg), proc.returncode)
652 logger.warning(fail_msg)
653 self.stoprequest.set()
654 self.remove(fail_msg)
655
656
657
658
659 else:
660 pid = tag
661 self.pids.put(pid)
662
663
664 returncode = exe(*arg, **opt)
665 if returncode != 0:
666 logger.warning("fct %s does not return 0. Stopping the code in a clean way. The error was:\n%s", exe, returncode)
667 self.stoprequest.set()
668 self.remove("fct %s does not return 0:\n %s" % (exe, returncode))
669 except Exception as error:
670 self.fail_msg = sys.exc_info()
671 logger.warning(str(error))
672 self.stoprequest.set()
673 self.remove(error)
674
675 if __debug__:
676 six.reraise(self.fail_msg[0], self.fail_msg[1], self.fail_msg[2])
677
678 self.queue.task_done()
679 self.done.put(tag)
680 self.done_pid_queue.put(pid)
681
682 try:
683 self.lock.set()
684 except six.moves._thread.error:
685 continue
686 except six.moves.queue.Empty:
687 continue
688
689
690
691
692 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
693 log=None, required_output=[], nb_submit=0):
694 """submit a job on multicore machine"""
695
696 tag = (prog, tuple(argument), cwd, nb_submit)
697 if isinstance(prog, str):
698
699 opt = {'cwd': cwd,
700 'stdout':stdout,
701 'stderr': stderr}
702
703 self.queue.put((tag, prog, argument, opt))
704 self.submitted.put(1)
705 return tag
706 else:
707
708 self.queue.put((tag, prog, argument, {}))
709 self.submitted.put(1)
710 return tag
711
712 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
713 stderr=None, log=None, **opts):
714 """launch one job and wait for it"""
715 if isinstance(stdout, str):
716 stdout = open(stdout, 'w')
717 if isinstance(stderr, str):
718 stderr = open(stderr, 'w')
719 return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)
720
721 - def remove(self, error=None):
722 """Ensure that all thread are killed"""
723
724
725 self.stoprequest.set()
726 if error and not self.fail_msg:
727 self.fail_msg = error
728
729
730 while not self.done_pid_queue.empty():
731 pid = self.done_pid_queue.get()
732 self.done_pid.append(pid)
733
734
735 while not self.pids.empty():
736 pid = self.pids.get()
737 self.pids.task_done()
738 if isinstance(pid, tuple):
739 continue
740 if pid in self.done_pid:
741 continue
742 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \
743 % {'pid':pid} )
744 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} )
745
746
747 - def wait(self, me_dir, update_status, update_first=None):
748 """Waiting that all the jobs are done. This function also control that
749 the submission by packet are handle correctly (i.e. submit the function)"""
750
751 import six.moves.queue
752 import threading
753
754 try:
755 last_status = (0, 0, 0)
756 sleep_time = 1
757 use_lock = True
758 first = True
759 while True:
760 force_one_more_loop = False
761
762
763
764 while self.done.qsize():
765 try:
766 tag = self.done.get(True, 1)
767 except six.moves.queue.Empty:
768 pass
769 else:
770 if self.id_to_packet and tuple(tag) in self.id_to_packet:
771 packet = self.id_to_packet[tuple(tag)]
772 remaining = packet.remove_one()
773 if remaining == 0:
774
775 packet.queue.join()
776 self.submit(packet.fct, packet.args)
777 force_one_more_loop = True
778 self.nb_done += 1
779 self.done.task_done()
780
781
782
783 Idle = self.queue.qsize()
784 Done = self.nb_done + self.done.qsize()
785 Running = max(0, self.submitted.qsize() - Idle - Done)
786
787 if Idle + Running <= 0 and not force_one_more_loop:
788 update_status(Idle, Running, Done)
789
790
791 self.queue.join()
792 break
793
794 if (Idle, Running, Done) != last_status:
795 if first and update_first:
796 update_first(Idle, Running, Done)
797 first = False
798 else:
799 update_status(Idle, Running, Done)
800 last_status = (Idle, Running, Done)
801
802
803 while not self.done_pid_queue.empty():
804 pid = self.done_pid_queue.get()
805 self.done_pid.append(pid)
806 self.done_pid_queue.task_done()
807
808
809
810 if use_lock:
811
812 use_lock = self.lock.wait(300)
813 self.lock.clear()
814 if not use_lock and Idle > 0:
815 use_lock = True
816 else:
817
818
819 time.sleep(sleep_time)
820 sleep_time = min(sleep_time + 2, 180)
821 if update_first:
822 update_first(Idle, Running, Done)
823
824 if self.stoprequest.isSet():
825 if isinstance(self.fail_msg, Exception):
826 raise self.fail_msg
827 elif isinstance(self.fail_msg, str):
828 raise Exception(self.fail_msg)
829 else:
830 misc.sprint(self.fail_msg)
831 six.reraise(self.fail_msg[0], self.fail_msg[1], self.fail_msg[2])
832
833 try:
834 self.lock.clear()
835 except Exception:
836 pass
837 self.done = six.moves.queue.Queue()
838 self.done_pid = []
839 self.done_pid_queue = six.moves.queue.Queue()
840 self.nb_done = 0
841 self.submitted = six.moves.queue.Queue()
842 self.pids = six.moves.queue.Queue()
843 self.stoprequest.clear()
844
845 except KeyboardInterrupt:
846
847 if isinstance(self.fail_msg, Exception):
848 raise self.fail_msg
849 elif isinstance(self.fail_msg, str):
850 raise Exception(self.fail_msg)
851 elif self.fail_msg:
852 six.reraise(self.fail_msg[0], self.fail_msg[1], self.fail_msg[2])
853
854 raise
855
857 """Basic class for dealing with cluster submission"""
858
859 name = 'condor'
860 job_id = 'CONDOR_ID'
861
862
863
864 @multiple_try()
865 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
866 required_output=[], nb_submit=0):
867 """Submit a job prog to a Condor cluster"""
868
869 text = """Executable = %(prog)s
870 output = %(stdout)s
871 error = %(stderr)s
872 log = %(log)s
873 %(argument)s
874 environment = CONDOR_ID=$(Cluster).$(Process)
875 Universe = vanilla
876 notification = Error
877 Initialdir = %(cwd)s
878 %(requirement)s
879 getenv=True
880 queue 1
881 """
882
883 if self.cluster_queue not in ['None', None]:
884 requirement = 'Requirements = %s=?=True' % self.cluster_queue
885 else:
886 requirement = ''
887
888 if cwd is None:
889 cwd = os.getcwd()
890 if stdout is None:
891 stdout = '/dev/null'
892 if stderr is None:
893 stderr = '/dev/null'
894 if log is None:
895 log = '/dev/null'
896 if not os.path.exists(prog):
897 prog = os.path.join(cwd, prog)
898 if argument:
899 argument = 'Arguments = %s' % ' '.join(argument)
900 else:
901 argument = ''
902
903
904 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
905 'stderr': stderr,'log': log,'argument': argument,
906 'requirement': requirement}
907
908
909 a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE,
910 stdin=subprocess.PIPE)
911 output, _ = a.communicate((text % dico).encode())
912
913
914
915
916 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE)
917 output = output.decode()
918 try:
919 id = pat.search(output).groups()[0]
920 except:
921 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
922 % output)
923 self.submitted += 1
924 self.submitted_ids.append(id)
925 return id
926
927 @store_input()
928 @multiple_try()
929 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
930 log=None, input_files=[], output_files=[], required_output=[],
931 nb_submit=0):
932 """Submit the job on the cluster NO SHARE DISK
933 input/output file should be give relative to cwd
934 """
935
936 if not required_output and output_files:
937 required_output = output_files
938
939 if (input_files == [] == output_files):
940 return self.submit(prog, argument, cwd, stdout, stderr, log,
941 required_output=required_output, nb_submit=nb_submit)
942
943 text = """Executable = %(prog)s
944 output = %(stdout)s
945 error = %(stderr)s
946 log = %(log)s
947 %(argument)s
948 should_transfer_files = YES
949 when_to_transfer_output = ON_EXIT
950 transfer_input_files = %(input_files)s
951 %(output_files)s
952 Universe = vanilla
953 notification = Error
954 Initialdir = %(cwd)s
955 %(requirement)s
956 getenv=True
957 queue 1
958 """
959
960 if self.cluster_queue not in ['None', None]:
961 requirement = 'Requirements = %s=?=True' % self.cluster_queue
962 else:
963 requirement = ''
964
965 if cwd is None:
966 cwd = os.getcwd()
967 if stdout is None:
968 stdout = '/dev/null'
969 if stderr is None:
970 stderr = '/dev/null'
971 if log is None:
972 log = '/dev/null'
973 if not os.path.exists(prog):
974 prog = os.path.join(cwd, prog)
975 if argument:
976 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument])
977 else:
978 argument = ''
979
980 if input_files:
981 input_files = ','.join(input_files)
982 else:
983 input_files = ''
984 if output_files:
985 output_files = 'transfer_output_files = %s' % ','.join(output_files)
986 else:
987 output_files = ''
988
989
990
991 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
992 'stderr': stderr,'log': log,'argument': argument,
993 'requirement': requirement, 'input_files':input_files,
994 'output_files':output_files}
995
996
997 a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE,
998 stdin=subprocess.PIPE)
999 output, _ = a.communicate((text % dico).encode())
1000
1001
1002
1003
1004 output = output.decode()
1005 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE)
1006 try:
1007 id = pat.search(output).groups()[0]
1008 except:
1009 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
1010 % output)
1011 self.submitted += 1
1012 self.submitted_ids.append(id)
1013 return id
1014
1015
1016
1017
1018
1019 @multiple_try(nb_try=10, sleep=10)
1021 """ control the status of a single job with it's cluster id """
1022 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"
1023 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1024 stderr=subprocess.PIPE)
1025
1026 error = status.stderr.read().decode()
1027 if status.returncode or error:
1028 raise ClusterManagmentError('condor_q returns error: %s' % error)
1029
1030 return status.stdout.readline().decode().strip()
1031
1032 jobstatus = {'0':'U', '1':'I','2':'R','3':'X','4':'C','5':'H','6':'E'}
1033 @check_interupt()
1034 @multiple_try(nb_try=10, sleep=10)
1036 """ control the status of a single job with it's cluster id """
1037
1038 if not self.submitted_ids:
1039 return 0, 0, 0, 0
1040
1041 packet = 15000
1042 idle, run, fail = 0, 0, 0
1043 ongoing = []
1044 for i in range(1+(len(self.submitted_ids)-1)//packet):
1045 start = i * packet
1046 stop = (i+1) * packet
1047 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \
1048 " -format \"%d \" ClusterId " + \
1049 " -format \"%d\\n\" JobStatus "
1050
1051 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1052 stderr=subprocess.PIPE)
1053 error = status.stderr.read().decode()
1054 if status.returncode or error:
1055 raise ClusterManagmentError('condor_q returns error: %s' % error)
1056
1057 for line in status.stdout:
1058 id, status = line.decode().strip().split()
1059 status = self.jobstatus[status]
1060 ongoing.append(id)
1061 if status in ['I','U']:
1062 idle += 1
1063 elif status == 'R':
1064 run += 1
1065 elif status != 'C':
1066 fail += 1
1067
1068 for id in list(self.submitted_ids):
1069 if id not in ongoing:
1070 status = self.check_termination(id)
1071 if status == 'wait':
1072 run += 1
1073 elif status == 'resubmit':
1074 idle += 1
1075
1076 return idle, run, self.submitted - (idle+run+fail), fail
1077
1078 @multiple_try()
1079 - def remove(self, *args, **opts):
1080 """Clean the jobson the cluster"""
1081
1082 if not self.submitted_ids:
1083 return
1084 cmd = "condor_rm %s" % ' '.join(self.submitted_ids)
1085
1086 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1087 self.submitted_ids = []
1088
1090 """Basic class for dealing with cluster submission"""
1091
1092 name = 'pbs'
1093 job_id = 'PBS_JOBID'
1094 idle_tag = ['Q']
1095 running_tag = ['T','E','R']
1096 complete_tag = ['C']
1097
1098 maximum_submited_jobs = 2500
1099
1100 @multiple_try()
1101 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1102 required_output=[], nb_submit=0):
1103 """Submit a job prog to a PBS cluster"""
1104
1105 me_dir = self.get_jobs_identifier(cwd, prog)
1106
1107 if len(self.submitted_ids) > self.maximum_submited_jobs:
1108 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish))
1109 self.wait(me_dir, fct, self.maximum_submited_jobs)
1110
1111
1112 text = ""
1113 if cwd is None:
1114 cwd = os.getcwd()
1115 else:
1116 text = " cd %s;" % cwd
1117 if stdout is None:
1118 stdout = '/dev/null'
1119 if stderr is None:
1120 stderr = '/dev/null'
1121 elif stderr == -2:
1122 stderr = stdout
1123 if log is None:
1124 log = '/dev/null'
1125
1126 if not os.path.isabs(prog):
1127 text += "./%s" % prog
1128 else:
1129 text+= prog
1130
1131 if argument:
1132 text += ' ' + ' '.join(argument)
1133
1134 command = ['qsub','-o', stdout,
1135 '-N', me_dir,
1136 '-e', stderr,
1137 '-V']
1138
1139 if self.cluster_queue and self.cluster_queue != 'None':
1140 command.extend(['-q', self.cluster_queue])
1141
1142 a = misc.Popen(command, stdout=subprocess.PIPE,
1143 stderr=subprocess.STDOUT,
1144 stdin=subprocess.PIPE, cwd=cwd)
1145
1146 output = a.communicate(text.encode())[0].decode()
1147 id = output.split('.')[0]
1148 if not id.isdigit() or a.returncode !=0:
1149 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
1150 % output)
1151
1152 self.submitted += 1
1153 self.submitted_ids.append(id)
1154 return id
1155
1156 @multiple_try()
1158 """ control the status of a single job with it's cluster id """
1159 cmd = 'qstat '+str(id)
1160 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1161 stderr=subprocess.STDOUT)
1162
1163 for line in status.stdout:
1164 line = line.decode().strip()
1165 if 'cannot connect to server' in line or 'cannot read reply' in line:
1166 raise ClusterManagmentError('server disconnected')
1167 if 'Unknown' in line:
1168 return 'F'
1169 elif line.startswith(str(id)):
1170 jobstatus = line.split()[4]
1171 else:
1172 jobstatus=""
1173
1174 if status.returncode != 0 and status.returncode is not None:
1175 raise ClusterManagmentError('server failed in some way (error code %s)' % status.returncode)
1176 if jobstatus in self.idle_tag:
1177 return 'I'
1178 elif jobstatus in self.running_tag:
1179 return 'R'
1180 return 'F'
1181
1182
1183 @multiple_try()
1185 """ control the status of a single job with it's cluster id """
1186 cmd = "qstat"
1187 status = misc.Popen([cmd], stdout=subprocess.PIPE)
1188
1189 me_dir = self.get_jobs_identifier(me_dir)
1190
1191 ongoing = []
1192
1193 idle, run, fail = 0, 0, 0
1194 for line in status.stdout:
1195 line = line.decode()
1196 if 'cannot connect to server' in line or 'cannot read reply' in line:
1197 raise ClusterManagmentError('server disconnected')
1198 if me_dir in line:
1199 ongoing.append(line.split()[0].split('.')[0])
1200 status2 = line.split()[4]
1201 if status2 in self.idle_tag:
1202 idle += 1
1203 elif status2 in self.running_tag:
1204 run += 1
1205 elif status2 in self.complete_tag:
1206 if not self.check_termination(line.split()[0].split('.')[0]):
1207 idle += 1
1208 else:
1209 fail += 1
1210
1211 if status.returncode != 0 and status.returncode is not None:
1212 raise ClusterManagmentError('server failed in some way (error code %s)' % status.returncode)
1213
1214 for id in list(self.submitted_ids):
1215 if id not in ongoing:
1216 status2 = self.check_termination(id)
1217 if status2 == 'wait':
1218 run += 1
1219 elif status2 == 'resubmit':
1220 idle += 1
1221
1222 return idle, run, self.submitted - (idle+run+fail), fail
1223
1224 @multiple_try()
1225 - def remove(self, *args, **opts):
1226 """Clean the jobs on the cluster"""
1227
1228 if not self.submitted_ids:
1229 return
1230 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1231 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1232 self.submitted_ids = []
1233
1236 """Basic class for dealing with cluster submission"""
1237
1238
1239 name = 'sge'
1240 job_id = 'JOB_ID'
1241 idle_tag = ['qw', 'hqw','hRqw','w']
1242 running_tag = ['r','t','Rr','Rt']
1243 identifier_length = 10
1244
1246 """replace string for path issues"""
1247 location = os.path.realpath(location)
1248 homePath = os.getenv("HOME")
1249 if homePath:
1250 location = location.replace(homePath,'$HOME')
1251 return location
1252
1253 @multiple_try()
1254 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1255 required_output=[], nb_submit=0):
1256 """Submit a job prog to an SGE cluster"""
1257
1258 me_dir = self.get_jobs_identifier(cwd, prog)
1259
1260
1261 if cwd is None:
1262
1263 cwd = self.def_get_path(os.getcwd())
1264 cwd1 = self.def_get_path(cwd)
1265 text = " cd %s;" % cwd1
1266 if stdout is None:
1267 stdout = '/dev/null'
1268 else:
1269 stdout = self.def_get_path(stdout)
1270 if stderr is None:
1271 stderr = '/dev/null'
1272 elif stderr == -2:
1273 stderr = stdout
1274 else:
1275 stderr = self.def_get_path(stderr)
1276
1277 if log is None:
1278 log = '/dev/null'
1279 else:
1280 log = self.def_get_path(log)
1281
1282 text += prog
1283 if argument:
1284 text += ' ' + ' '.join(argument)
1285
1286
1287
1288
1289 homePath = os.getenv("HOME")
1290 if homePath:
1291 text = text.replace(homePath,'$HOME')
1292
1293 logger.debug("!=== input %s" % text)
1294 logger.debug("!=== output %s" % stdout)
1295 logger.debug("!=== error %s" % stderr)
1296 logger.debug("!=== logs %s" % log)
1297
1298 command = ['qsub','-o', stdout,
1299 '-N', me_dir,
1300 '-e', stderr,
1301 '-V']
1302
1303 if self.cluster_queue and self.cluster_queue != 'None':
1304 command.extend(['-q', self.cluster_queue])
1305
1306 a = misc.Popen(command, stdout=subprocess.PIPE,
1307 stderr=subprocess.STDOUT,
1308 stdin=subprocess.PIPE, cwd=cwd)
1309
1310 output = a.communicate(text.encode())[0].decode()
1311 id = output.split(' ')[2]
1312 if not id.isdigit():
1313 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
1314 % output)
1315 self.submitted += 1
1316 self.submitted_ids.append(id)
1317 logger.debug(output)
1318
1319 return id
1320
1321 @multiple_try()
1323 """ control the status of a single job with it's cluster id """
1324
1325 cmd = 'qstat '
1326 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1327 for line in status.stdout:
1328 line = line.decode()
1329
1330
1331
1332
1333
1334
1335 if str(id) in line:
1336 status = line.split()[4]
1337
1338 if status in self.idle_tag:
1339 return 'I'
1340 elif status in self.running_tag:
1341 return 'R'
1342 return 'F'
1343
1344 @multiple_try()
1346 """ control the status of a single job with it's cluster id """
1347 cmd = "qstat "
1348 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1349
1350 me_dir = self.get_jobs_identifier(me_dir)
1351
1352 finished = list(self.submitted_ids)
1353
1354 idle, run, fail = 0, 0, 0
1355 for line in status.stdout:
1356 line = line.decode()
1357 if me_dir in line:
1358 id,_,_,_,status = line.split()[:5]
1359 if status in self.idle_tag:
1360 idle += 1
1361 finished.remove(id)
1362 elif status in self.running_tag:
1363 run += 1
1364 finished.remove(id)
1365 else:
1366 logger.debug(line)
1367 fail += 1
1368 finished.remove(id)
1369
1370 for id in finished:
1371 self.check_termination(id)
1372
1373 return idle, run, self.submitted - (idle+run+fail), fail
1374
1375
1376
1377 @multiple_try()
1378 - def remove(self, *args, **opts):
1379 """Clean the jobs on the cluster"""
1380
1381 if not self.submitted_ids:
1382 return
1383 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1384 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1385 self.submitted_ids = []
1386
1389 """Basic class for dealing with cluster submission"""
1390
1391 name = 'lsf'
1392 job_id = 'LSB_JOBID'
1393
1394 @multiple_try()
1395 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1396 required_output=[], nb_submit=0):
1397 """Submit the job prog to an LSF cluster"""
1398
1399
1400 me_dir = self.get_jobs_identifier(cwd, prog)
1401
1402 text = ""
1403 command = ['bsub', '-C0', '-J', me_dir]
1404 if cwd is None:
1405 cwd = os.getcwd()
1406 else:
1407 text = " cd %s;" % cwd
1408 if stdout and isinstance(stdout, str):
1409 command.extend(['-o', stdout])
1410 if stderr and isinstance(stdout, str):
1411 command.extend(['-e', stderr])
1412 elif stderr == -2:
1413 pass
1414 if log is None:
1415 log = '/dev/null'
1416
1417 text += prog
1418 if argument:
1419 text += ' ' + ' '.join(argument)
1420
1421 if self.cluster_queue and self.cluster_queue != 'None':
1422 command.extend(['-q', self.cluster_queue])
1423
1424 a = misc.Popen(command, stdout=subprocess.PIPE,
1425 stderr=subprocess.STDOUT,
1426 stdin=subprocess.PIPE, cwd=cwd)
1427
1428 output = a.communicate(text.encode())[0].decode()
1429
1430 try:
1431 id = output.split('>',1)[0].split('<')[1]
1432 except:
1433 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
1434 % output)
1435 if not id.isdigit():
1436 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
1437 % output)
1438 self.submitted += 1
1439 self.submitted_ids.append(id)
1440 return id
1441
1442
1443 @multiple_try()
1445 """ control the status of a single job with it's cluster id """
1446
1447 cmd = 'bjobs '+str(id)
1448 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1449
1450 for line in status.stdout:
1451 line = line.decode().strip().upper()
1452 if 'JOBID' in line:
1453 continue
1454 elif str(id) not in line:
1455 continue
1456 status = line.split()[2]
1457 if status == 'RUN':
1458 return 'R'
1459 elif status == 'PEND':
1460 return 'I'
1461 elif status == 'DONE':
1462 return 'F'
1463 else:
1464 return 'H'
1465 return 'F'
1466
1467 @multiple_try()
1469 """ control the status of a single job with it's cluster id """
1470
1471 if not self.submitted_ids:
1472 return 0, 0, 0, 0
1473
1474 cmd = "bjobs " + ' '.join(self.submitted_ids)
1475 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1476
1477 jobstatus = {}
1478 for line in status.stdout:
1479 line = line.decode().strip()
1480 if 'JOBID' in line:
1481 continue
1482 splitline = line.split()
1483 id = splitline[0]
1484 if id not in self.submitted_ids:
1485 continue
1486 jobstatus[id] = splitline[2]
1487
1488 idle, run, fail = 0, 0, 0
1489 for id in self.submitted_ids[:]:
1490 if id in jobstatus:
1491 status = jobstatus[id]
1492 else:
1493 status = 'MISSING'
1494 if status == 'RUN':
1495 run += 1
1496 elif status == 'PEND':
1497 idle += 1
1498 else:
1499 status = self.check_termination(id)
1500 if status == 'wait':
1501 run += 1
1502 elif status == 'resubmit':
1503 idle += 1
1504
1505 return idle, run, self.submitted - (idle+run+fail), fail
1506
1507 @multiple_try()
1508 - def remove(self, *args,**opts):
1509 """Clean the jobs on the cluster"""
1510
1511 if not self.submitted_ids:
1512 return
1513 cmd = "bkill %s" % ' '.join(self.submitted_ids)
1514 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1515 self.submitted_ids = []
1516
1518 """Class for dealing with cluster submission on a GE cluster"""
1519
1520 name = 'ge'
1521 job_id = 'JOB_ID'
1522 idle_tag = ['qw']
1523 running_tag = ['r']
1524
1525 @multiple_try()
1526 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1527 required_output=[], nb_submit=0):
1528 """Submit a job prog to a GE cluster"""
1529
1530 text = ""
1531 if cwd is None:
1532 cwd = os.getcwd()
1533 else:
1534 text = " cd %s; bash " % cwd
1535 if stdout is None:
1536 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1])
1537 if stderr is None:
1538 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1])
1539 elif stderr == -2:
1540 stderr = stdout
1541 if log is None:
1542 log = '/dev/null'
1543
1544 text += prog
1545 if argument:
1546 text += ' ' + ' '.join(argument)
1547 text += '\n'
1548 tmp_submit = os.path.join(cwd, 'tmp_submit')
1549 open(tmp_submit,'w').write(text)
1550
1551 a = misc.Popen(['qsub','-o', stdout,
1552 '-e', stderr,
1553 tmp_submit],
1554 stdout=subprocess.PIPE,
1555 stderr=subprocess.STDOUT,
1556 stdin=subprocess.PIPE, cwd=cwd)
1557
1558 output = a.communicate()[0].decode()
1559
1560 pat = re.compile("Your job (\d*) \(",re.MULTILINE)
1561 try:
1562 id = pat.search(output).groups()[0]
1563 except:
1564 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
1565 % output)
1566 self.submitted += 1
1567 self.submitted_ids.append(id)
1568 return id
1569
1570 @multiple_try()
1572 """ control the status of a single job with it's cluster id """
1573 cmd = 'qstat | grep '+str(id)
1574 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1575 if not status:
1576 return 'F'
1577
1578 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s")
1579 stat = ''
1580 for line in status.stdout.read().decode().split('\n'):
1581 if not line:
1582 continue
1583 line = line.strip()
1584 try:
1585 groups = pat.search(line).groups()
1586 except:
1587 raise ClusterManagmentError('bad syntax for stat: \n\"%s\"' % line)
1588 if groups[0] != id: continue
1589 stat = groups[1]
1590 if not stat:
1591 return 'F'
1592 if stat in self.idle_tag:
1593 return 'I'
1594 if stat in self.running_tag:
1595 return 'R'
1596
1597 @multiple_try()
1599 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)"""
1600 if not self.submitted_ids:
1601 return 0, 0, 0, 0
1602 idle, run, fail = 0, 0, 0
1603 ongoing = []
1604 for statusflag in ['p', 'r', 'sh']:
1605 cmd = 'qstat -s %s' % statusflag
1606 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1607
1608 pat = re.compile("^(\d+)")
1609 for line in status.stdout.read().decode().split('\n'):
1610 line = line.strip()
1611 try:
1612 id = pat.search(line).groups()[0]
1613 except Exception:
1614 pass
1615 else:
1616 if id not in self.submitted_ids:
1617 continue
1618 ongoing.append(id)
1619 if statusflag == 'p':
1620 idle += 1
1621 if statusflag == 'r':
1622 run += 1
1623 if statusflag == 'sh':
1624 fail += 1
1625 for id in list(self.submitted_ids):
1626 if id not in ongoing:
1627 self.check_termination(id)
1628
1629
1630 return idle, run, self.submitted - idle - run - fail, fail
1631
1632 @multiple_try()
1633 - def remove(self, *args, **opts):
1634 """Clean the jobs on the cluster"""
1635
1636 if not self.submitted_ids:
1637 return
1638 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1639 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1640 self.submitted_ids = []
1641
1643 """start a computation and not wait for it to finish.
1644 this fonction returns a lock which is locked as long as the job is
1645 running."""
1646
1647 mc = MultiCore(1)
1648 mc.submit(exe, argument, cwd, stdout, **opt)
1649 mc.need_waiting = True
1650 return mc.lock
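# Illustrative usage (not part of the original module; the helper's def line is missing
# from this listing and the names below are hypothetical). The returned object is the
# MultiCore lock, a threading.Event that the worker thread sets once the job has
# completed, so a caller can do, for instance:
#
#   lock = launch_in_background('./run.sh', argument=['1'], cwd=run_dir)
#   # ... do other work ...
#   lock.wait()   # block until the background job is done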
1651
1654 """Basic class for dealing with cluster submission"""
1655
1656 name = 'slurm'
1657 job_id = 'SLURM_JOBID'
1658 idle_tag = ['Q','PD','S','CF']
1659 running_tag = ['R', 'CG']
1660 complete_tag = ['C']
1661 identifier_length = 8
1662
1663 @multiple_try()
1664 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1665 required_output=[], nb_submit=0):
1666 """Submit a job prog to a SLURM cluster"""
1667
1668 me_dir = self.get_jobs_identifier(cwd, prog)
1669
1670
1671 if cwd is None:
1672 cwd = os.getcwd()
1673 if stdout is None:
1674 stdout = '/dev/null'
1675 if stderr is None:
1676 stderr = '/dev/null'
1677 elif stderr == -2:
1678 stderr = stdout
1679 if log is None:
1680 log = '/dev/null'
1681
1682 command = ['sbatch', '-o', stdout,
1683 '-J', me_dir,
1684 '-e', stderr, prog] + argument
1685
1686 if self.cluster_queue and self.cluster_queue != 'None':
1687 command.insert(1, '-p')
1688 command.insert(2, self.cluster_queue)
1689
1690 a = misc.Popen(command, stdout=subprocess.PIPE,
1691 stderr=subprocess.STDOUT,
1692 stdin=subprocess.PIPE, cwd=cwd)
1693
1694 output = a.communicate()
1695 output_arr = output[0].decode().split(' ')
1696 id = output_arr[3].rstrip()
1697
1698 if not id.isdigit():
1699 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
1700 % (output[0] + '\n' + output[1]))
1701
1702 self.submitted += 1
1703 self.submitted_ids.append(id)
1704 return id
1705
1706 @multiple_try()
1708 """ control the status of a single job with it's cluster id """
1709 cmd = 'squeue -j '+str(id)
1710 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1711 stderr=open(os.devnull,'w'))
1712
1713 for line in status.stdout:
1714 line = line.decode().strip()
1715 if 'Invalid' in line:
1716 return 'F'
1717 elif line.startswith(str(id)):
1718 status = line.split()[4]
1719 if status in self.idle_tag:
1720 return 'I'
1721 elif status in self.running_tag:
1722 return 'R'
1723 return 'F'
1724
1725 @multiple_try()
1727 """ control the status of a single job with it's cluster id """
1728 cmd = "squeue"
1729 pstatus = misc.Popen([cmd], stdout=subprocess.PIPE)
1730
1731 me_dir = self.get_jobs_identifier(me_dir)
1732
1733 idle, run, fail = 0, 0, 0
1734 ongoing=[]
1735 for line in pstatus.stdout:
1736 line = line.decode()
1737 if me_dir in line:
1738 id, _, _,_ , status,_ = line.split(None,5)
1739 ongoing.append(id)
1740 if status in self.idle_tag:
1741 idle += 1
1742 elif status in self.running_tag:
1743 run += 1
1744 elif status in self.complete_tag:
1745 status = self.check_termination(id)
1746 if status == 'wait':
1747 run += 1
1748 elif status == 'resubmit':
1749 idle += 1
1750 else:
1751 fail += 1
1752
1753
1754 for id in list(self.submitted_ids):
1755 if id not in ongoing:
1756 status = self.check_termination(id)
1757 if status == 'wait':
1758 run += 1
1759 elif status == 'resubmit':
1760 idle += 1
1761
1762
1763 return idle, run, self.submitted - (idle+run+fail), fail
1764
1765 @multiple_try()
1766 - def remove(self, *args, **opts):
1767 """Clean the jobs on the cluster"""
1768
1769 if not self.submitted_ids:
1770 return
1771 cmd = "scancel %s" % ' '.join(self.submitted_ids)
1772 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1773 self.submitted_ids = []
1774
1776 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """
1777
1778 name= 'htcaas'
1779 job_id = 'HTCAAS_JOBID'
1780 idle_tag = ['waiting']
1781 running_tag = ['preparing','running']
1782 complete_tag = ['done']
1783
1784 @store_input()
1785 @multiple_try()
1786 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
1787 log=None, input_files=[], output_files=[], required_output=[],
1788 nb_submit=0):
1789 """Submit the HTCaaS job on the cluster with NO SHARE DISK
1790 input/output file should be given as relative to CWd
1791 """
1792
1793 cur_usr = os.getenv('USER')
1794
1795 if cwd is None:
1796 cwd = os.getcwd()
1797
1798 cwd_cp = cwd.rsplit("/",2)
1799
1800 if not stdout is None:
1801 print("stdout: %s" % stdout)
1802
1803 if not os.path.exists(prog):
1804 prog = os.path.join(cwd, prog)
1805
1806 if not required_output and output_files:
1807 required_output = output_files
1808
1809 logger.debug(prog)
1810 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog :
1811 cwd_arg = cwd+"/arguments"
1812 temp = ' '.join([str(a) for a in argument])
1813 arg_cmd="echo '"+temp+"' > " + cwd_arg
1814 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)]
1815 if argument :
1816 command.extend(['-a ', '='.join([str(a) for a in argument])])
1817 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1818 id = a.stdout.read().decode().strip()
1819
1820 else:
1821 cwd_arg = cwd+"/arguments"
1822 temp = ' '.join([str(a) for a in argument])
1823 temp_file_name = "sub." + os.path.basename(prog)
1824 text = """#!/bin/bash
1825 MYPWD=%(cwd)s
1826 cd $MYPWD
1827 input_files=(%(input_files)s )
1828 for i in ${input_files[@]}
1829 do
1830 chmod -f +x $i
1831 done
1832 /bin/bash %(prog)s %(arguments)s > %(stdout)s
1833 """
1834 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog,
1835 'arguments': ' '.join([str(a) for a in argument]),
1836 'program': ' ' if '.py' in prog else 'bash'}
1837
1838
1839 new_prog = pjoin(cwd, temp_file_name)
1840 open(new_prog, 'w').write(text % dico)
1841 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
1842 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name]
1843 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1844 id = a.stdout.read().decode().strip()
1845 logger.debug(id)
1846
1847 nb_try=0
1848 nb_limit=5
1849 if not id.isdigit() :
1850 print("[ID is not digit]:" + id)
1851
1852 while not id.isdigit() :
1853 nb_try+=1
1854 print("[fail_retry]:"+ nb_try)
1855 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1856 id = a.stdout.read().decode().strip()
1857 if nb_try > nb_limit :
1858 raise ClusterManagmentError('fail to submit to the HTCaaS cluster: \n %s' % id)
1859 break
1860
1861 self.submitted += 1
1862 self.submitted_ids.append(id)
1863
1864 return id
1865
1866 @multiple_try(nb_try=10, sleep=5)
1868 """ control the status of a single job with it's cluster id """
1869
1870 if id == 0 :
1871 status_out ='C'
1872 else :
1873 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status "
1874 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE,
1875 stderr=subprocess.PIPE)
1876 error = status.stderr.read().decode()
1877 if status.returncode or error:
1878 raise ClusterManagmentError('htcaas-job-submit returns error: %s' % error)
1879 status_out= status.stdout.read().decode().strip()
1880 status_out= status_out.split(":",1)[1]
1881 if status_out == 'waiting':
1882 status_out='I'
1883 elif status_out == 'preparing' or status_out == 'running':
1884 status_out = 'R'
1885 elif status_out != 'done':
1886 status_out = 'F'
1887 elif status_out == 'done':
1888 status_out = 'C'
1889
1890 return status_out
1891
1892 @multiple_try()
1894 """ control the status of a single job with it's cluster id """
1895 if not self.submitted_ids:
1896 logger.debug("self.submitted_ids not exists")
1897 return 0, 0, 0, 0
1898
1899 ongoing = []
1900 idle, run, fail = 0, 0, 0
1901
1902 start = self.submitted_ids[0]
1903 end = self.submitted_ids[-1]
1904
1905 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end)
1906 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1907
1908 for line in status.stdout:
1909 line = line.decode()
1910 status2 = line.split()[-1]
1911 if status2 != 'null' or line.split()[0].strip() != '0':
1912 ongoing.append(line.split()[0].strip())
1913 logger.debug("["+line.split()[0].strip()+"]"+status2)
1914 if status2 == 'null' or line.split()[0].strip() == '0':
1915 idle += 1
1916 elif status2 in self.idle_tag:
1917 idle += 1
1918 elif status2 in self.running_tag:
1919 run += 1
1920 elif status2 in self.complete_tag:
1921 if not self.check_termination(line.split()[0]):
1922 idle +=1
1923 else:
1924 fail += 1
1925
1926 return idle, run, self.submitted - (idle+run+fail), fail
1927
1928 @multiple_try()
1929 - def remove(self, *args, **opts):
1930 """Clean the jobson the cluster"""
1931
1932 if not self.submitted_ids:
1933 return
1934 for i in range(len(self.submitted_ids)):
1935 cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i]
1936 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1937
1939 """Class for dealing with cluster submission on a HTCaaS cluster without GPFS """
1940
1941 name= 'htcaas2'
1942 job_id = 'HTCAAS2_JOBID'
1943 idle_tag = ['waiting']
1944 running_tag = ['preparing','running']
1945 complete_tag = ['done']
1946
1947 @store_input()
1948 @multiple_try()
1949 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
1950 log=None, input_files=[], output_files=[], required_output=[],
1951 nb_submit=0):
1952
1953 """Submit the HTCaaS job on the cluster with NO SHARE DISK
1954 input/output file should be given as relative to CWD
1955 """
1956 if cwd is None:
1957 cwd = os.getcwd()
1958
1959 if not os.path.exists(prog):
1960 prog = os.path.join(cwd, prog)
1961
1962 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog :
1963 if cwd or prog :
1964 self.submitted_dirs.append(cwd)
1965 self.submitted_exes.append(prog)
1966 else:
1967 logger.debug("cwd and prog not exist->"+cwd+" / "+ os.path.basename(prog))
1968
1969 if argument :
1970 self.submitted_args.append('='.join([str(a) for a in argument]))
1971
1972 if cwd or prog :
1973 self.submitted += 1
1974 id = self.submitted
1975 self.submitted_ids.append(id)
1976 else:
1977 logger.debug("cwd and prog are not exist! ")
1978 id = 0
1979
1980 else:
1981 temp_file_name = "sub."+ os.path.basename(prog)
1982 text = """#!/bin/bash
1983 MYPWD=%(cwd)s
1984 cd $MYPWD
1985 input_files=(%(input_files)s )
1986 for i in ${input_files[@]}
1987 do
1988 chmod -f +x $i
1989 done
1990 /bin/bash %(prog)s %(arguments)s > %(stdout)s
1991 """
1992 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog,
1993 'arguments': ' '.join([str(a) for a in argument]),
1994 'program': ' ' if '.py' in prog else 'bash'}
1995
1996 new_prog = pjoin(cwd, temp_file_name)
1997 open(new_prog, 'w').write(text % dico)
1998 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
1999 command = ['htcaas-mgjob-submit','-d',cwd,'-e',new_prog]
2000 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
2001 id = a.stdout.read().decode().strip()
2002 logger.debug("[mode2]-["+str(id)+"]")
2003 if cwd and prog :
2004 self.submitted += 1
2005 self.submitted_ids.append(id)
2006 else:
2007 logger.debug("cwd and prog are not exist! ")
2008 id = 0
2009
2010 return id
2011
2012 @multiple_try()
2056
2057
2058 @multiple_try(nb_try=10, sleep=5)
2060 """ control the status of a single job with it's cluster id """
2061
2062 if self.submitted == self.submitted_ids[-1] :
2063 id = self.metasubmit(self)
2064 tempid = self.submitted_ids[-1]
2065 self.submitted_ids.remove(self.submitted_ids[-1])
2066 self.submitted_ids.append(id)
2067 logger.debug(str(id)+" // "+str(self.submitted_ids[-1]))
2068
2069 if id == 0 :
2070 status_out ='C'
2071 else:
2072 cmd = 'htcaas-job-status -m '+ str(id) + " -s | grep Status "
2073 status = misc.Popen([cmd],shell=True,stdout=subprocess.PIPE,
2074 stderr=subprocess.PIPE)
2075 error = status.stderr.read().decode()
2076 if status.returncode or error:
2077 raise ClusterManagmentError('htcaas-job-status returns error: %s' % error)
2078 status_out= status.stdout.read().decode().strip()
2079 status_out= status_out.split(":",1)[1]
2080 logger.debug("[["+str(id)+"]]"+status_out)
2081 if status_out == 'waiting':
2082 status_out='I'
2083 elif status_out == 'preparing' or status_out == 'running':
2084 status_out = 'R'
2085 elif status_out != 'done':
2086 status_out = 'F'
2087 elif status_out == 'done':
2088 status_out = 'C'
2089 self.submitted -= 1
2090
2091 return status_out
2092
2093 @multiple_try()
2095 """ control the status of a single job with it's cluster id """
2096 if not self.submitted_ids:
2097 logger.debug("self.submitted_ids not exists")
2098 return 0, 0, 0, 0
2099
2100 if "//" in me_dir :
2101 if int(me_dir.split("//")[0]) < int(me_dir.split("//")[1]) :
2102 start = me_dir.split("//")[0]
2103 end = me_dir.split("//")[1]
2104 else :
2105 start = me_dir.split("//")[1]
2106 end = me_dir.split("//")[0]
2107 elif "/" in me_dir :
2108 start = 0
2109 end = 0
2110 elif me_dir.isdigit():
2111 start = me_dir
2112 end = me_dir
2113 elif not me_dir.isdigit():
2114 me_dir = self.submitted_ids[0]
2115 logger.debug("Meta_ID is not digit(control), self.submitted_ids[0]: "+str(me_dir) )
2116
2117 ongoing = []
2118 idle, run, fail, done = 0, 0, 0, 0
2119
2120 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end) +" -ac"
2121 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
2122
2123 for line in status.stdout:
2124 line = line.decode()
2125 status2 = line.split()[-1]
2126 if status2 != 'null' or line.split()[0].strip() != '0':
2127 ongoing.append(str(line.split()[0].strip())+"-"+str(line.split()[1].strip()))
2128 logger.debug("["+line.split()[0].strip()+"-"+line.split()[1].strip()+"]"+status2)
2129
2130 if status2 == 'null' or line.split()[0].strip() == '0':
2131 idle += 1
2132 elif status2 in self.idle_tag:
2133 idle += 1
2134 elif status2 in self.running_tag:
2135 run += 1
2136 elif status2 in self.complete_tag:
2137 done += 1
2138 self.submitted -= 1
2139 if not self.check_termination(line.split()[1]):
2140 idle +=1
2141 else:
2142 fail += 1
2143
2144 return idle, run, self.submitted - (idle+run+fail), fail
2145
2146 @multiple_try()
2147 - def remove(self, *args, **opts):
2148 """Clean the jobson the cluster"""
2149
2150 if not self.submitted_ids:
2151 return
2152 id = self.submitted_ids[0]
2153 if id:
2154 cmd = "htcaas-job-cancel -m %s" % str(id)
2155 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
2156
2157 from_name = {'condor':CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
2158 'lsf': LSFCluster, 'ge':GECluster, 'slurm': SLURMCluster,
2159 'htcaas':HTCaaSCluster, 'htcaas2':HTCaaS2Cluster}
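# Illustrative sketch (not part of the original module): a scheduler back-end is typically
# selected from this mapping according to the run options, e.g.
#
#   cluster_name = 'slurm'                                   # hypothetical value
#   mycluster = from_name[cluster_name](cluster_queue='madgraph',
#                                       cluster_type=cluster_name,
#                                       cluster_status_update=(600, 30))
#   job_id = mycluster.submit('./ajob1', cwd='SubProcesses/P0_dir')   # hypothetical paths
#   idle, run, finish, fail = mycluster.control('SubProcesses/P0_dir')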
2160
2161 onecore=MultiCore(1)
2162
2163