import subprocess
import logging
import os
import time
import re
import glob
import inspect
import sys

logger = logging.getLogger('madgraph.cluster')

try:
    from madgraph import MadGraph5Error
    import madgraph.various.misc as misc
except Exception, error:
    if __debug__:
        print str(error)
    from internal import MadGraph5Error
    import internal.misc as misc

pjoin = os.path.join

class ClusterManagmentError(MadGraph5Error):
    pass

class NotImplemented(MadGraph5Error):
    pass


multiple_try = misc.multiple_try
pjoin = os.path.join

def check_interupt(error=KeyboardInterrupt):
    """Decorator that removes the submitted jobs if the wrapped call is interrupted."""

    def deco_interupt(f):
        def deco_f_interupt(self, *args, **opt):
            try:
                return f(self, *args, **opt)
            except error:
                try:
                    self.remove(*args, **opt)
                except Exception:
                    pass
                raise error
        return deco_f_interupt
    return deco_interupt
61
74 return deco_f_store
75 return deco_store
76
78 """ This function checks whether compression of input files are necessary
79 given the running options given. """
80
81 if options['run_mode'] != 1 and options['cluster_temp_path'] is None:
82 return False
83 else:
84 return True
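
# Illustrative sketch (not part of the original module): need_transfer() only
# looks at the 'run_mode' and 'cluster_temp_path' entries of the option
# dictionary, e.g.
#
#     need_transfer({'run_mode': 1, 'cluster_temp_path': None})   # -> True
#     need_transfer({'run_mode': 2, 'cluster_temp_path': None})   # -> False
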

class Cluster(object):
    """Basic Class for all cluster type submission"""
    name = 'mother class'
    identifier_length = 14

    def __init__(self, *args, **opts):
        """Init the cluster"""

        self.submitted = 0
        self.submitted_ids = []
        self.finish = 0
        self.submitted_dirs = []
        self.submitted_exes = []
        self.submitted_args = []

        if 'cluster_queue' in opts:
            self.cluster_queue = opts['cluster_queue']
        else:
            self.cluster_queue = 'madgraph'
        if 'cluster_temp_path' in opts:
            self.temp_dir = opts['cluster_temp_path']
        else:
            self.temp_dir = None
        # keep the default status-update delays unless they are overwritten in opts
        self.options = {'cluster_status_update': (600, 30)}
        for key, value in opts.items():
            self.options[key] = value
        self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0
        self.cluster_retry_wait = float(opts['cluster_retry_wait']) if 'cluster_retry_wait' in opts else 300
        self.retry_args = {}

        self.packet = {}
        self.id_to_packet = {}

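    # Illustrative sketch (not from the original code): a concrete scheduler is
    # usually obtained through the `from_name` dictionary defined at the end of
    # this module, e.g.
    #
    #     cluster = from_name['condor'](cluster_queue='madgraph',
    #                                   cluster_nb_retry=1,
    #                                   cluster_retry_wait=300)
    #     job_id = cluster.submit('run.sh', cwd='/tmp')
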
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        """How to make one submission. Return status id on the cluster."""
        raise NotImplemented, 'No implementation of how to submit a job to cluster \'%s\'' % self.name
124
125
    @store_input()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """How to make one submission. Return status id on the cluster.
        NO SHARED DISK (input/output files are shipped to/from the node)."""
132
133 if cwd is None:
134 cwd = os.getcwd()
135 if not os.path.exists(prog):
136 prog = os.path.join(cwd, prog)
137
138 if not required_output and output_files:
139 required_output = output_files
140
141 if not hasattr(self, 'temp_dir') or not self.temp_dir or \
142 (input_files == [] == output_files):
143 return self.submit(prog, argument, cwd, stdout, stderr, log,
144 required_output=required_output, nb_submit=nb_submit)
145
146 if not input_files and not output_files:
147
148 return self.submit(prog, argument, cwd, stdout, stderr, log,
149 required_output=required_output, nb_submit=nb_submit)
150
151 if cwd is None:
152 cwd = os.getcwd()
153 if not os.path.exists(prog):
154 prog = os.path.join(cwd, prog)
155 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument)
156
157 text = """#!/bin/bash
158 MYTMP=%(tmpdir)s/run$%(job_id)s
159 MYPWD=%(cwd)s
160 mkdir -p $MYTMP
161 cd $MYPWD
162 input_files=( %(input_files)s )
163 for i in ${input_files[@]}
164 do
165 cp -R -L $i $MYTMP
166 done
167 cd $MYTMP
168 echo '%(arguments)s' > arguments
169 chmod +x ./%(script)s
170 %(program)s ./%(script)s %(arguments)s
171 exit=$?
172 output_files=( %(output_files)s )
173 for i in ${output_files[@]}
174 do
175 cp -r $MYTMP/$i $MYPWD
176 done
177 # if [ "$exit" -eq "0" ]
178 # then
179 rm -rf $MYTMP
180 # fi
181 """
182
183 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog),
184 'cwd': cwd, 'job_id': self.job_id,
185 'input_files': ' '.join(input_files + [prog]),
186 'output_files': ' '.join(output_files),
187 'arguments': ' '.join([str(a) for a in argument]),
188 'program': ' ' if '.py' in prog else 'bash'}
189
190
191 new_prog = pjoin(cwd, temp_file_name)
192 open(new_prog, 'w').write(text % dico)
193 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
194
195 return self.submit(new_prog, argument, cwd, stdout, stderr, log,
196 required_output=required_output, nb_submit=nb_submit)
197
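    # Illustrative sketch (not from the original code): with a non-shared
    # scratch disk, submit2 wraps the job in a small bash script that copies
    # the input files to `cluster_temp_path`, runs there and copies the
    # requested output back, e.g.
    #
    #     cluster.submit2('ajob1', argument=['0'], cwd='/work/SubProcesses/P1',
    #                     input_files=['madevent', 'input_app.txt'],
    #                     output_files=['results.dat'])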
198
    def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                       log=None, input_files=[], output_files=[], required_output=[],
                       nb_submit=0, packet_member=None):
        """Wrap the cluster submission in a cluster-independent way.
        This method should not be overwritten (except for DAG-type submission)."""
204
205 id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files,
206 output_files, required_output, nb_submit)
207
208
209 if not packet_member:
210 return id
211 else:
212 if isinstance(packet_member, Packet):
213 self.id_to_packet[id] = packet_member
214 packet_member.put(id)
215 if packet_member.tag not in self.packet:
216 self.packet[packet_member.tag] = packet_member
217 else:
218 if packet_member in self.packet:
219 packet = self.packet[packet_member]
220 packet.put(id)
221 self.id_to_packet[id] = packet
222 return id
223
225 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)"""
226 if not self.submitted_ids:
227 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
228 idle, run, fail = 0, 0, 0
229 for pid in self.submitted_ids[:]:
            status = self.control_one_job(pid)
231 if status == 'I':
232 idle += 1
233 elif status == 'R':
234 run += 1
235 elif status == 'F':
236 self.finish +=1
237 self.submitted_ids.remove(pid)
238 else:
239 fail += 1
240
241 return idle, run, self.finish, fail
242
244 """ control the status of a single job with it's cluster id """
245 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
246
248 """get a unique run_name for all the jobs helps to identify the runs
249 in the controller for some cluster."""
250
251 if second_path:
252 path = os.path.realpath(pjoin(path, second_path))
253 elif not os.path.exists(path):
254 return path
255
256 if 'SubProcesses' in path:
257 target = path.rsplit('/SubProcesses',1)[0]
258 elif 'MCatNLO' in path:
259 target = path.rsplit('/MCatNLO',1)[0]
260 elif second_path:
261 target=path
262 logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problem.")
263 elif 'PY8_parallelization' in path:
264 target = path.rsplit('/PY8_parallelization',1)[0]
265 else:
266 target = path
267
268 if target.endswith('/'):
269 target = target[:-1]
270
271 target = misc.digest(target)[-self.identifier_length:]
272 if not target[0].isalpha():
273 target = 'a' + target[1:]
274
275 return target
276
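    # Illustrative sketch (not from the original code): the job identifier is a
    # short digest of the run directory, used as the scheduler job name, e.g.
    #
    #     cluster.get_jobs_identifier('/work/MyProc/SubProcesses/P1_gg_ttx')
    #     # -> 14-character alphanumeric tag derived from '/work/MyProc'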
277
278 @check_interupt()
    def wait(self, me_dir, fct, minimal_job=0, update_first=None):
        """Wait for all jobs to finish.
        If minimal_job is set, return as soon as idle + run drops below that number."""
282
283
284 mode = 1
285 nb_iter = 0
286 nb_short = 0
287 change_at = 5
288
289 if update_first:
290 idle, run, finish, fail = self.control(me_dir)
291 update_first(idle, run, finish)
292
293
294 longtime, shorttime = self.options['cluster_status_update']
295
296 nb_job = 0
297
298 if self.options['cluster_type'] == 'htcaas2':
299 me_dir = self.metasubmit(self)
300
301 while 1:
302 old_mode = mode
303 nb_iter += 1
304 idle, run, finish, fail = self.control(me_dir)
305 if nb_job:
306 if idle + run + finish + fail != nb_job:
307 nb_job = idle + run + finish + fail
308 nb_iter = 1
309 else:
310 nb_job = idle + run + finish + fail
311 if fail:
312 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team')
313 if idle + run == 0:
314
315 logger.info('All jobs finished')
316 fct(idle, run, finish)
317 break
318 if idle + run < minimal_job:
319 return
320 fct(idle, run, finish)
321
322 if nb_iter < change_at:
323 mode = 1
324 elif idle < run:
325 if old_mode == 0:
326 if nb_short:
327 mode = 0
328
329 elif idle:
330 if nb_iter > change_at + int(longtime)//shorttime:
331 mode = 0
332 else:
333 mode = 1
334 nb_short =0
335 else:
336 mode = 1
337 nb_short = 0
338 elif old_mode == 1:
339 nb_short +=1
340 if nb_short > 3* max(change_at, int(longtime)//shorttime):
341 mode = 0
342 else:
343 mode = 0
344
345
346 if old_mode > mode:
347 logger.info('''Start to wait %ss between checking status.
348 Note that you can change this time in the configuration file.
349 Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0])
350
351
352 if mode == 0:
353 try:
354 time.sleep(self.options['cluster_status_update'][0])
355 except KeyboardInterrupt:
356 logger.info('start to update the status')
357 nb_iter = min(0, change_at -2)
358 nb_short = 0
359 else:
360 time.sleep(self.options['cluster_status_update'][1])
361
362
363 self.submitted = 0
364 self.submitted_ids = []
365
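    # Illustrative sketch (not from the original code): wait() polls control()
    # and feeds the counters to the callback until nothing is idle or running,
    # e.g.
    #
    #     def report(idle, run, finish):
    #         logger.info('idle: %s  running: %s  done: %s', idle, run, finish)
    #     cluster.wait(me_dir, report)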
367 """Check the termination of the jobs with job_id and relaunch it if needed."""
368
369
370 if job_id not in self.retry_args:
371 if job_id in self.id_to_packet:
372 nb_in_packet = self.id_to_packet[job_id].remove_one()
373 if nb_in_packet == 0:
374
375 packet = self.id_to_packet[job_id]
376
377 packet.queue.join()
378
379 packet.fct(*packet.args)
380 del self.id_to_packet[job_id]
381 return 'resubmit'
382 else:
383 return True
384
385 args = self.retry_args[job_id]
386 if 'time_check' in args:
387 time_check = args['time_check']
388 else:
389 time_check = 0
390
391 for path in args['required_output']:
392 if args['cwd']:
393 path = pjoin(args['cwd'], path)
394
395 if not (os.path.exists(path) and os.stat(path).st_size != 0) :
396 break
397 else:
398
399 if time_check > 0:
400 logger.info('Job %s Finally found the missing output.' % (job_id))
401 del self.retry_args[job_id]
402 self.submitted_ids.remove(job_id)
403
404 if job_id in self.id_to_packet:
405 nb_in_packet = self.id_to_packet[job_id].remove_one()
406 if nb_in_packet == 0:
407
408 packet = self.id_to_packet[job_id]
409
410 packet.queue.join()
411
412 packet.fct(*packet.args)
413 del self.id_to_packet[job_id]
414 return 'resubmit'
415
416 return 'done'
417
418 if time_check == 0:
419 logger.debug('''Job %s: missing output:%s''' % (job_id,path))
420 args['time_check'] = time.time()
421 return 'wait'
422 elif self.cluster_retry_wait > time.time() - time_check:
423 return 'wait'
424
425
426 if self.nb_retry < 0:
427 logger.critical('''Fail to run correctly job %s.
428 with option: %s
429 file missing: %s''' % (job_id, args, path))
430 raw_input('press enter to continue.')
431 elif self.nb_retry == 0:
432 logger.critical('''Fail to run correctly job %s.
433 with option: %s
434 file missing: %s.
435 Stopping all runs.''' % (job_id, args, path))
436 self.remove()
437 elif args['nb_submit'] >= self.nb_retry:
438 logger.critical('''Fail to run correctly job %s.
439 with option: %s
440 file missing: %s
441 Fails %s times
442 No resubmition. ''' % (job_id, args, path, args['nb_submit']))
443 self.remove()
444 else:
445 args['nb_submit'] += 1
446 logger.warning('resubmit job (for the %s times)' % args['nb_submit'])
447 del self.retry_args[job_id]
448 self.submitted_ids.remove(job_id)
449 if 'time_check' in args:
450 del args['time_check']
451 if job_id in self.id_to_packet:
452 self.id_to_packet[job_id].remove_one()
453 args['packet_member'] = self.id_to_packet[job_id]
454 del self.id_to_packet[job_id]
455 self.cluster_submit(**args)
456 else:
457 self.submit2(**args)
458 return 'resubmit'
459 return 'done'
460
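    # Illustrative sketch (not from the original code): `required_output` is
    # what check_termination() verifies; a job only counts as done once every
    # listed file exists and is non-empty, e.g.
    #
    #     cluster.submit2('ajob1', cwd='/work/P1',
    #                     output_files=['results.dat'],
    #                     required_output=['results.dat'])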
461 @check_interupt()
    def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
                        stderr=None, log=None, required_output=[], nb_submit=0,
                        input_files=[], output_files=[]):
465 """launch one job on the cluster and wait for it"""
466
467 special_output = False
468 if stderr == -2 and stdout:
469
470 special_output = True
471 stderr = stdout + '.err'
472
473 id = self.submit2(prog, argument, cwd, stdout, stderr, log,
474 required_output=required_output, input_files=input_files,
475 output_files=output_files)
476
477 if self.options['cluster_type']=='htcaas2':
478 if self.submitted == self.submitted_ids[-1]:
479 id = self.metasubmit(self)
480
481 frame = inspect.currentframe()
482 args, _, _, values = inspect.getargvalues(frame)
483 args = dict([(i, values[i]) for i in args if i != 'self'])
484 self.retry_args[id] = args
485
486 nb_wait=0
487 while 1:
488 nb_wait+=1
489 status = self.control_one_job(id)
490 if not status in ['R','I']:
491 status = self.check_termination(id)
492 if status in ['wait']:
493 time.sleep(30)
494 continue
495 elif status in ['resubmit']:
496 id = self.submitted_ids[0]
497 time.sleep(30)
498 continue
499
500 time.sleep(30)
501 break
502 time.sleep(self.options['cluster_status_update'][1])
503
        if required_output:
            status = self.check_termination(id)
            if status in ('wait', 'resubmit'):
                # the job left the queue but some required output is still missing
                logger.warning('job %s terminated but required output is missing' % id)
510
511
512 if special_output:
513
514
515 for i in range(5):
516 if os.path.exists(stdout):
517 if not os.path.exists(stderr):
518 time.sleep(5)
519 if os.path.exists(stderr):
520 err_text = open(stderr).read()
521 if not err_text:
522 return
523 logger.warning(err_text)
524 text = open(stdout).read()
525 open(stdout,'w').write(text + err_text)
526 else:
527 return
528 time.sleep(10)
529
    def remove(self, *args, **opts):
        """Default implementation: this cluster type does not support job removal."""
        logger.warning("""This cluster does not support job removal,
            the jobs are still running on the cluster.""")
534
541 """ an object for handling packet of job, it is designed to be thread safe
542 """
543
544 - def __init__(self, name, fct, args, opts={}):
545 import Queue
546 import threading
547 self.queue = Queue.Queue()
548 self.tag = name
549 self.fct = fct
550 self.args = args
551 self.opts = opts
552 self.done = threading.Event()
553
554 - def put(self, *args, **opts):
556
557 append = put
558
563
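# Illustrative sketch (not from the original code): a Packet groups several
# jobs so that a follow-up function runs once the whole group has finished.
# `combine_results` below is a hypothetical callback.
#
#     packet = Packet('P1_combine', combine_results, ('/work/P1',))
#     for i in range(4):
#         cluster.cluster_submit('ajob%i' % i, cwd='/work/P1',
#                                packet_member=packet)
#     # when the last of the four jobs terminates, check_termination() calls
#     # combine_results('/work/P1')
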
565 """class for dealing with the submission in multiple node"""
566
567 job_id = "$"
568
570 """Init the cluster """
571
572
573 super(MultiCore, self).__init__(self, *args, **opt)
574
575 import Queue
576 import threading
577 import thread
578 self.queue = Queue.Queue()
579 self.done = Queue.Queue()
580 self.submitted = Queue.Queue()
581 self.stoprequest = threading.Event()
582 self.demons = []
583 self.nb_done =0
584 if 'nb_core' in opt:
585 self.nb_core = opt['nb_core']
586 elif isinstance(args[0],int):
587 self.nb_core = args[0]
588 else:
589 self.nb_core = 1
590 self.update_fct = None
591
592 self.lock = threading.Event()
593 self.pids = Queue.Queue()
594 self.done_pid = []
595 self.done_pid_queue = Queue.Queue()
596 self.fail_msg = None
597
598
599 for _ in range(self.nb_core):
600 self.start_demon()
601
602
    def start_demon(self):
        import threading
        t = threading.Thread(target=self.worker)
        t.daemon = True
        t.start()
        self.demons.append(t)
609
610
    def worker(self):
        import Queue
        import thread
614 while not self.stoprequest.isSet():
615 try:
616 args = self.queue.get()
617 tag, exe, arg, opt = args
618 try:
619
620 if isinstance(exe,str):
621 if os.path.exists(exe) and not exe.startswith('/'):
622 exe = './' + exe
623 if isinstance(opt['stdout'],str):
624 opt['stdout'] = open(opt['stdout'],'w')
625 if opt['stderr'] == None:
626 opt['stderr'] = subprocess.STDOUT
627 proc = misc.Popen([exe] + arg, **opt)
628 pid = proc.pid
629 self.pids.put(pid)
630 proc.wait()
631 if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet():
632 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \
633 (' '.join([exe]+arg), proc.returncode)
634 logger.warning(fail_msg)
635 self.stoprequest.set()
636 self.remove(fail_msg)
637
638
639
640
641 else:
642 pid = tag
643 self.pids.put(pid)
644
645
646 returncode = exe(*arg, **opt)
647 if returncode != 0:
648 logger.warning("fct %s does not return 0. Stopping the code in a clean way. The error was:\n%s", exe, returncode)
649 self.stoprequest.set()
650 self.remove("fct %s does not return 0:\n %s" % (exe, returncode))
651 except Exception,error:
652 self.fail_msg = sys.exc_info()
653 logger.warning(str(error))
654 self.stoprequest.set()
655 self.remove(error)
656
657 if __debug__:
658 raise self.fail_msg[0], self.fail_msg[1],self.fail_msg[2]
659
660 self.queue.task_done()
661 self.done.put(tag)
662 self.done_pid_queue.put(pid)
663
664 try:
665 self.lock.set()
666 except thread.error:
667 continue
668 except Queue.Empty:
669 continue
670
671
672
673
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
676 """submit a job on multicore machine"""
677
678 tag = (prog, tuple(argument), cwd, nb_submit)
679 if isinstance(prog, str):
680
681 opt = {'cwd': cwd,
682 'stdout':stdout,
683 'stderr': stderr}
684 self.queue.put((tag, prog, argument, opt))
685 self.submitted.put(1)
686 return tag
687 else:
688
689 self.queue.put((tag, prog, argument, {}))
690 self.submitted.put(1)
691 return tag
692
    def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
                        stderr=None, log=None, **opts):
695 """launch one job and wait for it"""
696 if isinstance(stdout, str):
697 stdout = open(stdout, 'w')
        if isinstance(stderr, str):
            stderr = open(stderr, 'w')
700 return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)
701
    def remove(self, error=None):
703 """Ensure that all thread are killed"""
704
705
706 self.stoprequest.set()
707 if error and not self.fail_msg:
708 self.fail_msg = error
709
710
711 while not self.done_pid_queue.empty():
712 pid = self.done_pid_queue.get()
713 self.done_pid.append(pid)
714
715
716 while not self.pids.empty():
717 pid = self.pids.get()
718 self.pids.task_done()
719 if isinstance(pid, tuple):
720 continue
721 if pid in self.done_pid:
722 continue
723 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \
724 % {'pid':pid} )
725 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} )
726
727
    def wait(self, me_dir, update_status, update_first=None):
        """Wait for all the jobs to be done. This function also checks that
        packet submissions are handled correctly (i.e. that the follow-up
        function is submitted)."""
731
732 import Queue
733 import threading
734
735 try:
736 last_status = (0, 0, 0)
737 sleep_time = 1
738 use_lock = True
739 first = True
740 while True:
741 force_one_more_loop = False
742
743
744
745 while self.done.qsize():
746 try:
747 tag = self.done.get(True, 1)
748 except Queue.Empty:
749 pass
750 else:
751 if self.id_to_packet and tuple(tag) in self.id_to_packet:
752 packet = self.id_to_packet[tuple(tag)]
753 remaining = packet.remove_one()
754 if remaining == 0:
755
756 packet.queue.join()
757 self.submit(packet.fct, packet.args)
758 force_one_more_loop = True
759 self.nb_done += 1
760 self.done.task_done()
761
762
763
764 Idle = self.queue.qsize()
765 Done = self.nb_done + self.done.qsize()
766 Running = max(0, self.submitted.qsize() - Idle - Done)
767
768 if Idle + Running <= 0 and not force_one_more_loop:
769 update_status(Idle, Running, Done)
770
771
772 self.queue.join()
773 break
774
775 if (Idle, Running, Done) != last_status:
776 if first and update_first:
777 update_first(Idle, Running, Done)
778 first = False
779 else:
780 update_status(Idle, Running, Done)
781 last_status = (Idle, Running, Done)
782
783
784 while not self.done_pid_queue.empty():
785 pid = self.done_pid_queue.get()
786 self.done_pid.append(pid)
787 self.done_pid_queue.task_done()
788
789
790
791 if use_lock:
792
793 use_lock = self.lock.wait(300)
794 self.lock.clear()
795 if not use_lock and Idle > 0:
796 use_lock = True
797 else:
798
799
800 time.sleep(sleep_time)
801 sleep_time = min(sleep_time + 2, 180)
802 if update_first:
803 update_first(Idle, Running, Done)
804
805 if self.stoprequest.isSet():
806 if isinstance(self.fail_msg, Exception):
807 raise self.fail_msg
808 elif isinstance(self.fail_msg, str):
809 raise Exception, self.fail_msg
810 else:
811 misc.sprint(self.fail_msg)
812 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]
813
814 try:
815 self.lock.clear()
816 except Exception:
817 pass
818 self.done = Queue.Queue()
819 self.done_pid = []
820 self.done_pid_queue = Queue.Queue()
821 self.nb_done = 0
822 self.submitted = Queue.Queue()
823 self.pids = Queue.Queue()
824 self.stoprequest.clear()
825
826 except KeyboardInterrupt:
827
828 if isinstance(self.fail_msg, Exception):
829 raise self.fail_msg
830 elif isinstance(self.fail_msg, str):
831 raise Exception, self.fail_msg
832 elif self.fail_msg:
833 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]
834
835 raise
836
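# Illustrative sketch (not from the original code): running two local shell
# commands in parallel with the MultiCore scheduler.
#
#     mc = MultiCore(nb_core=2)
#     mc.submit('/bin/echo', argument=['hello'], stdout='/tmp/out1.log')
#     mc.submit('/bin/echo', argument=['world'], stdout='/tmp/out2.log')
#     mc.wait('.', lambda idle, run, done: None)
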
838 """Basic class for dealing with cluster submission"""
839
840 name = 'condor'
841 job_id = 'CONDOR_ID'
842
843
844
845 @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
848 """Submit a job prog to a Condor cluster"""
849
850 text = """Executable = %(prog)s
851 output = %(stdout)s
852 error = %(stderr)s
853 log = %(log)s
854 %(argument)s
855 environment = CONDOR_ID=$(Cluster).$(Process)
856 Universe = vanilla
857 notification = Error
858 Initialdir = %(cwd)s
859 %(requirement)s
860 getenv=True
861 queue 1
862 """
863
864 if self.cluster_queue not in ['None', None]:
865 requirement = 'Requirements = %s=?=True' % self.cluster_queue
866 else:
867 requirement = ''
868
869 if cwd is None:
870 cwd = os.getcwd()
871 if stdout is None:
872 stdout = '/dev/null'
873 if stderr is None:
874 stderr = '/dev/null'
875 if log is None:
876 log = '/dev/null'
877 if not os.path.exists(prog):
878 prog = os.path.join(cwd, prog)
879 if argument:
880 argument = 'Arguments = %s' % ' '.join(argument)
881 else:
882 argument = ''
883
884
885 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
886 'stderr': stderr,'log': log,'argument': argument,
887 'requirement': requirement}
888
889
890 a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE,
891 stdin=subprocess.PIPE)
892 output, _ = a.communicate(text % dico)
893
894
895
896
897 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE)
898 try:
899 id = pat.search(output).groups()[0]
900 except:
901 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
902 % output
903 self.submitted += 1
904 self.submitted_ids.append(id)
905 return id
906
907 @store_input()
908 @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the job on the cluster with NO SHARED DISK.
        Input/output files should be given relative to cwd.
        """
915
916 if not required_output and output_files:
917 required_output = output_files
918
919 if (input_files == [] == output_files):
920 return self.submit(prog, argument, cwd, stdout, stderr, log,
921 required_output=required_output, nb_submit=nb_submit)
922
923 text = """Executable = %(prog)s
924 output = %(stdout)s
925 error = %(stderr)s
926 log = %(log)s
927 %(argument)s
928 should_transfer_files = YES
929 when_to_transfer_output = ON_EXIT
930 transfer_input_files = %(input_files)s
931 %(output_files)s
932 Universe = vanilla
933 notification = Error
934 Initialdir = %(cwd)s
935 %(requirement)s
936 getenv=True
937 queue 1
938 """
939
940 if self.cluster_queue not in ['None', None]:
941 requirement = 'Requirements = %s=?=True' % self.cluster_queue
942 else:
943 requirement = ''
944
945 if cwd is None:
946 cwd = os.getcwd()
947 if stdout is None:
948 stdout = '/dev/null'
949 if stderr is None:
950 stderr = '/dev/null'
951 if log is None:
952 log = '/dev/null'
953 if not os.path.exists(prog):
954 prog = os.path.join(cwd, prog)
955 if argument:
956 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument])
957 else:
958 argument = ''
959
960 if input_files:
961 input_files = ','.join(input_files)
962 else:
963 input_files = ''
964 if output_files:
965 output_files = 'transfer_output_files = %s' % ','.join(output_files)
966 else:
967 output_files = ''
968
969
970
971 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
972 'stderr': stderr,'log': log,'argument': argument,
973 'requirement': requirement, 'input_files':input_files,
974 'output_files':output_files}
975
976
977 a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE,
978 stdin=subprocess.PIPE)
979 output, _ = a.communicate(text % dico)
980
981
982
983
984 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE)
985 try:
986 id = pat.search(output).groups()[0]
987 except:
988 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
989 % output
990 self.submitted += 1
991 self.submitted_ids.append(id)
992 return id
993
994
995
996
997
    @multiple_try(nb_try=10, sleep=10)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1001 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"
1002 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1003 stderr=subprocess.PIPE)
1004
1005 error = status.stderr.read()
1006 if status.returncode or error:
1007 raise ClusterManagmentError, 'condor_q returns error: %s' % error
1008
1009 return status.stdout.readline().strip()
1010
    @check_interupt()
    @multiple_try(nb_try=10, sleep=10)
    def control(self, me_dir):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
1015
1016 if not self.submitted_ids:
1017 return 0, 0, 0, 0
1018
1019 packet = 15000
1020 idle, run, fail = 0, 0, 0
1021 ongoing = []
1022 for i in range(1+(len(self.submitted_ids)-1)//packet):
1023 start = i * packet
1024 stop = (i+1) * packet
1025 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \
1026 " -format \'%-2s\ ' \'ClusterId\' " + \
1027 " -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"
1028
1029 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1030 stderr=subprocess.PIPE)
1031 error = status.stderr.read()
1032 if status.returncode or error:
1033 raise ClusterManagmentError, 'condor_q returns error: %s' % error
1034
1035 for line in status.stdout:
1036 id, status = line.strip().split()
1037 ongoing.append(int(id))
1038 if status in ['I','U']:
1039 idle += 1
1040 elif status == 'R':
1041 run += 1
1042 elif status != 'C':
1043 fail += 1
1044
1045 for id in list(self.submitted_ids):
1046 if int(id) not in ongoing:
1047 status = self.check_termination(id)
1048 if status == 'wait':
1049 run += 1
1050 elif status == 'resubmit':
1051 idle += 1
1052
1053 return idle, run, self.submitted - (idle+run+fail), fail
1054
1055 @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""
1058
1059 if not self.submitted_ids:
1060 return
1061 cmd = "condor_rm %s" % ' '.join(self.submitted_ids)
1062
1063 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1064 self.submitted_ids = []
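
# Illustrative sketch (not from the original code): the Condor backend writes a
# plain submit description and pipes it to condor_submit, e.g.
#
#     condor = CondorCluster(cluster_queue=None)
#     job = condor.submit('run.sh', cwd='/work/P1', stdout='/work/P1/run.log')
#     condor.control_one_job(job)        # -> 'I', 'R', 'H', ...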
1065
1067 """Basic class for dealing with cluster submission"""
1068
1069 name = 'pbs'
1070 job_id = 'PBS_JOBID'
1071 idle_tag = ['Q']
1072 running_tag = ['T','E','R']
1073 complete_tag = ['C']
1074
1075 maximum_submited_jobs = 2500
1076
1077 @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
1080 """Submit a job prog to a PBS cluster"""
1081
1082 me_dir = self.get_jobs_identifier(cwd, prog)
1083
1084 if len(self.submitted_ids) > self.maximum_submited_jobs:
1085 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish))
1086 self.wait(me_dir, fct, self.maximum_submited_jobs)
1087
1088
1089 text = ""
1090 if cwd is None:
1091 cwd = os.getcwd()
1092 else:
1093 text = " cd %s;" % cwd
1094 if stdout is None:
1095 stdout = '/dev/null'
1096 if stderr is None:
1097 stderr = '/dev/null'
1098 elif stderr == -2:
1099 stderr = stdout
1100 if log is None:
1101 log = '/dev/null'
1102
1103 if not os.path.isabs(prog):
1104 text += "./%s" % prog
1105 else:
1106 text+= prog
1107
1108 if argument:
1109 text += ' ' + ' '.join(argument)
1110
1111 command = ['qsub','-o', stdout,
1112 '-N', me_dir,
1113 '-e', stderr,
1114 '-V']
1115
1116 if self.cluster_queue and self.cluster_queue != 'None':
1117 command.extend(['-q', self.cluster_queue])
1118
1119 a = misc.Popen(command, stdout=subprocess.PIPE,
1120 stderr=subprocess.STDOUT,
1121 stdin=subprocess.PIPE, cwd=cwd)
1122
1123 output = a.communicate(text)[0]
1124 id = output.split('.')[0]
1125 if not id.isdigit() or a.returncode !=0:
1126 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1127 % output
1128
1129 self.submitted += 1
1130 self.submitted_ids.append(id)
1131 return id
1132
    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1136 cmd = 'qstat '+str(id)
1137 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1138 stderr=subprocess.STDOUT)
1139
1140 for line in status.stdout:
1141 line = line.strip()
1142 if 'cannot connect to server' in line or 'cannot read reply' in line:
1143 raise ClusterManagmentError, 'server disconnected'
1144 if 'Unknown' in line:
1145 return 'F'
1146 elif line.startswith(str(id)):
1147 jobstatus = line.split()[4]
1148 else:
1149 jobstatus=""
1150
1151 if status.returncode != 0 and status.returncode is not None:
1152 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode
1153 if jobstatus in self.idle_tag:
1154 return 'I'
1155 elif jobstatus in self.running_tag:
1156 return 'R'
1157 return 'F'
1158
1159
    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
1163 cmd = "qstat"
1164 status = misc.Popen([cmd], stdout=subprocess.PIPE)
1165
1166 me_dir = self.get_jobs_identifier(me_dir)
1167
1168 ongoing = []
1169
1170 idle, run, fail = 0, 0, 0
1171 for line in status.stdout:
1172 if 'cannot connect to server' in line or 'cannot read reply' in line:
1173 raise ClusterManagmentError, 'server disconnected'
1174 if me_dir in line:
1175 ongoing.append(line.split()[0].split('.')[0])
1176 status2 = line.split()[4]
1177 if status2 in self.idle_tag:
1178 idle += 1
1179 elif status2 in self.running_tag:
1180 run += 1
1181 elif status2 in self.complete_tag:
1182 if not self.check_termination(line.split()[0].split('.')[0]):
1183 idle += 1
1184 else:
1185 fail += 1
1186
1187 if status.returncode != 0 and status.returncode is not None:
1188 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode
1189
1190 for id in list(self.submitted_ids):
1191 if id not in ongoing:
1192 status2 = self.check_termination(id)
1193 if status2 == 'wait':
1194 run += 1
1195 elif status2 == 'resubmit':
1196 idle += 1
1197
1198 return idle, run, self.submitted - (idle+run+fail), fail
1199
1200 @multiple_try()
    def remove(self, *args, **opts):
1202 """Clean the jobs on the cluster"""
1203
1204 if not self.submitted_ids:
1205 return
1206 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1207 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1208 self.submitted_ids = []
1209
1212 """Basic class for dealing with cluster submission"""
1213
1214
1215 name = 'sge'
1216 job_id = 'JOB_ID'
1217 idle_tag = ['qw', 'hqw','hRqw','w']
1218 running_tag = ['r','t','Rr','Rt']
1219 identifier_length = 10
1220
1222 """replace string for path issues"""
1223 location = os.path.realpath(location)
1224 homePath = os.getenv("HOME")
1225 if homePath:
1226 location = location.replace(homePath,'$HOME')
1227 return location
1228
1229 @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
1232 """Submit a job prog to an SGE cluster"""
1233
1234 me_dir = self.get_jobs_identifier(cwd, prog)
1235
1236
1237 if cwd is None:
1238
1239 cwd = self.def_get_path(os.getcwd())
1240 cwd1 = self.def_get_path(cwd)
1241 text = " cd %s;" % cwd1
1242 if stdout is None:
1243 stdout = '/dev/null'
1244 else:
1245 stdout = self.def_get_path(stdout)
1246 if stderr is None:
1247 stderr = '/dev/null'
1248 elif stderr == -2:
1249 stderr = stdout
1250 else:
1251 stderr = self.def_get_path(stderr)
1252
1253 if log is None:
1254 log = '/dev/null'
1255 else:
1256 log = self.def_get_path(log)
1257
1258 text += prog
1259 if argument:
1260 text += ' ' + ' '.join(argument)
1261
1262
1263
1264
1265 homePath = os.getenv("HOME")
1266 if homePath:
1267 text = text.replace(homePath,'$HOME')
1268
1269 logger.debug("!=== input %s" % text)
1270 logger.debug("!=== output %s" % stdout)
1271 logger.debug("!=== error %s" % stderr)
1272 logger.debug("!=== logs %s" % log)
1273
1274 command = ['qsub','-o', stdout,
1275 '-N', me_dir,
1276 '-e', stderr,
1277 '-V']
1278
1279 if self.cluster_queue and self.cluster_queue != 'None':
1280 command.extend(['-q', self.cluster_queue])
1281
1282 a = misc.Popen(command, stdout=subprocess.PIPE,
1283 stderr=subprocess.STDOUT,
1284 stdin=subprocess.PIPE, cwd=cwd)
1285
1286 output = a.communicate(text)[0]
1287 id = output.split(' ')[2]
1288 if not id.isdigit():
1289 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1290 % output
1291 self.submitted += 1
1292 self.submitted_ids.append(id)
1293 logger.debug(output)
1294
1295 return id
1296
    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1300
1301 cmd = 'qstat '
1302 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1303 for line in status.stdout:
1304
1305
1306
1307
1308
1309
1310 if str(id) in line:
1311 status = line.split()[4]
1312
1313 if status in self.idle_tag:
1314 return 'I'
1315 elif status in self.running_tag:
1316 return 'R'
1317 return 'F'
1318
    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
1322 cmd = "qstat "
1323 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1324
1325 me_dir = self.get_jobs_identifier(me_dir)
1326
1327 finished = list(self.submitted_ids)
1328
1329 idle, run, fail = 0, 0, 0
1330 for line in status.stdout:
1331 if me_dir in line:
1332 id,_,_,_,status = line.split()[:5]
1333 if status in self.idle_tag:
1334 idle += 1
1335 finished.remove(id)
1336 elif status in self.running_tag:
1337 run += 1
1338 finished.remove(id)
1339 else:
1340 logger.debug(line)
1341 fail += 1
1342 finished.remove(id)
1343
1344 for id in finished:
1345 self.check_termination(id)
1346
1347 return idle, run, self.submitted - (idle+run+fail), fail
1348
1349
1350
1351 @multiple_try()
    def remove(self, *args, **opts):
1353 """Clean the jobs on the cluster"""
1354
1355 if not self.submitted_ids:
1356 return
1357 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1358 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1359 self.submitted_ids = []
1360
1363 """Basic class for dealing with cluster submission"""
1364
1365 name = 'lsf'
1366 job_id = 'LSB_JOBID'
1367
1368 @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
1371 """Submit the job prog to an LSF cluster"""
1372
1373
1374 me_dir = self.get_jobs_identifier(cwd, prog)
1375
1376 text = ""
1377 command = ['bsub', '-C0', '-J', me_dir]
1378 if cwd is None:
1379 cwd = os.getcwd()
1380 else:
1381 text = " cd %s;" % cwd
1382 if stdout and isinstance(stdout, str):
1383 command.extend(['-o', stdout])
        if stderr and isinstance(stderr, str):
1385 command.extend(['-e', stderr])
1386 elif stderr == -2:
1387 pass
1388 if log is None:
1389 log = '/dev/null'
1390
1391 text += prog
1392 if argument:
1393 text += ' ' + ' '.join(argument)
1394
1395 if self.cluster_queue and self.cluster_queue != 'None':
1396 command.extend(['-q', self.cluster_queue])
1397
1398 a = misc.Popen(command, stdout=subprocess.PIPE,
1399 stderr=subprocess.STDOUT,
1400 stdin=subprocess.PIPE, cwd=cwd)
1401
1402 output = a.communicate(text)[0]
1403
1404 try:
1405 id = output.split('>',1)[0].split('<')[1]
1406 except:
1407 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1408 % output
1409 if not id.isdigit():
1410 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1411 % output
1412 self.submitted += 1
1413 self.submitted_ids.append(id)
1414 return id
1415
1416
    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1420
1421 cmd = 'bjobs '+str(id)
1422 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1423
1424 for line in status.stdout:
1425 line = line.strip().upper()
1426 if 'JOBID' in line:
1427 continue
1428 elif str(id) not in line:
1429 continue
1430 status = line.split()[2]
1431 if status == 'RUN':
1432 return 'R'
1433 elif status == 'PEND':
1434 return 'I'
1435 elif status == 'DONE':
1436 return 'F'
1437 else:
1438 return 'H'
1439 return 'F'
1440
    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
1444
1445 if not self.submitted_ids:
1446 return 0, 0, 0, 0
1447
1448 cmd = "bjobs " + ' '.join(self.submitted_ids)
1449 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1450
1451 jobstatus = {}
1452 for line in status.stdout:
1453 line = line.strip()
1454 if 'JOBID' in line:
1455 continue
1456 splitline = line.split()
1457 id = splitline[0]
1458 if id not in self.submitted_ids:
1459 continue
1460 jobstatus[id] = splitline[2]
1461
1462 idle, run, fail = 0, 0, 0
1463 for id in self.submitted_ids[:]:
1464 if id in jobstatus:
1465 status = jobstatus[id]
1466 else:
1467 status = 'MISSING'
1468 if status == 'RUN':
1469 run += 1
1470 elif status == 'PEND':
1471 idle += 1
1472 else:
1473 status = self.check_termination(id)
1474 if status == 'wait':
1475 run += 1
1476 elif status == 'resubmit':
1477 idle += 1
1478
1479 return idle, run, self.submitted - (idle+run+fail), fail
1480
1481 @multiple_try()
    def remove(self, *args, **opts):
1483 """Clean the jobs on the cluster"""
1484
1485 if not self.submitted_ids:
1486 return
1487 cmd = "bkill %s" % ' '.join(self.submitted_ids)
1488 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1489 self.submitted_ids = []
1490
1492 """Class for dealing with cluster submission on a GE cluster"""
1493
1494 name = 'ge'
1495 job_id = 'JOB_ID'
1496 idle_tag = ['qw']
1497 running_tag = ['r']
1498
1499 @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
1502 """Submit a job prog to a GE cluster"""
1503
1504 text = ""
1505 if cwd is None:
1506 cwd = os.getcwd()
1507 else:
1508 text = " cd %s; bash " % cwd
1509 if stdout is None:
1510 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1])
1511 if stderr is None:
1512 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1])
1513 elif stderr == -2:
1514 stderr = stdout
1515 if log is None:
1516 log = '/dev/null'
1517
1518 text += prog
1519 if argument:
1520 text += ' ' + ' '.join(argument)
1521 text += '\n'
1522 tmp_submit = os.path.join(cwd, 'tmp_submit')
1523 open(tmp_submit,'w').write(text)
1524
1525 a = misc.Popen(['qsub','-o', stdout,
1526 '-e', stderr,
1527 tmp_submit],
1528 stdout=subprocess.PIPE,
1529 stderr=subprocess.STDOUT,
1530 stdin=subprocess.PIPE, cwd=cwd)
1531
1532 output = a.communicate()[0]
1533
1534 pat = re.compile("Your job (\d*) \(",re.MULTILINE)
1535 try:
1536 id = pat.search(output).groups()[0]
1537 except:
1538 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1539 % output
1540 self.submitted += 1
1541 self.submitted_ids.append(id)
1542 return id
1543
    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1547 cmd = 'qstat | grep '+str(id)
1548 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1549 if not status:
1550 return 'F'
1551
1552 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s")
1553 stat = ''
1554 for line in status.stdout.read().split('\n'):
1555 if not line:
1556 continue
1557 line = line.strip()
1558 try:
1559 groups = pat.search(line).groups()
1560 except:
1561 raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line
1562 if groups[0] != id: continue
1563 stat = groups[1]
1564 if not stat:
1565 return 'F'
1566 if stat in self.idle_tag:
1567 return 'I'
1568 if stat in self.running_tag:
1569 return 'R'
1570
    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
1574 if not self.submitted_ids:
1575 return 0, 0, 0, 0
1576 idle, run, fail = 0, 0, 0
1577 ongoing = []
1578 for statusflag in ['p', 'r', 'sh']:
1579 cmd = 'qstat -s %s' % statusflag
1580 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1581
1582 pat = re.compile("^(\d+)")
1583 for line in status.stdout.read().split('\n'):
1584 line = line.strip()
1585 try:
1586 id = pat.search(line).groups()[0]
1587 except Exception:
1588 pass
1589 else:
1590 if id not in self.submitted_ids:
1591 continue
1592 ongoing.append(id)
1593 if statusflag == 'p':
1594 idle += 1
1595 if statusflag == 'r':
1596 run += 1
1597 if statusflag == 'sh':
1598 fail += 1
1599 for id in list(self.submitted_ids):
1600 if id not in ongoing:
1601 self.check_termination(id)
1602
1603
1604 return idle, run, self.submitted - idle - run - fail, fail
1605
1606 @multiple_try()
    def remove(self, *args, **opts):
1608 """Clean the jobs on the cluster"""
1609
1610 if not self.submitted_ids:
1611 return
1612 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1613 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1614 self.submitted_ids = []
1615
1617 """start a computation and not wait for it to finish.
1618 this fonction returns a lock which is locked as long as the job is
1619 running."""
1620
1621 mc = MultiCore(1)
1622 mc.submit(exe, argument, cwd, stdout, **opt)
1623 mc.need_waiting = True
1624 return mc.lock
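
# Usage sketch (illustrative, not from the original code):
#
#     lock = asyncrone_launch('/bin/sleep', argument=['10'])
#     ...                      # do other work while the job runs
#     lock.wait()              # the underlying threading.Event is set when done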
1625
1628 """Basic class for dealing with cluster submission"""
1629
1630 name = 'slurm'
1631 job_id = 'SLURM_JOBID'
1632 idle_tag = ['Q','PD','S','CF']
1633 running_tag = ['R', 'CG']
1634 complete_tag = ['C']
1635 identifier_length = 8
1636
1637 @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
1640 """Submit a job prog to a SLURM cluster"""
1641
1642 me_dir = self.get_jobs_identifier(cwd, prog)
1643
1644
1645 if cwd is None:
1646 cwd = os.getcwd()
1647 if stdout is None:
1648 stdout = '/dev/null'
1649 if stderr is None:
1650 stderr = '/dev/null'
1651 elif stderr == -2:
1652 stderr = stdout
1653 if log is None:
1654 log = '/dev/null'
1655
1656 command = ['sbatch', '-o', stdout,
1657 '-J', me_dir,
1658 '-e', stderr, prog] + argument
1659
1660 if self.cluster_queue and self.cluster_queue != 'None':
1661 command.insert(1, '-p')
1662 command.insert(2, self.cluster_queue)
1663
1664 a = misc.Popen(command, stdout=subprocess.PIPE,
1665 stderr=subprocess.STDOUT,
1666 stdin=subprocess.PIPE, cwd=cwd)
1667
1668 output = a.communicate()
1669 output_arr = output[0].split(' ')
1670 id = output_arr[3].rstrip()
1671
1672 if not id.isdigit():
1673 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1674 % (output[0] + '\n' + output[1])
1675
1676 self.submitted += 1
1677 self.submitted_ids.append(id)
1678 return id
1679
    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        cmd = 'squeue -j '+str(id)
1684 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1685 stderr=open(os.devnull,'w'))
1686
1687 for line in status.stdout:
1688 line = line.strip()
1689 if 'Invalid' in line:
1690 return 'F'
1691 elif line.startswith(str(id)):
1692 status = line.split()[4]
1693 if status in self.idle_tag:
1694 return 'I'
1695 elif status in self.running_tag:
1696 return 'R'
1697 return 'F'
1698
    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
1702 cmd = "squeue"
1703 pstatus = misc.Popen([cmd], stdout=subprocess.PIPE)
1704
1705 me_dir = self.get_jobs_identifier(me_dir)
1706
1707 idle, run, fail = 0, 0, 0
1708 ongoing=[]
1709 for line in pstatus.stdout:
1710 if me_dir in line:
1711 id, _, _,_ , status,_ = line.split(None,5)
1712 ongoing.append(id)
1713 if status in self.idle_tag:
1714 idle += 1
1715 elif status in self.running_tag:
1716 run += 1
1717 elif status in self.complete_tag:
1718 status = self.check_termination(id)
1719 if status == 'wait':
1720 run += 1
1721 elif status == 'resubmit':
1722 idle += 1
1723 else:
1724 fail += 1
1725
1726
1727 for id in list(self.submitted_ids):
1728 if id not in ongoing:
1729 status = self.check_termination(id)
1730 if status == 'wait':
1731 run += 1
1732 elif status == 'resubmit':
1733 idle += 1
1734
1735
1736 return idle, run, self.submitted - (idle+run+fail), fail
1737
1738 @multiple_try()
    def remove(self, *args, **opts):
1740 """Clean the jobs on the cluster"""
1741
1742 if not self.submitted_ids:
1743 return
1744 cmd = "scancel %s" % ' '.join(self.submitted_ids)
1745 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1746 self.submitted_ids = []
1747
1749 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """
1750
1751 name= 'htcaas'
1752 job_id = 'HTCAAS_JOBID'
1753 idle_tag = ['waiting']
1754 running_tag = ['preparing','running']
1755 complete_tag = ['done']
1756
1757 @store_input()
1758 @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the HTCaaS job on the cluster with NO SHARED DISK.
        Input/output files should be given relative to cwd.
        """
1765
1766 cur_usr = os.getenv('USER')
1767
1768 if cwd is None:
1769 cwd = os.getcwd()
1770
1771 cwd_cp = cwd.rsplit("/",2)
1772
1773 if not stdout is None:
1774 print "stdout: %s" % stdout
1775
1776 if not os.path.exists(prog):
1777 prog = os.path.join(cwd, prog)
1778
1779 if not required_output and output_files:
1780 required_output = output_files
1781
1782 logger.debug(prog)
1783 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog :
1784 cwd_arg = cwd+"/arguments"
1785 temp = ' '.join([str(a) for a in argument])
1786 arg_cmd="echo '"+temp+"' > " + cwd_arg
1787 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)]
1788 if argument :
1789 command.extend(['-a ', '='.join([str(a) for a in argument])])
1790 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1791 id = a.stdout.read().strip()
1792
1793 else:
1794 cwd_arg = cwd+"/arguments"
1795 temp = ' '.join([str(a) for a in argument])
1796 temp_file_name = "sub." + os.path.basename(prog)
1797 text = """#!/bin/bash
1798 MYPWD=%(cwd)s
1799 cd $MYPWD
1800 input_files=(%(input_files)s )
1801 for i in ${input_files[@]}
1802 do
1803 chmod -f +x $i
1804 done
1805 /bin/bash %(prog)s %(arguments)s > %(stdout)s
1806 """
1807 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog,
1808 'arguments': ' '.join([str(a) for a in argument]),
1809 'program': ' ' if '.py' in prog else 'bash'}
1810
1811
1812 new_prog = pjoin(cwd, temp_file_name)
1813 open(new_prog, 'w').write(text % dico)
1814 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
1815 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name]
1816 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1817 id = a.stdout.read().strip()
1818 logger.debug(id)
1819
1820 nb_try=0
1821 nb_limit=5
1822 if not id.isdigit() :
1823 print "[ID is not digit]:" + id
1824
1825 while not id.isdigit() :
1826 nb_try+=1
            print "[fail_retry]: %s" % nb_try
1828 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1829 id = a.stdout.read().strip()
1830 if nb_try > nb_limit :
                raise ClusterManagmentError, 'fail to submit to the HTCaaS cluster: \n %s' % id
1832 break
1833
1834 self.submitted += 1
1835 self.submitted_ids.append(id)
1836
1837 return id
1838
    @multiple_try(nb_try=10, sleep=5)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1842
1843 if id == 0 :
1844 status_out ='C'
1845 else :
1846 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status "
1847 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE,
1848 stderr=subprocess.PIPE)
1849 error = status.stderr.read()
1850 if status.returncode or error:
1851 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error
1852 status_out= status.stdout.read().strip()
1853 status_out= status_out.split(":",1)[1]
1854 if status_out == 'waiting':
1855 status_out='I'
1856 elif status_out == 'preparing' or status_out == 'running':
1857 status_out = 'R'
1858 elif status_out != 'done':
1859 status_out = 'F'
1860 elif status_out == 'done':
1861 status_out = 'C'
1862
1863 return status_out
1864
    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
1868 if not self.submitted_ids:
1869 logger.debug("self.submitted_ids not exists")
1870 return 0, 0, 0, 0
1871
1872 ongoing = []
1873 idle, run, fail = 0, 0, 0
1874
1875 start = self.submitted_ids[0]
1876 end = self.submitted_ids[-1]
1877
1878 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end)
1879 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1880
1881 for line in status.stdout:
1882
1883 status2 = line.split()[-1]
            if status2 != 'null' or line.split()[0].strip() != '0':
                ongoing.append(line.split()[0].strip())
                logger.debug("["+line.split()[0].strip()+"]"+status2)
            if status2 == 'null' or line.split()[0].strip() == '0':
1888 idle += 1
1889 elif status2 in self.idle_tag:
1890 idle += 1
1891 elif status2 in self.running_tag:
1892 run += 1
1893 elif status2 in self.complete_tag:
1894 if not self.check_termination(line.split()[0]):
1895 idle +=1
1896 else:
1897 fail += 1
1898
1899 return idle, run, self.submitted - (idle+run+fail), fail
1900
1901 @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""
1904
1905 if not self.submitted_ids:
1906 return
1907 for i in range(len(self.submitted_ids)):
1908 cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i]
1909 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1910
1912 """Class for dealing with cluster submission on a HTCaaS cluster without GPFS """
1913
1914 name= 'htcaas2'
1915 job_id = 'HTCAAS2_JOBID'
1916 idle_tag = ['waiting']
1917 running_tag = ['preparing','running']
1918 complete_tag = ['done']
1919
1920 @store_input()
1921 @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the HTCaaS job on the cluster with NO SHARED DISK.
        Input/output files should be given relative to cwd.
        """
1929 if cwd is None:
1930 cwd = os.getcwd()
1931
1932 if not os.path.exists(prog):
1933 prog = os.path.join(cwd, prog)
1934
1935 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog :
1936 if cwd or prog :
1937 self.submitted_dirs.append(cwd)
1938 self.submitted_exes.append(prog)
1939 else:
1940 logger.debug("cwd and prog not exist->"+cwd+" / "+ os.path.basename(prog))
1941
1942 if argument :
1943 self.submitted_args.append('='.join([str(a) for a in argument]))
1944
1945 if cwd or prog :
1946 self.submitted += 1
1947 id = self.submitted
1948 self.submitted_ids.append(id)
1949 else:
1950 logger.debug("cwd and prog are not exist! ")
1951 id = 0
1952
1953 else:
1954 temp_file_name = "sub."+ os.path.basename(prog)
1955 text = """#!/bin/bash
1956 MYPWD=%(cwd)s
1957 cd $MYPWD
1958 input_files=(%(input_files)s )
1959 for i in ${input_files[@]}
1960 do
1961 chmod -f +x $i
1962 done
1963 /bin/bash %(prog)s %(arguments)s > %(stdout)s
1964 """
1965 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog,
1966 'arguments': ' '.join([str(a) for a in argument]),
1967 'program': ' ' if '.py' in prog else 'bash'}
1968
1969 new_prog = pjoin(cwd, temp_file_name)
1970 open(new_prog, 'w').write(text % dico)
1971 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
1972 command = ['htcaas-mgjob-submit','-d',cwd,'-e',new_prog]
1973 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1974 id = a.stdout.read().strip()
1975 logger.debug("[mode2]-["+str(id)+"]")
1976 if cwd and prog :
1977 self.submitted += 1
1978 self.submitted_ids.append(id)
1979 else:
1980 logger.debug("cwd and prog are not exist! ")
1981 id = 0
1982
1983 return id
1984
    @multiple_try(nb_try=10, sleep=5)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
2034
2035 if self.submitted == self.submitted_ids[-1] :
2036 id = self.metasubmit(self)
2037 tempid = self.submitted_ids[-1]
2038 self.submitted_ids.remove(self.submitted_ids[-1])
2039 self.submitted_ids.append(id)
2040 logger.debug(str(id)+" // "+str(self.submitted_ids[-1]))
2041
2042 if id == 0 :
2043 status_out ='C'
2044 else:
2045 cmd = 'htcaas-job-status -m '+ str(id) + " -s | grep Status "
2046 status = misc.Popen([cmd],shell=True,stdout=subprocess.PIPE,
2047 stderr=subprocess.PIPE)
2048 error = status.stderr.read()
2049 if status.returncode or error:
2050 raise ClusterManagmentError, 'htcaas-job-status returns error: %s' % error
2051 status_out= status.stdout.read().strip()
2052 status_out= status_out.split(":",1)[1]
2053 logger.debug("[["+str(id)+"]]"+status_out)
2054 if status_out == 'waiting':
2055 status_out='I'
2056 elif status_out == 'preparing' or status_out == 'running':
2057 status_out = 'R'
2058 elif status_out != 'done':
2059 status_out = 'F'
2060 elif status_out == 'done':
2061 status_out = 'C'
2062 self.submitted -= 1
2063
2064 return status_out
2065
    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
2069 if not self.submitted_ids:
2070 logger.debug("self.submitted_ids not exists")
2071 return 0, 0, 0, 0
2072
2073 if "//" in me_dir :
2074 if int(me_dir.split("//")[0]) < int(me_dir.split("//")[1]) :
2075 start = me_dir.split("//")[0]
2076 end = me_dir.split("//")[1]
2077 else :
2078 start = me_dir.split("//")[1]
2079 end = me_dir.split("//")[0]
2080 elif "/" in me_dir :
2081 start = 0
2082 end = 0
2083 elif me_dir.isdigit():
2084 start = me_dir
2085 end = me_dir
        elif not me_dir.isdigit():
            me_dir = self.submitted_ids[0]
            start = end = me_dir
            logger.debug("Meta_ID is not digit (control), using self.submitted_ids[0]: "+str(me_dir))
2089
2090 ongoing = []
2091 idle, run, fail, done = 0, 0, 0, 0
2092
2093 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end) +" -ac"
2094 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
2095
2096 for line in status.stdout:
2097 status2 = line.split()[-1]
            if status2 != 'null' or line.split()[0].strip() != '0':
                ongoing.append(str(line.split()[0].strip())+"-"+str(line.split()[1].strip()))
                logger.debug("["+line.split()[0].strip()+"-"+line.split()[1].strip()+"]"+status2)

            if status2 == 'null' or line.split()[0].strip() == '0':
2103 idle += 1
2104 elif status2 in self.idle_tag:
2105 idle += 1
2106 elif status2 in self.running_tag:
2107 run += 1
2108 elif status2 in self.complete_tag:
2109 done += 1
2110 self.submitted -= 1
2111 if not self.check_termination(line.split()[1]):
2112 idle +=1
2113 else:
2114 fail += 1
2115
2116 return idle, run, self.submitted - (idle+run+fail), fail
2117
2118 @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""
2121
2122 if not self.submitted_ids:
2123 return
2124 id = self.submitted_ids[0]
        if id != 0:
2126 cmd = "htcaas-job-cancel -m %s" % str(id)
2127 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
2128
2129 from_name = {'condor':CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
2130 'lsf': LSFCluster, 'ge':GECluster, 'slurm': SLURMCluster,
2131 'htcaas':HTCaaSCluster, 'htcaas2':HTCaaS2Cluster}
2132
2133 onecore=MultiCore(1)
2134
2135