import subprocess
import logging
import os
import time
import re
import glob
import inspect
import sys

logger = logging.getLogger('madgraph.cluster')

try:
    from madgraph import MadGraph5Error
    import madgraph.various.misc as misc
except Exception, error:
    if __debug__:
        print str(error)
    from internal import MadGraph5Error
    import internal.misc as misc

pjoin = os.path.join

# error classes used by the cluster handlers below
class ClusterManagmentError(MadGraph5Error):
    pass

class NotImplemented(MadGraph5Error):
    pass


multiple_try = misc.multiple_try

def check_interupt(error=KeyboardInterrupt):
    """Decorator which removes the jobs from the cluster if the wrapped
    call is interrupted."""
    def deco_interupt(f):
        def deco_f_interupt(self, *args, **opt):
            try:
                return f(self, *args, **opt)
            except error:
                try:
                    self.remove(*args, **opt)
                except Exception:
                    pass
                raise error
        return deco_f_interupt
    return deco_interupt

def store_input(arg=''):
    """Decorator which records the arguments of a submission call in
    self.retry_args so that check_termination can resubmit the job
    identically. (the original body of this decorator was not preserved;
    the implementation below is a minimal reconstruction)"""
    def deco_store(f):
        def deco_f_store(self, *args, **opts):
            # map the call back to the parameter names of the wrapped submit
            # function so that it can be replayed later via submit2(**args)
            names, _, _, defaults = inspect.getargspec(f)
            stored = dict(zip(names[-len(defaults):], defaults)) if defaults else {}
            stored.update(zip(names[1:], args))  # skip 'self'
            stored.update(opts)
            id = f(self, *args, **opts)
            if id:
                self.retry_args[id] = stored
            return id
        return deco_f_store
    return deco_store

def need_transfer(options):
    """Check whether input files need to be transferred to a scratch disk
    given the current run options."""

    if options['run_mode'] != 1 and options['cluster_temp_path'] is None:
        return False
    else:
        return True
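
# Example (hedged sketch of the logic above): the transfer is only skipped
# when we are not in cluster mode (run_mode 1) and no scratch path is set.
#
#   need_transfer({'run_mode': 2, 'cluster_temp_path': None})   # -> False
#   need_transfer({'run_mode': 2, 'cluster_temp_path': '/tmp'}) # -> True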


class Cluster(object):
    """Basic class for all cluster-type submission"""
    name = 'mother class'
    identifier_length = 14

    def __init__(self, *args, **opts):
        """Init the cluster"""

        self.submitted = 0
        self.submitted_ids = []
        self.finish = 0
        self.submitted_dirs = []
        self.submitted_exes = []
        self.submitted_args = []

        if 'cluster_queue' in opts:
            self.cluster_queue = opts['cluster_queue']
        else:
            self.cluster_queue = 'madgraph'
        if 'cluster_temp_path' in opts:
            self.temp_dir = opts['cluster_temp_path']
        else:
            self.temp_dir = None
        self.options = {'cluster_status_update': (600, 30)}
        for key, value in opts.items():
            self.options[key] = value
        self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0
        self.cluster_retry_wait = float(opts['cluster_retry_wait']) if 'cluster_retry_wait' in opts else 300
        self.retry_args = {}
        # keep track of which job belongs to which packet of jobs
        self.packet = {}
        self.id_to_packet = {}

    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        """How to make one submission. Return status id on the cluster."""
        raise NotImplemented, 'No implementation of how to submit a job to cluster \'%s\'' % self.name


    @store_input()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """How to make one submission. Return status id on the cluster.
        NO SHARED DISK"""

        if cwd is None:
            cwd = os.getcwd()
        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)

        if not required_output and output_files:
            required_output = output_files

        # nothing to transfer (or no scratch directory): plain submission
        if not hasattr(self, 'temp_dir') or not self.temp_dir or \
           (input_files == [] == output_files):
            return self.submit(prog, argument, cwd, stdout, stderr, log,
                               required_output=required_output, nb_submit=nb_submit)

        temp_file_name = "sub." + os.path.basename(prog) + '.'.join([str(a) for a in argument])

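        # The wrapper below runs on the worker node: it copies the input
        # files into a private scratch directory, runs the job there, then
        # copies the requested output files back to the submission directory
        # before cleaning up.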
        text = """#!/bin/bash
MYTMP=%(tmpdir)s/run$%(job_id)s
MYPWD=%(cwd)s
mkdir -p $MYTMP
cd $MYPWD
input_files=( %(input_files)s )
for i in ${input_files[@]}
do
    cp -R -L $i $MYTMP
done
cd $MYTMP
echo '%(arguments)s' > arguments
chmod +x ./%(script)s
%(program)s ./%(script)s %(arguments)s
exit=$?
output_files=( %(output_files)s )
for i in ${output_files[@]}
do
    cp -r $MYTMP/$i $MYPWD
done
# if [ "$exit" -eq "0" ]
# then
rm -rf $MYTMP
# fi
"""

        dico = {'tmpdir': self.temp_dir, 'script': os.path.basename(prog),
                'cwd': cwd, 'job_id': self.job_id,
                'input_files': ' '.join(input_files + [prog]),
                'output_files': ' '.join(output_files),
                'arguments': ' '.join([str(a) for a in argument]),
                'program': ' ' if '.py' in prog else 'bash'}

        new_prog = pjoin(cwd, temp_file_name)
        open(new_prog, 'w').write(text % dico)
        misc.Popen(['chmod', '+x', new_prog], cwd=cwd)

        return self.submit(new_prog, argument, cwd, stdout, stderr, log,
                           required_output=required_output, nb_submit=nb_submit)


    def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                       log=None, input_files=[], output_files=[], required_output=[],
                       nb_submit=0, packet_member=None):
        """Wrap the cluster submission in a cluster-independent way.
        This method should not be overwritten (except for DAG-type submission)."""

        id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files,
                          output_files, required_output, nb_submit)

        if not packet_member:
            return id
        else:
            if isinstance(packet_member, Packet):
                self.id_to_packet[id] = packet_member
                packet_member.put(id)
                if packet_member.tag not in self.packet:
                    self.packet[packet_member.tag] = packet_member
            else:
                # packet_member is the tag of an already registered packet
                if packet_member in self.packet:
                    packet = self.packet[packet_member]
                    packet.put(id)
                    self.id_to_packet[id] = packet
            return id

    def control(self, me_dir=None):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
        if not self.submitted_ids:
            raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
        idle, run, fail = 0, 0, 0
        for pid in self.submitted_ids[:]:
            status = self.control_one_job(pid)
            if status == 'I':
                idle += 1
            elif status == 'R':
                run += 1
            elif status == 'F':
                self.finish += 1
                self.submitted_ids.remove(pid)
            else:
                fail += 1

        return idle, run, self.finish, fail

    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name

    def get_jobs_identifier(self, path, second_path=None):
        """Get a unique name for all the jobs of a run; this helps to
        identify the run in the controller of some clusters."""

        if second_path:
            path = os.path.realpath(pjoin(path, second_path))
        elif not os.path.exists(path):
            return path

        if 'SubProcesses' in path:
            target = path.rsplit('/SubProcesses', 1)[0]
        elif 'MCatNLO' in path:
            target = path.rsplit('/MCatNLO', 1)[0]
        elif second_path:
            target = path
            logger.warning("cluster.get_jobs_identifier runs unexpectedly. This should be fine but report this message if you have a problem.")
        else:
            target = path

        if target.endswith('/'):
            target = target[:-1]

        # hash the path and keep the last identifier_length characters; batch
        # systems usually require the job name to start with a letter
        target = misc.digest(target)[-self.identifier_length:]
        if not target[0].isalpha():
            target = 'a' + target[1:]

        return target


    @check_interupt()
    def wait(self, me_dir, fct, minimal_job=0, update_first=None):
        """Wait for all jobs to finish.
        If minimal_job is set, return as soon as idle + run drops below that
        number."""

        mode = 1  # 1: short sleep between status checks, 0: long sleep
        nb_iter = 0
        nb_short = 0
        change_at = 5  # number of iterations before considering the long sleep

        if update_first:
            idle, run, finish, fail = self.control(me_dir)
            update_first(idle, run, finish)

        longtime, shorttime = self.options['cluster_status_update']

        nb_job = 0

        if self.options['cluster_type'] == 'htcaas2':
            me_dir = self.metasubmit(self)

        while 1:
            old_mode = mode
            nb_iter += 1
            idle, run, finish, fail = self.control(me_dir)
            if nb_job:
                if idle + run + finish + fail != nb_job:
                    nb_job = idle + run + finish + fail
                    nb_iter = 1
            else:
                nb_job = idle + run + finish + fail
            if fail:
                raise ClusterManagmentError('Some jobs are in a Hold/... state. Please try to investigate or contact the IT team')
            if idle + run == 0:
                logger.info('All jobs finished')
                fct(idle, run, finish)
                break
            if idle + run < minimal_job:
                return
            fct(idle, run, finish)

            # decide whether to check the status often (mode 1) or rarely (mode 0)
            if nb_iter < change_at:
                mode = 1
            elif idle < run:
                if old_mode == 0:
                    if nb_short:
                        mode = 0
                    elif idle:
                        if nb_iter > change_at + int(longtime) // shorttime:
                            mode = 0
                        else:
                            mode = 1
                            nb_short = 0
                    else:
                        mode = 1
                        nb_short = 0
                elif old_mode == 1:
                    nb_short += 1
                    if nb_short > 3 * max(change_at, int(longtime) // shorttime):
                        mode = 0
            else:
                mode = 0

            if old_mode > mode:
                logger.info('''Start to wait %ss between checking status.
Note that you can change this time in the configuration file.
Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0])

            if mode == 0:
                try:
                    time.sleep(self.options['cluster_status_update'][0])
                except KeyboardInterrupt:
                    logger.info('start to update the status')
                    nb_iter = min(0, change_at - 2)
                    nb_short = 0
            else:
                time.sleep(self.options['cluster_status_update'][1])

        self.submitted = 0
        self.submitted_ids = []
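
    # Example (hedged sketch): the `fct` callback of wait() receives the
    # current counters; a minimal status printer could look like:
    #
    #   def print_status(idle, run, finish):
    #       logger.info('%s idle, %s running, %s finished' % (idle, run, finish))
    #
    #   cluster.wait(me_dir, print_status)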

    def check_termination(self, job_id):
        """Check the termination of the job with job_id and relaunch it if needed."""

        if job_id not in self.retry_args:
            if job_id in self.id_to_packet:
                nb_in_packet = self.id_to_packet[job_id].remove_one()
                if nb_in_packet == 0:
                    # packet done: run the associated function
                    packet = self.id_to_packet[job_id]
                    # fully ensure that the packet is finished (thread safety)
                    packet.queue.join()
                    packet.fct(*packet.args)
                del self.id_to_packet[job_id]
                return 'resubmit'
            else:
                return True

        args = self.retry_args[job_id]
        if 'time_check' in args:
            time_check = args['time_check']
        else:
            time_check = 0

        for path in args['required_output']:
            if args['cwd']:
                path = pjoin(args['cwd'], path)
            # the output must exist and be non-empty
            if not (os.path.exists(path) and os.stat(path).st_size != 0):
                break
        else:
            # all required output files are present
            if time_check > 0:
                logger.info('Job %s finally found the missing output.' % (job_id))
            del self.retry_args[job_id]
            self.submitted_ids.remove(job_id)

            if job_id in self.id_to_packet:
                nb_in_packet = self.id_to_packet[job_id].remove_one()
                if nb_in_packet == 0:
                    # packet done: run the associated function
                    packet = self.id_to_packet[job_id]
                    # fully ensure that the packet is finished (thread safety)
                    packet.queue.join()
                    packet.fct(*packet.args)
                del self.id_to_packet[job_id]
                return 'resubmit'

            return 'done'

        if time_check == 0:
            logger.debug('''Job %s: missing output: %s''' % (job_id, path))
            args['time_check'] = time.time()
            return 'wait'
        elif self.cluster_retry_wait > time.time() - time_check:
            return 'wait'

        # the output is still missing after the grace period: resubmit or stop
        if self.nb_retry < 0:
            logger.critical('''Fail to run correctly job %s.
            with option: %s
            file missing: %s''' % (job_id, args, path))
            raw_input('press enter to continue.')
        elif self.nb_retry == 0:
            logger.critical('''Fail to run correctly job %s.
            with option: %s
            file missing: %s.
            Stopping all runs.''' % (job_id, args, path))
            self.remove()
        elif args['nb_submit'] >= self.nb_retry:
            logger.critical('''Fail to run correctly job %s.
            with option: %s
            file missing: %s
            Fails %s times.
            No resubmission.''' % (job_id, args, path, args['nb_submit']))
            self.remove()
        else:
            args['nb_submit'] += 1
            logger.warning('resubmit job (for the %s time)' % args['nb_submit'])
            del self.retry_args[job_id]
            self.submitted_ids.remove(job_id)
            if 'time_check' in args:
                del args['time_check']
            if job_id in self.id_to_packet:
                self.id_to_packet[job_id].remove_one()
                args['packet_member'] = self.id_to_packet[job_id]
                del self.id_to_packet[job_id]
                self.cluster_submit(**args)
            else:
                self.submit2(**args)
            return 'resubmit'
        return 'done'
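
    # check_termination acts as a small state machine: it returns 'done' when
    # all required output is present, 'wait' while the output may still be
    # flushed to disk, and 'resubmit' when the job (or its packet function)
    # was relaunched.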

    @check_interupt()
    def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
                        stderr=None, log=None, required_output=[], nb_submit=0,
                        input_files=[], output_files=[]):
        """Launch one job on the cluster and wait for it."""

        special_output = False
        if stderr == -2 and stdout:
            # stderr is meant to end up in stdout: use a temporary .err file
            # and merge it back once the job is done
            special_output = True
            stderr = stdout + '.err'

        id = self.submit2(prog, argument, cwd, stdout, stderr, log,
                          required_output=required_output, input_files=input_files,
                          output_files=output_files)

        if self.options['cluster_type'] == 'htcaas2':
            if self.submitted == self.submitted_ids[-1]:
                id = self.metasubmit(self)

        # store the call arguments so that check_termination can resubmit
        frame = inspect.currentframe()
        args, _, _, values = inspect.getargvalues(frame)
        args = dict([(i, values[i]) for i in args if i != 'self'])
        self.retry_args[id] = args

        nb_wait = 0
        while 1:
            nb_wait += 1
            status = self.control_one_job(id)
            if status not in ['R', 'I']:
                status = self.check_termination(id)
                if status in ['wait']:
                    time.sleep(30)
                    continue
                elif status in ['resubmit']:
                    id = self.submitted_ids[0]
                    time.sleep(30)
                    continue
                # wait for the last writing on the disk
                time.sleep(30)
                break
            time.sleep(self.options['cluster_status_update'][1])

        if required_output:
            # final check: the job has left the queue, so just verify that
            # the expected output is present
            self.check_termination(id)

        if special_output:
            # combine the stdout and the stderr, giving the stderr a few
            # seconds to appear on a shared file system
            for i in range(5):
                if os.path.exists(stdout):
                    if not os.path.exists(stderr):
                        time.sleep(5)
                    if os.path.exists(stderr):
                        err_text = open(stderr).read()
                        if not err_text:
                            return
                        logger.warning(err_text)
                        text = open(stdout).read()
                        open(stdout, 'w').write(text + err_text)
                    else:
                        return
                time.sleep(10)

    def remove(self, *args, **opts):
        """Dummy method: this cluster type does not support job removal."""
        logger.warning("""This cluster doesn't support job removal,
        the jobs are still running on the cluster.""")


class Packet(object):
    """An object for handling a packet of jobs; designed to be thread safe."""

    def __init__(self, name, fct, args, opts={}):
        import Queue
        import threading
        self.queue = Queue.Queue()
        self.tag = name
        self.fct = fct
        self.args = args
        self.opts = opts
        self.done = threading.Event()

    def put(self, *args, **opts):
        self.queue.put(*args, **opts)

    append = put

    def remove_one(self):
        """Mark one job of the packet as done and return the number of jobs
        still pending. (body reconstructed from its usage above)"""
        self.queue.get(True)
        self.queue.task_done()
        return self.queue.qsize()
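
# Example (hedged sketch): grouping jobs in a packet so that a callback runs
# once all of them are done; `cluster`, `combine_results` and the job names
# are hypothetical.
#
#   packet = Packet('channel1', combine_results, ('channel1',))
#   for job in ['job1.sh', 'job2.sh']:
#       cluster.cluster_submit(job, packet_member=packet)
#   cluster.wait(me_dir, update_status)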


class MultiCore(Cluster):
    """Class for dealing with the submission on multiple cores/nodes."""

    job_id = "$"

    def __init__(self, *args, **opt):
        """Init the cluster"""

        super(MultiCore, self).__init__(*args, **opt)

        import Queue
        import threading
        import thread
        self.queue = Queue.Queue()      # jobs to be executed
        self.done = Queue.Queue()       # jobs which are finished
        self.submitted = Queue.Queue()  # one entry by job submitted
        self.stoprequest = threading.Event()  # used to stop the daemons
        self.demons = []
        self.nb_done = 0
        if 'nb_core' in opt:
            self.nb_core = opt['nb_core']
        elif isinstance(args[0], int):
            self.nb_core = args[0]
        else:
            self.nb_core = 1
        self.update_fct = None

        self.lock = threading.Event()  # set each time a job finishes
        self.pids = Queue.Queue()      # pids of the started jobs
        self.done_pid = []             # pids of the finished jobs
        self.done_pid_queue = Queue.Queue()
        self.fail_msg = None

        # start the daemon workers
        for _ in range(self.nb_core):
            self.start_demon()

    def start_demon(self):
        """Start one daemon thread running self.worker."""
        import threading
        t = threading.Thread(target=self.worker)
        t.daemon = True
        t.start()
        self.demons.append(t)

    def worker(self):
        import Queue
        import thread
        while not self.stoprequest.isSet():
            try:
                args = self.queue.get()
                tag, exe, arg, opt = args
                try:
                    # check whether the job is an executable or a Python function
                    if isinstance(exe, str):
                        if os.path.exists(exe) and not exe.startswith('/'):
                            exe = './' + exe
                        if opt['stderr'] is None:
                            opt['stderr'] = subprocess.STDOUT
                        proc = misc.Popen([exe] + arg, **opt)
                        pid = proc.pid
                        self.pids.put(pid)
                        proc.wait()
                        if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet():
                            fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \
                                       (' '.join([exe] + arg), proc.returncode)
                            logger.warning(fail_msg)
                            self.stoprequest.set()
                            self.remove(fail_msg)
                    else:
                        # python function: use the tag in place of a process id
                        pid = tag
                        self.pids.put(pid)
                        returncode = exe(*arg, **opt)
                        if returncode != 0:
                            logger.warning("fct %s does not return 0. Stopping the code in a clean way. The error was:\n%s", exe, returncode)
                            self.stoprequest.set()
                            self.remove("fct %s does not return 0:\n %s" % (exe, returncode))
                except Exception, error:
                    self.fail_msg = sys.exc_info()
                    logger.warning(str(error))
                    self.stoprequest.set()
                    self.remove(error)

                    if __debug__:
                        raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]

                self.queue.task_done()
                self.done.put(tag)
                self.done_pid_queue.put(pid)

                try:
                    self.lock.set()
                except thread.error:
                    continue
            except Queue.Empty:
                continue

    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        """Submit a job on a multicore machine."""

        tag = (prog, tuple(argument), cwd, nb_submit)
        if isinstance(prog, str):
            opt = {'cwd': cwd,
                   'stdout': stdout,
                   'stderr': stderr}
            self.queue.put((tag, prog, argument, opt))
            self.submitted.put(1)
            return tag
        else:
            # python function
            self.queue.put((tag, prog, argument, {}))
            self.submitted.put(1)
            return tag
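
    # Example (hedged sketch): MultiCore accepts either an executable path
    # or a Python callable:
    #
    #   cluster = MultiCore(2)
    #   cluster.submit('./run.sh', ['arg1'])     # shell job
    #   cluster.submit(my_function, [x, y])      # python function job
    #   cluster.wait(None, lambda i, r, d: None)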

    def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
                        stderr=None, log=None, **opts):
        """Launch one job and wait for it."""
        if isinstance(stdout, str):
            stdout = open(stdout, 'w')
        if isinstance(stderr, str):
            stderr = open(stderr, 'w')
        return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)

    def remove(self, error=None):
        """Ensure that all threads are killed."""

        # suppress all the new jobs
        self.stoprequest.set()
        if error and not self.fail_msg:
            self.fail_msg = error

        # collect the pids of the jobs already finished
        while not self.done_pid_queue.empty():
            pid = self.done_pid_queue.get()
            self.done_pid.append(pid)

        # kill the remaining jobs (and their children)
        while not self.pids.empty():
            pid = self.pids.get()
            self.pids.task_done()
            if isinstance(pid, tuple):
                continue
            if pid in self.done_pid:
                continue
            out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \
                            % {'pid': pid})
            out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid': pid})

    def wait(self, me_dir, update_status, update_first=None):
        """Wait until all the jobs are done. This function also ensures that
        packet submissions are handled correctly (i.e. it submits the packet
        function once the packet is complete)."""

        import Queue
        import threading

        try:
            last_status = (0, 0, 0)
            sleep_time = 1
            use_lock = True
            first = True
            while True:
                force_one_more_loop = False  # a packet function may be submitted

                # loop over the finished jobs and handle the packets
                while self.done.qsize():
                    try:
                        tag = self.done.get(True, 1)
                    except Queue.Empty:
                        pass
                    else:
                        if self.id_to_packet and tuple(tag) in self.id_to_packet:
                            packet = self.id_to_packet[tuple(tag)]
                            remaining = packet.remove_one()
                            if remaining == 0:
                                # fully ensure that the packet is finished
                                packet.queue.join()
                                self.submit(packet.fct, packet.args)
                                force_one_more_loop = True
                        self.nb_done += 1
                        self.done.task_done()

                # get the current status
                Idle = self.queue.qsize()
                Done = self.nb_done + self.done.qsize()
                Running = max(0, self.submitted.qsize() - Idle - Done)

                if Idle + Running <= 0 and not force_one_more_loop:
                    update_status(Idle, Running, Done)
                    # everything is done: fully ensure it before quitting
                    self.queue.join()
                    break

                if (Idle, Running, Done) != last_status:
                    if first and update_first:
                        update_first(Idle, Running, Done)
                        first = False
                    else:
                        update_status(Idle, Running, Done)
                    last_status = (Idle, Running, Done)

                # flush the queue of finished pids
                while not self.done_pid_queue.empty():
                    pid = self.done_pid_queue.get()
                    self.done_pid.append(pid)
                    self.done_pid_queue.task_done()

                # sleep until a job finishes (lock) or for a fixed time
                if use_lock:
                    use_lock = self.lock.wait(300)
                    self.lock.clear()
                    if not use_lock and Idle > 0:
                        use_lock = True
                else:
                    time.sleep(sleep_time)
                    sleep_time = min(sleep_time + 2, 180)
            if update_first:
                update_first(Idle, Running, Done)

            if self.stoprequest.isSet():
                if isinstance(self.fail_msg, Exception):
                    raise self.fail_msg
                elif isinstance(self.fail_msg, str):
                    raise Exception, self.fail_msg
                else:
                    raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]

            # reset the variables for a potential second submission
            try:
                self.lock.clear()
            except Exception:
                pass
            self.done = Queue.Queue()
            self.done_pid = []
            self.done_pid_queue = Queue.Queue()
            self.nb_done = 0
            self.submitted = Queue.Queue()
            self.pids = Queue.Queue()
            self.stoprequest.clear()

        except KeyboardInterrupt:
            # forward the stored failure if any, otherwise re-raise
            if isinstance(self.fail_msg, Exception):
                raise self.fail_msg
            elif isinstance(self.fail_msg, str):
                raise Exception, self.fail_msg
            elif self.fail_msg:
                raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]
            raise


class CondorCluster(Cluster):
    """Basic class for dealing with cluster submission"""

    name = 'condor'
    job_id = 'CONDOR_ID'

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a Condor cluster."""

        text = """Executable = %(prog)s
output = %(stdout)s
error = %(stderr)s
log = %(log)s
%(argument)s
environment = CONDOR_ID=$(Cluster).$(Process)
Universe = vanilla
notification = Error
Initialdir = %(cwd)s
%(requirement)s
getenv=True
queue 1
"""

        if self.cluster_queue not in ['None', None]:
            requirement = 'Requirements = %s=?=True' % self.cluster_queue
        else:
            requirement = ''

        if cwd is None:
            cwd = os.getcwd()
        if stdout is None:
            stdout = '/dev/null'
        if stderr is None:
            stderr = '/dev/null'
        if log is None:
            log = '/dev/null'
        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)
        if argument:
            argument = 'Arguments = %s' % ' '.join(argument)
        else:
            argument = ''

        dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
                'stderr': stderr, 'log': log, 'argument': argument,
                'requirement': requirement}

        a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE,
                       stdin=subprocess.PIPE)
        output, _ = a.communicate(text % dico)

        pat = re.compile("submitted to cluster (\d*)", re.MULTILINE)
        try:
            id = pat.search(output).groups()[0]
        except:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                  % output
        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @store_input()
    @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the job on the cluster. NO SHARED DISK:
        input/output files should be given relative to cwd.
        """

        if not required_output and output_files:
            required_output = output_files

        if (input_files == [] == output_files):
            # no file to transfer: fall back on the standard submission
            return self.submit(prog, argument, cwd, stdout, stderr, log,
                               required_output=required_output, nb_submit=nb_submit)

        text = """Executable = %(prog)s
output = %(stdout)s
error = %(stderr)s
log = %(log)s
%(argument)s
should_transfer_files = YES
when_to_transfer_output = ON_EXIT
transfer_input_files = %(input_files)s
%(output_files)s
Universe = vanilla
notification = Error
Initialdir = %(cwd)s
%(requirement)s
getenv=True
queue 1
"""

        if self.cluster_queue not in ['None', None]:
            requirement = 'Requirements = %s=?=True' % self.cluster_queue
        else:
            requirement = ''

        if cwd is None:
            cwd = os.getcwd()
        if stdout is None:
            stdout = '/dev/null'
        if stderr is None:
            stderr = '/dev/null'
        if log is None:
            log = '/dev/null'
        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)
        if argument:
            argument = 'Arguments = %s' % ' '.join([str(a) for a in argument])
        else:
            argument = ''

        if input_files:
            input_files = ','.join(input_files)
        else:
            input_files = ''
        if output_files:
            output_files = 'transfer_output_files = %s' % ','.join(output_files)
        else:
            output_files = ''

        dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
                'stderr': stderr, 'log': log, 'argument': argument,
                'requirement': requirement, 'input_files': input_files,
                'output_files': output_files}

        a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE,
                             stdin=subprocess.PIPE)
        output, _ = a.communicate(text % dico)

        pat = re.compile("submitted to cluster (\d*)", re.MULTILINE)
        try:
            id = pat.search(output).groups()[0]
        except:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                  % output
        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try(nb_try=10, sleep=10)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)

        error = status.stderr.read()
        if status.returncode or error:
            raise ClusterManagmentError, 'condor_q returns error: %s' % error

        return status.stdout.readline().strip()

    @check_interupt()
    @multiple_try(nb_try=10, sleep=10)
    def control(self, me_dir=None):
        """Check the status of all submitted jobs. Return (idle, run, finish, fail)."""

        if not self.submitted_ids:
            return 0, 0, 0, 0

        # split the query in packets to avoid too long command lines
        packet = 15000
        idle, run, fail = 0, 0, 0
        ongoing = []
        for i in range(1 + (len(self.submitted_ids) - 1) // packet):
            start = i * packet
            stop = (i + 1) * packet
            cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \
                  " -format \'%-2s \' \'ClusterId\' " + \
                  " -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"

            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
            error = status.stderr.read()
            if status.returncode or error:
                raise ClusterManagmentError, 'condor_q returns error: %s' % error

            for line in status.stdout:
                id, status = line.strip().split()
                ongoing.append(int(id))
                if status in ['I', 'U']:
                    idle += 1
                elif status == 'R':
                    run += 1
                elif status != 'C':
                    fail += 1

        for id in list(self.submitted_ids):
            if int(id) not in ongoing:
                status = self.check_termination(id)
                if status == 'wait':
                    run += 1
                elif status == 'resubmit':
                    idle += 1

        return idle, run, self.submitted - (idle+run+fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster."""

        if not self.submitted_ids:
            return
        cmd = "condor_rm %s" % ' '.join(self.submitted_ids)

        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []


class PBSCluster(Cluster):
    """Basic class for dealing with cluster submission"""

    name = 'pbs'
    job_id = 'PBS_JOBID'
    idle_tag = ['Q']
    running_tag = ['T', 'E', 'R']
    complete_tag = ['C']

    maximum_submited_jobs = 2500

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a PBS cluster."""

        me_dir = self.get_jobs_identifier(cwd, prog)

        if len(self.submitted_ids) > self.maximum_submited_jobs:
            fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish))
            self.wait(me_dir, fct, self.maximum_submited_jobs)

        text = ""
        if cwd is None:
            cwd = os.getcwd()
        else:
            text = " cd %s;" % cwd
        if stdout is None:
            stdout = '/dev/null'
        if stderr is None:
            stderr = '/dev/null'
        elif stderr == -2:  # -2 is the value for stderr=stdout
            stderr = stdout
        if log is None:
            log = '/dev/null'

        if not os.path.isabs(prog):
            text += "./%s" % prog
        else:
            text += prog

        if argument:
            text += ' ' + ' '.join(argument)

        command = ['qsub', '-o', stdout,
                   '-N', me_dir,
                   '-e', stderr,
                   '-V']

        if self.cluster_queue and self.cluster_queue != 'None':
            command.extend(['-q', self.cluster_queue])

        a = misc.Popen(command, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate(text)[0]
        id = output.split('.')[0]
        if not id.isdigit() or a.returncode != 0:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                  % output

        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        cmd = 'qstat ' + str(id)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)

        jobstatus = ""
        for line in status.stdout:
            line = line.strip()
            if 'cannot connect to server' in line or 'cannot read reply' in line:
                raise ClusterManagmentError, 'server disconnected'
            if 'Unknown' in line:
                return 'F'
            elif line.startswith(str(id)):
                jobstatus = line.split()[4]

        if status.returncode != 0 and status.returncode is not None:
            raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode
        if jobstatus in self.idle_tag:
            return 'I'
        elif jobstatus in self.running_tag:
            return 'R'
        return 'F'

    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
        cmd = "qstat"
        status = misc.Popen([cmd], stdout=subprocess.PIPE)

        me_dir = self.get_jobs_identifier(me_dir)

        ongoing = []

        idle, run, fail = 0, 0, 0
        for line in status.stdout:
            if 'cannot connect to server' in line or 'cannot read reply' in line:
                raise ClusterManagmentError, 'server disconnected'
            if me_dir in line:
                ongoing.append(line.split()[0].split('.')[0])
                status2 = line.split()[4]
                if status2 in self.idle_tag:
                    idle += 1
                elif status2 in self.running_tag:
                    run += 1
                elif status2 in self.complete_tag:
                    if not self.check_termination(line.split()[0].split('.')[0]):
                        idle += 1
                else:
                    fail += 1

        if status.returncode != 0 and status.returncode is not None:
            raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode

        for id in list(self.submitted_ids):
            if id not in ongoing:
                status2 = self.check_termination(id)
                if status2 == 'wait':
                    run += 1
                elif status2 == 'resubmit':
                    idle += 1

        return idle, run, self.submitted - (idle+run+fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster."""

        if not self.submitted_ids:
            return
        cmd = "qdel %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []


class SGECluster(Cluster):
    """Basic class for dealing with cluster submission"""

    name = 'sge'
    job_id = 'JOB_ID'
    idle_tag = ['qw', 'hqw', 'hRqw', 'w']
    running_tag = ['r', 't', 'Rr', 'Rt']
    identifier_length = 10

    def def_get_path(self, location):
        """Replace $HOME in a path to avoid path issues."""
        location = os.path.realpath(location)
        homePath = os.getenv("HOME")
        if homePath:
            location = location.replace(homePath, '$HOME')
        return location

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to an SGE cluster."""

        me_dir = self.get_jobs_identifier(cwd, prog)

        if cwd is None:
            cwd = self.def_get_path(os.getcwd())
        cwd1 = self.def_get_path(cwd)
        text = " cd %s;" % cwd1
        if stdout is None:
            stdout = '/dev/null'
        else:
            stdout = self.def_get_path(stdout)
        if stderr is None:
            stderr = '/dev/null'
        elif stderr == -2:  # -2 is the value for stderr=stdout
            stderr = stdout
        else:
            stderr = self.def_get_path(stderr)

        if log is None:
            log = '/dev/null'
        else:
            log = self.def_get_path(log)

        text += prog
        if argument:
            text += ' ' + ' '.join(argument)

        homePath = os.getenv("HOME")
        if homePath:
            text = text.replace(homePath, '$HOME')

        logger.debug("!=== input %s" % text)
        logger.debug("!=== output %s" % stdout)
        logger.debug("!=== error %s" % stderr)
        logger.debug("!=== logs %s" % log)

        command = ['qsub', '-o', stdout,
                   '-N', me_dir,
                   '-e', stderr,
                   '-V']

        if self.cluster_queue and self.cluster_queue != 'None':
            command.extend(['-q', self.cluster_queue])

        a = misc.Popen(command, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate(text)[0]
        id = output.split(' ')[2]
        if not id.isdigit():
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                  % output
        self.submitted += 1
        self.submitted_ids.append(id)
        logger.debug(output)

        return id

    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""

        cmd = 'qstat '
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
        jobstatus = ''
        for line in status.stdout:
            if str(id) in line:
                jobstatus = line.split()[4]

        if jobstatus in self.idle_tag:
            return 'I'
        elif jobstatus in self.running_tag:
            return 'R'
        return 'F'

    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
        cmd = "qstat "
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

        me_dir = self.get_jobs_identifier(me_dir)

        finished = list(self.submitted_ids)

        idle, run, fail = 0, 0, 0
        for line in status.stdout:
            if me_dir in line:
                id, _, _, _, status2 = line.split()[:5]
                if status2 in self.idle_tag:
                    idle += 1
                    finished.remove(id)
                elif status2 in self.running_tag:
                    run += 1
                    finished.remove(id)
                else:
                    logger.debug(line)
                    fail += 1
                    finished.remove(id)

        for id in finished:
            self.check_termination(id)

        return idle, run, self.submitted - (idle+run+fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster."""

        if not self.submitted_ids:
            return
        cmd = "qdel %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []


class LSFCluster(Cluster):
    """Basic class for dealing with cluster submission"""

    name = 'lsf'
    job_id = 'LSB_JOBID'

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit the job prog to an LSF cluster."""

        me_dir = self.get_jobs_identifier(cwd, prog)

        text = ""
        command = ['bsub', '-C0', '-J', me_dir]
        if cwd is None:
            cwd = os.getcwd()
        else:
            text = " cd %s;" % cwd
        if stdout and isinstance(stdout, str):
            command.extend(['-o', stdout])
        if stderr and isinstance(stderr, str):
            command.extend(['-e', stderr])
        elif stderr == -2:  # -2 is the value for stderr=stdout
            pass
        if log is None:
            log = '/dev/null'

        text += prog
        if argument:
            text += ' ' + ' '.join(argument)

        if self.cluster_queue and self.cluster_queue != 'None':
            command.extend(['-q', self.cluster_queue])

        a = misc.Popen(command, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate(text)[0]

        # extract the id from a line like "Job <123> is submitted ..."
        try:
            id = output.split('>', 1)[0].split('<')[1]
        except:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                  % output
        if not id.isdigit():
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                  % output
        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""

        cmd = 'bjobs ' + str(id)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

        for line in status.stdout:
            line = line.strip().upper()
            if 'JOBID' in line:
                continue
            elif str(id) not in line:
                continue
            status = line.split()[2]
            if status == 'RUN':
                return 'R'
            elif status == 'PEND':
                return 'I'
            elif status == 'DONE':
                return 'F'
            else:
                return 'H'
        return 'F'

    @multiple_try()
    def control(self, me_dir):
        """Check the status of all submitted jobs. Return (idle, run, finish, fail)."""

        if not self.submitted_ids:
            return 0, 0, 0, 0

        cmd = "bjobs " + ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

        jobstatus = {}
        for line in status.stdout:
            line = line.strip()
            if 'JOBID' in line:
                continue
            splitline = line.split()
            id = splitline[0]
            if id not in self.submitted_ids:
                continue
            jobstatus[id] = splitline[2]

        idle, run, fail = 0, 0, 0
        for id in self.submitted_ids[:]:
            if id in jobstatus:
                status = jobstatus[id]
            else:
                status = 'MISSING'
            if status == 'RUN':
                run += 1
            elif status == 'PEND':
                idle += 1
            else:
                status = self.check_termination(id)
                if status == 'wait':
                    run += 1
                elif status == 'resubmit':
                    idle += 1

        return idle, run, self.submitted - (idle+run+fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster."""

        if not self.submitted_ids:
            return
        cmd = "bkill %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []


class GECluster(Cluster):
    """Class for dealing with cluster submission on a GE cluster"""

    name = 'ge'
    job_id = 'JOB_ID'
    idle_tag = ['qw']
    running_tag = ['r']

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a GE cluster."""

        text = ""
        if cwd is None:
            cwd = os.getcwd()
        else:
            text = " cd %s; bash " % cwd
        if stdout is None:
            stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1])
        if stderr is None:
            stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1])
        elif stderr == -2:  # -2 is the value for stderr=stdout
            stderr = stdout
        if log is None:
            log = '/dev/null'

        text += prog
        if argument:
            text += ' ' + ' '.join(argument)
        text += '\n'
        tmp_submit = os.path.join(cwd, 'tmp_submit')
        open(tmp_submit, 'w').write(text)

        a = misc.Popen(['qsub', '-o', stdout,
                        '-e', stderr,
                        tmp_submit],
                       stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate()[0]

        pat = re.compile("Your job (\d*) \(", re.MULTILINE)
        try:
            id = pat.search(output).groups()[0]
        except:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                  % output
        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        cmd = 'qstat | grep ' + str(id)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
        if not status:
            return 'F'

        pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s")
        stat = ''
        for line in status.stdout.read().split('\n'):
            if not line:
                continue
            line = line.strip()
            try:
                groups = pat.search(line).groups()
            except:
                raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line
            if groups[0] != id:
                continue
            stat = groups[1]
        if not stat:
            return 'F'
        if stat in self.idle_tag:
            return 'I'
        if stat in self.running_tag:
            return 'R'

    @multiple_try()
    def control(self, me_dir=None):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
        if not self.submitted_ids:
            return 0, 0, 0, 0
        idle, run, fail = 0, 0, 0
        ongoing = []
        for statusflag in ['p', 'r', 'sh']:
            cmd = 'qstat -s %s' % statusflag
            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

            pat = re.compile("^(\d+)")
            for line in status.stdout.read().split('\n'):
                line = line.strip()
                try:
                    id = pat.search(line).groups()[0]
                except Exception:
                    pass
                else:
                    if id not in self.submitted_ids:
                        continue
                    ongoing.append(id)
                    if statusflag == 'p':
                        idle += 1
                    if statusflag == 'r':
                        run += 1
                    if statusflag == 'sh':
                        fail += 1
        for id in list(self.submitted_ids):
            if id not in ongoing:
                self.check_termination(id)

        return idle, run, self.submitted - idle - run - fail, fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster."""

        if not self.submitted_ids:
            return
        cmd = "qdel %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []


def asyncrone_launch(exe, cwd=None, stdout=None, argument=[], **opt):
    """Start a computation without waiting for it to finish.
    This function returns a lock which stays locked as long as the job is
    running. (the signature is reconstructed from the body below)"""

    mc = MultiCore(1)
    mc.submit(exe, argument, cwd, stdout, **opt)
    mc.need_waiting = True
    return mc.lock
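
# Example (hedged sketch): fire-and-forget launch; the returned lock (a
# threading.Event) can be waited on later. `./collect.sh` is a hypothetical
# script.
#
#   lock = asyncrone_launch('./collect.sh', cwd='/tmp', argument=['arg1'])
#   ... do other work ...
#   lock.wait()  # blocks until the job has finished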


class SLURMCluster(Cluster):
    """Basic class for dealing with cluster submission"""

    name = 'slurm'
    job_id = 'SLURM_JOBID'
    idle_tag = ['Q', 'PD', 'S', 'CF']
    running_tag = ['R', 'CG']
    complete_tag = ['C']
    identifier_length = 8

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a SLURM cluster."""

        me_dir = self.get_jobs_identifier(cwd, prog)

        if cwd is None:
            cwd = os.getcwd()
        if stdout is None:
            stdout = '/dev/null'
        if stderr is None:
            stderr = '/dev/null'
        elif stderr == -2:  # -2 is the value for stderr=stdout
            stderr = stdout
        if log is None:
            log = '/dev/null'

        command = ['sbatch', '-o', stdout,
                   '-J', me_dir,
                   '-e', stderr, prog] + argument

        if self.cluster_queue and self.cluster_queue != 'None':
            command.insert(1, '-p')
            command.insert(2, self.cluster_queue)

        a = misc.Popen(command, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate()
        output_arr = output[0].split(' ')
        id = output_arr[3].rstrip()

        if not id.isdigit():
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                  % output[0]

        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        cmd = 'squeue -j ' + str(id)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                            stderr=open(os.devnull, 'w'))

        for line in status.stdout:
            line = line.strip()
            if 'Invalid' in line:
                return 'F'
            elif line.startswith(str(id)):
                status = line.split()[4]
                if status in self.idle_tag:
                    return 'I'
                elif status in self.running_tag:
                    return 'R'
        return 'F'

    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
        cmd = "squeue"
        pstatus = misc.Popen([cmd], stdout=subprocess.PIPE)

        me_dir = self.get_jobs_identifier(me_dir)

        idle, run, fail = 0, 0, 0
        ongoing = []
        for line in pstatus.stdout:
            if me_dir in line:
                id, _, _, _, status, _ = line.split(None, 5)
                ongoing.append(id)
                if status in self.idle_tag:
                    idle += 1
                elif status in self.running_tag:
                    run += 1
                elif status in self.complete_tag:
                    status = self.check_termination(id)
                    if status == 'wait':
                        run += 1
                    elif status == 'resubmit':
                        idle += 1
                else:
                    fail += 1

        for id in list(self.submitted_ids):
            if id not in ongoing:
                status = self.check_termination(id)
                if status == 'wait':
                    run += 1
                elif status == 'resubmit':
                    idle += 1

        return idle, run, self.submitted - (idle+run+fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster."""

        if not self.submitted_ids:
            return
        cmd = "scancel %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []


class HTCaaSCluster(Cluster):
    """Class for dealing with cluster submission on a HTCaaS cluster using GPFS"""

    name = 'htcaas'
    job_id = 'HTCAAS_JOBID'
    idle_tag = ['waiting']
    running_tag = ['preparing', 'running']
    complete_tag = ['done']

    @store_input()
    @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the HTCaaS job on the cluster. NO SHARED DISK:
        input/output files should be given relative to cwd.
        """

        cur_usr = os.getenv('USER')

        if cwd is None:
            cwd = os.getcwd()

        cwd_cp = cwd.rsplit("/", 2)

        if stdout is not None:
            print "stdout: %s" % stdout

        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)

        if not required_output and output_files:
            required_output = output_files

        logger.debug(prog)
        if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog:
            cwd_arg = cwd + "/arguments"
            temp = ' '.join([str(a) for a in argument])
            arg_cmd = "echo '" + temp + "' > " + cwd_arg
            command = ['htcaas-mgjob-submit', '-d', cwd, '-e', os.path.basename(prog)]
            if argument:
                command.extend(['-a ', '='.join([str(a) for a in argument])])
            a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
            id = a.stdout.read().strip()
        else:
            cwd_arg = cwd + "/arguments"
            temp = ' '.join([str(a) for a in argument])
            temp_file_name = "sub." + os.path.basename(prog)
            text = """#!/bin/bash
MYPWD=%(cwd)s
cd $MYPWD
input_files=( %(input_files)s )
for i in ${input_files[@]}
do
    chmod -f +x $i
done
/bin/bash %(prog)s %(arguments)s > %(stdout)s
"""
            dico = {'cwd': cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog': prog,
                    'arguments': ' '.join([str(a) for a in argument]),
                    'program': ' ' if '.py' in prog else 'bash'}

            new_prog = pjoin(cwd, temp_file_name)
            open(new_prog, 'w').write(text % dico)
            misc.Popen(['chmod', '+x', new_prog], cwd=cwd)
            command = ['htcaas-mgjob-submit', '-d', cwd, '-e', temp_file_name]
            a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
            id = a.stdout.read().strip()
            logger.debug(id)

        nb_try = 0
        nb_limit = 5
        if not id.isdigit():
            print "[ID is not digit]: " + id
        while not id.isdigit():
            nb_try += 1
            print "[fail_retry]: %s" % nb_try
            a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
            id = a.stdout.read().strip()
            if nb_try > nb_limit:
                raise ClusterManagmentError, 'fail to submit to the HTCaaS cluster: \n %s' % id

        self.submitted += 1
        self.submitted_ids.append(id)

        return id

    @multiple_try(nb_try=10, sleep=5)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""

        if id == 0:
            status_out = 'C'
        else:
            cmd = 'htcaas-job-status -m ' + str(id) + " -s | grep Status "
            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
            error = status.stderr.read()
            if status.returncode or error:
                raise ClusterManagmentError, 'htcaas-job-status returns error: %s' % error
            status_out = status.stdout.read().strip()
            status_out = status_out.split(":", 1)[1]
            if status_out == 'waiting':
                status_out = 'I'
            elif status_out == 'preparing' or status_out == 'running':
                status_out = 'R'
            elif status_out != 'done':
                status_out = 'F'
            elif status_out == 'done':
                status_out = 'C'

        return status_out

    @multiple_try()
    def control(self, me_dir):
        """Check the status of all submitted jobs. Return (idle, run, finish, fail)."""
        if not self.submitted_ids:
            logger.debug("self.submitted_ids does not exist")
            return 0, 0, 0, 0

        ongoing = []
        idle, run, fail = 0, 0, 0

        start = self.submitted_ids[0]
        end = self.submitted_ids[-1]

        cmd = "htcaas-job-status -c " + str(start) + "-" + str(end)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

        for line in status.stdout:
            status2 = line.split()[-1]
            if status2 != 'null' or line.split()[0].strip() != '0':
                ongoing.append(line.split()[0].strip())
                logger.debug("[" + line.split()[0].strip() + "]" + status2)
            if status2 == 'null' or line.split()[0].strip() == '0':
                idle += 1
            elif status2 in self.idle_tag:
                idle += 1
            elif status2 in self.running_tag:
                run += 1
            elif status2 in self.complete_tag:
                if not self.check_termination(line.split()[0]):
                    idle += 1
            else:
                fail += 1

        return idle, run, self.submitted - (idle+run+fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster."""

        if not self.submitted_ids:
            return
        for i in range(len(self.submitted_ids)):
            cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i]
            status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))


class HTCaaS2Cluster(Cluster):
    """Class for dealing with cluster submission on a HTCaaS cluster without GPFS"""

    name = 'htcaas2'
    job_id = 'HTCAAS2_JOBID'
    idle_tag = ['waiting']
    running_tag = ['preparing', 'running']
    complete_tag = ['done']

    @store_input()
    @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the HTCaaS job on the cluster. NO SHARED DISK:
        input/output files should be given relative to cwd.
        """
        if cwd is None:
            cwd = os.getcwd()

        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)

        if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog:
            # the submission itself is delayed: the job is only recorded here
            # and sent as one meta-job by metasubmit()
            if cwd or prog:
                self.submitted_dirs.append(cwd)
                self.submitted_exes.append(prog)
            else:
                logger.debug("cwd and prog do not exist -> " + cwd + " / " + os.path.basename(prog))

            if argument:
                self.submitted_args.append('='.join([str(a) for a in argument]))

            if cwd or prog:
                self.submitted += 1
                id = self.submitted
                self.submitted_ids.append(id)
            else:
                logger.debug("cwd and prog do not exist!")
                id = 0
        else:
            temp_file_name = "sub." + os.path.basename(prog)
            text = """#!/bin/bash
MYPWD=%(cwd)s
cd $MYPWD
input_files=( %(input_files)s )
for i in ${input_files[@]}
do
    chmod -f +x $i
done
/bin/bash %(prog)s %(arguments)s > %(stdout)s
"""
            dico = {'cwd': cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog': prog,
                    'arguments': ' '.join([str(a) for a in argument]),
                    'program': ' ' if '.py' in prog else 'bash'}

            new_prog = pjoin(cwd, temp_file_name)
            open(new_prog, 'w').write(text % dico)
            misc.Popen(['chmod', '+x', new_prog], cwd=cwd)
            command = ['htcaas-mgjob-submit', '-d', cwd, '-e', new_prog]
            a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
            id = a.stdout.read().strip()
            logger.debug("[mode2]-[" + str(id) + "]")
            if cwd and prog:
                self.submitted += 1
                self.submitted_ids.append(id)
            else:
                logger.debug("cwd and prog do not exist!")
                id = 0

        return id

    @multiple_try(nb_try=10, sleep=5)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""

        if self.submitted == self.submitted_ids[-1]:
            # the recorded jobs still need to be sent as one meta-job
            id = self.metasubmit(self)
            tempid = self.submitted_ids[-1]
            self.submitted_ids.remove(self.submitted_ids[-1])
            self.submitted_ids.append(id)
            logger.debug(str(id) + " // " + str(self.submitted_ids[-1]))

        if id == 0:
            status_out = 'C'
        else:
            cmd = 'htcaas-job-status -m ' + str(id) + " -s | grep Status "
            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
            error = status.stderr.read()
            if status.returncode or error:
                raise ClusterManagmentError, 'htcaas-job-status returns error: %s' % error
            status_out = status.stdout.read().strip()
            status_out = status_out.split(":", 1)[1]
            logger.debug("[[" + str(id) + "]]" + status_out)
            if status_out == 'waiting':
                status_out = 'I'
            elif status_out == 'preparing' or status_out == 'running':
                status_out = 'R'
            elif status_out != 'done':
                status_out = 'F'
            elif status_out == 'done':
                status_out = 'C'
                self.submitted -= 1

        return status_out

    @multiple_try()
    def control(self, me_dir):
        """Check the status of all submitted jobs. Return (idle, run, finish, fail)."""
        if not self.submitted_ids:
            logger.debug("self.submitted_ids does not exist")
            return 0, 0, 0, 0

        # me_dir may encode a meta-job id range ("start//end"), a single id
        # or a path; fall back on the first submitted id otherwise
        if "//" in me_dir:
            if int(me_dir.split("//")[0]) < int(me_dir.split("//")[1]):
                start = me_dir.split("//")[0]
                end = me_dir.split("//")[1]
            else:
                start = me_dir.split("//")[1]
                end = me_dir.split("//")[0]
        elif "/" in me_dir:
            start = 0
            end = 0
        elif me_dir.isdigit():
            start = me_dir
            end = me_dir
        elif not me_dir.isdigit():
            me_dir = self.submitted_ids[0]
            start = me_dir
            end = me_dir
            logger.debug("Meta_ID is not digit (control), self.submitted_ids[0]: " + str(me_dir))

        ongoing = []
        idle, run, fail, done = 0, 0, 0, 0

        cmd = "htcaas-job-status -c " + str(start) + "-" + str(end) + " -ac"
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

        for line in status.stdout:
            status2 = line.split()[-1]
            if status2 != 'null' or line.split()[0].strip() != '0':
                ongoing.append(str(line.split()[0].strip()) + "-" + str(line.split()[1].strip()))
                logger.debug("[" + line.split()[0].strip() + "-" + line.split()[1].strip() + "]" + status2)

            if status2 == 'null' or line.split()[0].strip() == '0':
                idle += 1
            elif status2 in self.idle_tag:
                idle += 1
            elif status2 in self.running_tag:
                run += 1
            elif status2 in self.complete_tag:
                done += 1
                self.submitted -= 1
                if not self.check_termination(line.split()[1]):
                    idle += 1
            else:
                fail += 1

        return idle, run, self.submitted - (idle+run+fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster."""

        if not self.submitted_ids:
            return
        id = self.submitted_ids[0]
        if id != 0:
            cmd = "htcaas-job-cancel -m %s" % str(id)
            status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))

from_name = {'condor': CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
             'lsf': LSFCluster, 'ge': GECluster, 'slurm': SLURMCluster,
             'htcaas': HTCaaSCluster, 'htcaas2': HTCaaS2Cluster}
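
# Example (hedged sketch): how a backend is typically picked from the run
# configuration; the `options` keys mirror those read by Cluster.__init__.
#
#   options = {'cluster_type': 'condor', 'cluster_queue': None,
#              'cluster_nb_retry': 1, 'cluster_status_update': (600, 30)}
#   cluster = from_name[options['cluster_type']](**options)
#   cluster.submit('job.sh', cwd='/tmp')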