import subprocess
import logging
import os
import time
import re
import glob
import inspect

logger = logging.getLogger('madgraph.cluster')

try:
    from madgraph import MadGraph5Error
    import madgraph.various.misc as misc
except Exception, error:
    if __debug__:
        print str(error)
    from internal import MadGraph5Error
    import internal.misc as misc

pjoin = os.path.join


class ClusterManagmentError(MadGraph5Error):
    pass


class NotImplemented(MadGraph5Error):
    pass


multiple_try = misc.multiple_try

def check_interupt(error=KeyboardInterrupt):
    """Decorator that removes the submitted jobs when the wrapped call is interrupted."""

    def deco_interupt(f):
        def deco_f_interupt(self, *args, **opt):
            try:
                return f(self, *args, **opt)
            except error:
                try:
                    self.remove(*args, **opt)
                except Exception:
                    pass
                raise error
        return deco_f_interupt
    return deco_interupt

def store_input(arg=''):
    """Decorator that records the arguments of a submission call in
    self.retry_args so that the job can be checked and resubmitted later
    (see check_termination)."""

    def deco_store(f):
        def deco_f_store(self, prog, argument=[], cwd=None, stdout=None,
                         stderr=None, log=None, input_files=[], output_files=[],
                         required_output=[], nb_submit=0):
            frame = inspect.currentframe()
            args, _, _, values = inspect.getargvalues(frame)
            args = dict([(i, values[i]) for i in args if i != 'self'])
            id = f(self, **args)
            self.retry_args[id] = args
            return id
        return deco_f_store
    return deco_store
75
78 """Basic Class for all cluster type submission"""
79 name = 'mother class'
80
82 """Init the cluster"""
83
84 self.submitted = 0
85 self.submitted_ids = []
86 self.finish = 0
87 if 'cluster_queue' in opts:
88 self.cluster_queue = opts['cluster_queue']
89 else:
90 self.cluster_queue = 'madgraph'
91 if 'cluster_temp_path' in opts:
92 self.temp_dir = opts['cluster_temp_path']
93 else:
94 self.temp_dir = None
95 self.options = {'cluster_status_update': (600, 30)}
96 for key,value in opts.items():
97 self.options[key] = value
98 self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0
99 self.cluster_retry_wait = opts['cluster_retry_wait'] if 'cluster_retry_wait' in opts else 300
100 self.options = dict(opts)
101 self.retry_args = {}
102
103
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        """How to make one submission. Return status id on the cluster."""
        raise NotImplemented, 'No implementation of how to submit a job to cluster \'%s\'' % self.name
108
    @store_input()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[], nb_submit=0):
        """How to make one submission. Return status id on the cluster.
        NO SHARE DISK"""

        if cwd is None:
            cwd = os.getcwd()
        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)

        if not required_output and output_files:
            required_output = output_files

        if not hasattr(self, 'temp_dir') or not self.temp_dir or \
                (input_files == [] == output_files):
            return self.submit(prog, argument, cwd, stdout, stderr, log,
                               required_output=required_output, nb_submit=nb_submit)

        temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument)
138
139 text = """#!/bin/bash
140 MYTMP=%(tmpdir)s/run$%(job_id)s
141 MYPWD=%(cwd)s
142 mkdir -p $MYTMP
143 cd $MYPWD
144 input_files=( %(input_files)s )
145 for i in ${input_files[@]}
146 do
147 cp -R -L $i $MYTMP
148 done
149 cd $MYTMP
150 echo '%(arguments)s' > arguments
151 chmod +x ./%(script)s
152 %(program)s ./%(script)s %(arguments)s
153 output_files=( %(output_files)s )
154 for i in ${output_files[@]}
155 do
156 cp -r $MYTMP/$i $MYPWD
157 done
158 rm -rf $MYTMP
159 """
        dico = {'tmpdir': self.temp_dir, 'script': os.path.basename(prog),
                'cwd': cwd, 'job_id': self.job_id,
                'input_files': ' '.join(input_files + [prog]),
                'output_files': ' '.join(output_files),
                'arguments': ' '.join([str(a) for a in argument]),
                'program': ' ' if '.py' in prog else 'bash'}

        new_prog = pjoin(cwd, temp_file_name)
        open(new_prog, 'w').write(text % dico)
        misc.Popen(['chmod', '+x', new_prog], cwd=cwd)

        return self.submit(new_prog, argument, cwd, stdout, stderr, log,
                           required_output=required_output, nb_submit=nb_submit)
174
175
177 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)"""
178 if not self.submitted_ids:
179 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
180 idle, run, fail = 0, 0, 0
181 for pid in self.submitted_ids[:]:
182 status = self.control_one_job(id)
183 if status == 'I':
184 idle += 1
185 elif status == 'R':
186 run += 1
187 elif status == 'F':
188 self.finish +=1
189 self.submitted_ids.remove(pid)
190 else:
191 fail += 1
192
193 return idle, run, self.finish, fail
194
196 """ control the status of a single job with it's cluster id """
197 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
198
    @check_interupt()
    def wait(self, me_dir, fct, minimal_job=0):
        """Wait for all jobs to finish.
        If minimal_job is set, return as soon as idle + run is lower than that number."""
203
204
205 mode = 1
206 nb_iter = 0
207 nb_short = 0
208 change_at = 5
209
210 longtime, shorttime = self.options['cluster_status_update']
211
212 while 1:
213 old_mode = mode
214 nb_iter += 1
215 idle, run, finish, fail = self.control(me_dir)
216 if fail:
217 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team')
218 if idle + run == 0:
219
220 logger.info('All jobs finished')
221 break
222 if idle + run < minimal_job:
223 return
224 fct(idle, run, finish)
225
226 if nb_iter < change_at:
227 mode = 1
228 elif idle < run:
229 if old_mode == 0:
230 if nb_short:
231 mode = 0
232
233 elif idle:
234 if nb_iter > change_at + int(longtime)//shorttime:
235 mode = 0
236 else:
237 mode = 1
238 nb_short =0
239 else:
240 mode = 1
241 nb_short = 0
242 elif old_mode == 1:
243 nb_short +=1
244 if nb_short > 3* max(change_at, int(longtime)//shorttime):
245 mode = 0
246 else:
247 mode = 0
248
249
250 if old_mode > mode:
251 logger.info('''Start to wait %ss between checking status.
252 Note that you can change this time in the configuration file.
253 Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0])
254
255
256 if mode == 0:
257 try:
258 time.sleep(self.options['cluster_status_update'][0])
259 except KeyboardInterrupt:
260 logger.info('start to update the status')
261 nb_iter = min(0, change_at -2)
262 nb_short = 0
263 else:
264 time.sleep(self.options['cluster_status_update'][1])
265
266
267 self.submitted = 0
268 self.submitted_ids = []
269
271 """Check the termination of the jobs with job_id and relaunch it if needed."""
272
273
274 if job_id not in self.retry_args:
275 return True
276
277 args = self.retry_args[job_id]
278 if 'time_check' in args:
279 time_check = args['time_check']
280 else:
281 time_check = 0
282
283 for path in args['required_output']:
284 if args['cwd']:
285 path = pjoin(args['cwd'], path)
286
287 if not (os.path.exists(path) and os.stat(path).st_size != 0) :
288 break
289 else:
290
291 if time_check > 0:
292 logger.info('Job %s Finally found the missing output.' % (job_id))
293 del self.retry_args[job_id]
294 self.submitted_ids.remove(job_id)
295 return 'done'
296
297 if time_check == 0:
298 logger.debug('''Job %s: missing output:%s''' % (job_id,path))
299 args['time_check'] = time.time()
300 return 'wait'
301 elif self.cluster_retry_wait > time.time() - time_check:
302 return 'wait'
303
304
305 if self.nb_retry < 0:
306 logger.critical('''Fail to run correctly job %s.
307 with option: %s
308 file missing: %s''' % (job_id, args, path))
309 raw_input('press enter to continue.')
310 elif self.nb_retry == 0:
311 logger.critical('''Fail to run correctly job %s.
312 with option: %s
313 file missing: %s.
314 Stopping all runs.''' % (job_id, args, path))
315
316 elif args['nb_submit'] >= self.nb_retry:
317 logger.critical('''Fail to run correctly job %s.
318 with option: %s
319 file missing: %s
320 Fails %s times
321 No resubmition. ''' % (job_id, args, path, args['nb_submit']))
322
323 else:
324 args['nb_submit'] += 1
325 logger.warning('resubmit job (for the %s times)' % args['nb_submit'])
326 del self.retry_args[job_id]
327 self.submitted_ids.remove(job_id)
328 if 'time_check' in args:
329 del args['time_check']
330 self.submit2(**args)
331 return 'resubmit'
332 return 'done'
333
334
335
    @check_interupt()
    def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
                        stderr=None, log=None, required_output=[], nb_submit=0,
                        input_files=[], output_files=[]):
        """Launch one job on the cluster and wait for it."""
341
342 special_output = False
343 if stderr == -2 and stdout:
344
345 special_output = True
346 stderr = stdout + '.err'
347
348 id = self.submit2(prog, argument, cwd, stdout, stderr, log,
349 required_output=required_output, input_files=input_files,
350 output_files=output_files)
351
352 frame = inspect.currentframe()
353 args, _, _, values = inspect.getargvalues(frame)
354 args = dict([(i, values[i]) for i in args if i != 'self'])
355 self.retry_args[id] = args
356
357 nb_wait=0
358 while 1:
359 nb_wait+=1
360 status = self.control_one_job(id)
361 if not status in ['R','I']:
362 status = self.check_termination(id)
363 if status in ['wait']:
364 time.sleep(30)
365 continue
366 elif status in ['resubmit']:
367 id = self.submitted_ids[0]
368 time.sleep(30)
369 continue
370
371 time.sleep(30)
372 break
373 time.sleep(self.options['cluster_status_update'][1])
374
375 if required_output:
376 status = self.check_termination(id)
377 if status == 'wait':
378 run += 1
379 elif status == 'resubmit':
380 idle += 1
381
382
383 if special_output:
384
385
386 for i in range(5):
387 if os.path.exists(stdout):
388 if not os.path.exists(stderr):
389 time.sleep(5)
390 if os.path.exists(stderr):
391 err_text = open(stderr).read()
392 if not err_text:
393 return
394 logger.warning(err_text)
395 text = open(stdout).read()
396 open(stdout,'w').write(text + err_text)
397 else:
398 return
399 time.sleep(10)
400
    def remove(self, *args, **opts):
        """Default implementation: this cluster type does not support job removal."""
        logger.warning("""This cluster didn't support job removal,
        the jobs are still running on the cluster.""")
405
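
# Illustrative sketch (not part of the original interface): a new batch system is
# supported by subclassing Cluster and overriding at least submit, control_one_job
# and remove, as the concrete backends below do.  The commands 'mysubmit',
# 'mystat' and 'mykill' are hypothetical placeholders.
class ExampleCluster(Cluster):
    """Hypothetical skeleton of a custom backend."""
    name = 'example'
    job_id = 'EXAMPLE_JOBID'

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        out = misc.Popen(['mysubmit', prog] + [str(a) for a in argument],
                         stdout=subprocess.PIPE, cwd=cwd).communicate()[0]
        id = out.strip()
        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try()
    def control_one_job(self, id):
        out = misc.Popen(['mystat', str(id)],
                         stdout=subprocess.PIPE).communicate()[0]
        if 'queued' in out:
            return 'I'
        elif 'running' in out:
            return 'R'
        return 'F'

    @multiple_try()
    def remove(self, *args, **opts):
        if self.submitted_ids:
            misc.Popen(['mykill'] + self.submitted_ids).wait()
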
407 """ class for dealing with the submission in multiple node"""
408
409 job_id = '$'
410
412 """Init the cluster"""
413 import thread
414 super(MultiCore, self).__init__(self, *args, **opt)
415
416
417 self.submitted = 0
418 self.finish = 0
419 if 'nb_core' in opt:
420 self.nb_core = opt['nb_core']
421 elif isinstance(args[0],int):
422 self.nb_core = args[0]
423 else:
424 self.nb_core = 1
425 self.update_fct = None
426
427
428 self.need_waiting = False
429 self.nb_used = 0
430 self.lock = thread.allocate_lock()
431 self.done = 0
432 self.waiting_submission = []
433 self.pids = []
434 self.fail_msg = None
435
    def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
                        stderr=None, log=None, **opts):
        """Launch one job and wait for it."""
        if isinstance(stdout, str):
            stdout = open(stdout, 'w')
        if isinstance(stderr, str):
            stderr = open(stderr, 'w')
        return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)
444
445
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        """Submit a job on a multicore machine"""
449
450 self.submitted +=1
451 if cwd is None:
452 cwd = os.getcwd()
453 if isinstance(prog, str):
454 if not os.path.exists(prog) and not misc.which(prog):
455 prog = os.path.join(cwd, prog)
456
457 import thread
458 if self.waiting_submission or self.nb_used == self.nb_core:
459 self.waiting_submission.append((prog, argument,cwd, stdout))
460
461 while self.nb_used < self.nb_core and self.waiting_submission:
462 arg = self.waiting_submission.pop(0)
463 self.nb_used += 1
464 thread.start_new_thread(self.launch, arg)
465 elif self.nb_used < self.nb_core -1:
466 self.nb_used += 1
467 thread.start_new_thread(self.launch, (prog, argument, cwd, stdout))
468 elif self.nb_used == self.nb_core -1:
469 self.nb_used += 1
470 thread.start_new_thread(self.launch, (prog, argument, cwd, stdout))
471
472
    def launch(self, exe, argument, cwd, stdout):
        """Launch a job in multicore mode. If exe is a string it is treated as
        an executable; otherwise it is treated as a function."""
476 import thread
477 def end(self, pid):
478 self.nb_used -= 1
479 self.done += 1
480 try:
481 self.pids.remove(pid)
482 except:
483 pass
484
485 fail_msg = None
486 try:
487 if isinstance(exe,str):
488 if os.path.exists(exe) and not exe.startswith('/'):
489 exe = './' + exe
490 proc = misc.Popen([exe] + argument, cwd=cwd, stdout=stdout,
491 stderr=subprocess.STDOUT)
492 pid = proc.pid
493 self.pids.append(pid)
494 proc.wait()
495 if proc.returncode not in [0, 143, -15]:
496 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \
497 (' '.join([exe]+argument), proc.returncode)
498
499 logger.warning(fail_msg)
500 try:
501 log = open(glob.glob(pjoin(cwd,'*','log.txt'))[0]).read()
502 logger.warning('Last 15 lines of logfile %s:\n%s\n' % \
503 (pjoin(cwd,'*','log.txt'), '\n'.join(log.split('\n')[-15:-1]) + '\n'))
                    except (IOError, AttributeError):
                        logger.warning('Please look for possible logfiles in %s' % cwd)
                        pass
507 self.remove(fail_msg)
508 else:
509 pid = tuple([id(o) for o in [exe] + argument])
510 self.pids.append(pid)
511
512
513 returncode = exe(argument)
514 if returncode != 0:
515 logger.warning(returncode)
516 self.remove()
517
518
519
520
521 security = 0
522
523 while 1:
524 while not self.lock.locked():
525 if not self.need_waiting:
526
527 end(self, pid)
528 return
529 elif security > 60:
530 end(self, pid)
531 return
532 security += 1
533 time.sleep(1)
534 try:
535 self.lock.release()
536 except thread.error:
537 continue
538 break
539 end(self, pid)
540
541
542 except Exception, error:
543
544 self.remove()
545 raise
546
547
548
549
    def wait(self, me_dir, update_status):
        """Wait for all threads to finish.
        self.nb_used and self.done are updated by each job (thread and local).
        self.submitted is the number of times submit has been called (local).
        remaining is the number of jobs that we still have to wait for (local).
        self.pids is the list of the BASH pids of the submitted jobs (thread).

        WARNING: In principle all those values are coherent, but since some are
        modified in various threads the data can be corrupted (not the local
        ones). nb_used in particular shouldn't be trusted too much.
        This code checks in different ways that all jobs have finished.

        In principle, the statements related to '#security #X' are not used.
        In practice they are, from time to time.
        """
565
566 import thread
567
568 remaining = self.submitted - self.done
569
570 while self.nb_used < self.nb_core:
571 if self.waiting_submission:
572 arg = self.waiting_submission.pop(0)
573 thread.start_new_thread(self.launch, arg)
574 self.nb_used += 1
575 else:
576 break
577
578 try:
579 self.need_waiting = True
580 self.lock.acquire()
581 no_in_queue = 0
582 secure_mode = False
583 while self.waiting_submission or self.nb_used:
584 if self.fail_msg:
585 msg, self.fail_msg = self.fail_msg, None
586 self.remove()
587 raise Exception, msg
588 if update_status:
589 update_status(len(self.waiting_submission), self.nb_used, self.done)
590
591
592 if len(self.waiting_submission) == 0 == remaining :
593 self.done = self.submitted
594 break
595
596
597 if len(self.waiting_submission) == 0 and len(self.pids) == 0:
598 if self.submitted == self.done:
599 break
600 logger.debug('Found too many jobs. Recovering')
601 no_in_queue += 1
602 time.sleep(min(180, 5 * no_in_queue))
603 if no_in_queue > 3:
604 logger.debug('Still too many jobs. Continue')
605 break
606 continue
607
608
609 if not secure_mode and len(self.waiting_submission) != 0:
610 if self.nb_used != self.nb_core:
611 if self.nb_used != len(self.pids):
612 secure_mode = True
613
614 if secure_mode and not self.waiting_submission:
615 self.need_waiting = False
616 if self.lock.locked():
617 self.lock.release()
618 break
619
620
621 self.lock.acquire()
622 remaining -=1
623
624 if self.waiting_submission:
625 arg = self.waiting_submission.pop(0)
626 thread.start_new_thread(self.launch, arg)
627 self.nb_used += 1
628
629 if self.fail_msg:
630 msg, self.fail_msg = self.fail_msg, None
631 self.remove()
632 raise Exception, msg
633
634
635 no_in_queue = 0
636 while self.submitted > self.done:
637 if self.fail_msg:
638 msg, self.fail_msg = self.fail_msg, None
639 self.remove()
640 raise Exception, msg
641 if no_in_queue == 0:
642 logger.debug('Some jobs have been lost. Try to recover')
643
644 if not len(self.pids):
645
646 logger.critical('Some jobs have been lost in the multicore treatment.')
647 logger.critical('The results might be incomplete. (Trying to continue anyway)')
648 break
649 elif update_status:
650 update_status(len(self.waiting_submission), len(self.pids) ,
651 self.done)
652
653 if not secure_mode:
654 self.lock.acquire()
655 else:
656 no_in_queue += 1
657 try:
658 time.sleep(min(180,5*no_in_queue))
659 if no_in_queue > 5 * 3600.0 / 162:
660 break
661 except KeyboardInterrupt:
662 logger.warning('CTRL-C assumes that all jobs are done. Continue the code')
663 self.pids = []
664 break
665
666
667 no_in_queue = 0
668 while len(self.pids):
669 if self.fail_msg:
670 msg, self.fail_msg = self.fail_msg, None
671 self.remove()
672 raise Exception, msg
673 self.need_waiting = False
674 if self.lock.locked():
675 self.lock.release()
676 secure_mode = True
677 if no_in_queue == 0 :
678 logger.warning('Some jobs have been lost. Try to recover.')
679 logger.warning('Hitting ctrl-c will consider that all jobs are done and continue the code.')
680 try:
681
682 if update_status:
683 update_status(len(self.waiting_submission), len(self.pids) ,
684 self.done)
685 time.sleep(min(5*no_in_queue, 180))
686 no_in_queue += 1
687 if no_in_queue > 5 * 3600.0 / 162:
688 break
689 except KeyboardInterrupt:
690 break
691
692
693 if update_status:
694 self.next_update = 0
695 update_status(len(self.waiting_submission), 0, self.done)
696
697
698 self.need_waiting = False
699 security = 0
700 while not self.lock.locked() and security < 10:
701
702 if secure_mode:
703 security = 10
704 security +=1
705 time.sleep(1)
706 if security < 10:
707 self.lock.release()
708 self.done = 0
709 self.nb_used = 0
710 self.submitted = 0
711 self.pids = []
712
713 except KeyboardInterrupt:
714 self.remove()
715 raise
716 if self.fail_msg:
717 msg, self.fail_msg = self.fail_msg, None
718 self.remove()
719 raise Exception, msg
720
721
    def remove(self, error=None):
        """Ensure that all threads are killed."""
        logger.info('remove job currently running')
725 self.waiting_submission = []
726 if error:
727 self.fail_msg = error
728 for pid in list(self.pids):
729 if isinstance(pid, tuple):
730 continue
731 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \
732 % {'pid':pid} )
733 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} )
734 if out == 0:
735 try:
736 self.pids.remove(pid)
737 except:
738 pass
739
740
741 time.sleep(1)
742 for pid in list(self.pids):
743 if isinstance(pid, tuple):
744 continue
745 out = os.system('CPIDS=$(pgrep -P %s); kill -15 $CPIDS > /dev/null 2>&1' % pid )
746 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} )
747 if out == 0:
748 try:
749 self.pids.remove(pid)
750 except:
751 pass
752
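# Minimal usage sketch (illustrative only, not part of the original module):
# how the MultiCore backend above can be driven directly.  The script name
# './ajob1' and the directory '/tmp/run_dir' are hypothetical placeholders.
def _example_multicore_run():
    cluster = MultiCore(nb_core=2)
    cluster.submit('./ajob1', argument=['1'], cwd='/tmp/run_dir')
    # the callback receives (nb waiting, nb running, nb done) at each poll
    cluster.wait('/tmp/run_dir', lambda idle, run, done: logger.info(
        'waiting: %s  running: %s  done: %s' % (idle, run, done)))
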
754 """Basic class for dealing with cluster submission"""
755
756 name = 'condor'
757 job_id = 'CONDOR_ID'
758
759
760
    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a Condor cluster"""
765
766 text = """Executable = %(prog)s
767 output = %(stdout)s
768 error = %(stderr)s
769 log = %(log)s
770 %(argument)s
771 environment = CONDOR_ID=$(Cluster).$(Process)
772 Universe = vanilla
773 notification = Error
774 Initialdir = %(cwd)s
775 %(requirement)s
776 getenv=True
777 queue 1
778 """
779
780 if self.cluster_queue not in ['None', None]:
781 requirement = 'Requirements = %s=?=True' % self.cluster_queue
782 else:
783 requirement = ''
784
785 if cwd is None:
786 cwd = os.getcwd()
787 if stdout is None:
788 stdout = '/dev/null'
789 if stderr is None:
790 stderr = '/dev/null'
791 if log is None:
792 log = '/dev/null'
793 if not os.path.exists(prog):
794 prog = os.path.join(cwd, prog)
795 if argument:
796 argument = 'Arguments = %s' % ' '.join(argument)
797 else:
798 argument = ''
799
800
801 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
802 'stderr': stderr,'log': log,'argument': argument,
803 'requirement': requirement}
804
805 open('submit_condor','w').write(text % dico)
806 a = misc.Popen(['condor_submit','submit_condor'], stdout=subprocess.PIPE)
807 output = a.stdout.read()
808
809
810
811 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE)
812 try:
813 id = pat.search(output).groups()[0]
814 except:
815 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
816 % output
817 self.submitted += 1
818 self.submitted_ids.append(id)
819 return id
820
    @store_input()
    @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the job on the cluster with NO SHARE DISK.
        input/output files should be given relative to cwd.
        """
829
830 if not required_output and output_files:
831 required_output = output_files
832
833 if (input_files == [] == output_files):
834 return self.submit(prog, argument, cwd, stdout, stderr, log,
835 required_output=required_output, nb_submit=nb_submit)
836
837 text = """Executable = %(prog)s
838 output = %(stdout)s
839 error = %(stderr)s
840 log = %(log)s
841 %(argument)s
842 should_transfer_files = YES
843 when_to_transfer_output = ON_EXIT
844 transfer_input_files = %(input_files)s
845 %(output_files)s
846 Universe = vanilla
847 notification = Error
848 Initialdir = %(cwd)s
849 %(requirement)s
850 getenv=True
851 queue 1
852 """
853
854 if self.cluster_queue not in ['None', None]:
855 requirement = 'Requirements = %s=?=True' % self.cluster_queue
856 else:
857 requirement = ''
858
859 if cwd is None:
860 cwd = os.getcwd()
861 if stdout is None:
862 stdout = '/dev/null'
863 if stderr is None:
864 stderr = '/dev/null'
865 if log is None:
866 log = '/dev/null'
867 if not os.path.exists(prog):
868 prog = os.path.join(cwd, prog)
869 if argument:
870 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument])
871 else:
872 argument = ''
873
874 if input_files:
875 input_files = ','.join(input_files)
876 else:
877 input_files = ''
878 if output_files:
879 output_files = 'transfer_output_files = %s' % ','.join(output_files)
880 else:
881 output_files = ''
882
883
884
885 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
886 'stderr': stderr,'log': log,'argument': argument,
887 'requirement': requirement, 'input_files':input_files,
888 'output_files':output_files}
889
890 open('submit_condor','w').write(text % dico)
891 a = subprocess.Popen(['condor_submit','submit_condor'], stdout=subprocess.PIPE)
892 output = a.stdout.read()
893
894
895
896 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE)
897 try:
898 id = pat.search(output).groups()[0]
899 except:
900 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
901 % output
902 self.submitted += 1
903 self.submitted_ids.append(id)
904 return id
905
906
907
908
909
    @multiple_try(nb_try=10, sleep=10)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"
914 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
915 stderr=subprocess.PIPE)
916
917 error = status.stderr.read()
918 if status.returncode or error:
919 raise ClusterManagmentError, 'condor_q returns error: %s' % error
920
921 return status.stdout.readline().strip()
922
    @check_interupt()
    @multiple_try(nb_try=10, sleep=10)
    def control(self, me_dir):
        """Check the status of the submitted jobs. Return (idle, run, finish, fail)."""
927
928 if not self.submitted_ids:
929 return 0, 0, 0, 0
930
931 packet = 15000
932 idle, run, fail = 0, 0, 0
933 ongoing = []
934 for i in range(1+(len(self.submitted_ids)-1)//packet):
935 start = i * packet
936 stop = (i+1) * packet
937 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \
938 " -format \'%-2s\ ' \'ClusterId\' " + \
939 " -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"
940
941 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
942 stderr=subprocess.PIPE)
943 error = status.stderr.read()
944 if status.returncode or error:
945 raise ClusterManagmentError, 'condor_q returns error: %s' % error
946
947 for line in status.stdout:
948 id, status = line.strip().split()
949 ongoing.append(int(id))
950 if status in ['I','U']:
951 idle += 1
952 elif status == 'R':
953 run += 1
954 elif status != 'C':
955 fail += 1
956
957 for id in list(self.submitted_ids):
958 if int(id) not in ongoing:
959 status = self.check_termination(id)
960 if status == 'wait':
961 run += 1
962 elif status == 'resubmit':
963 idle += 1
964
965 return idle, run, self.submitted - (idle+run+fail), fail
966
    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""
970
971 if not self.submitted_ids:
972 return
973 cmd = "condor_rm %s" % ' '.join(self.submitted_ids)
974
975 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
976
978 """Basic class for dealing with cluster submission"""
979
980 name = 'pbs'
981 job_id = 'PBS_JOBID'
982 idle_tag = ['Q']
983 running_tag = ['T','E','R']
984 complete_tag = ['C']
985
986 maximum_submited_jobs = 2500
987
    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a PBS cluster"""
992
993
994 me_dir = os.path.realpath(os.path.join(cwd,prog)).rsplit('/SubProcesses',1)[0]
995 me_dir = misc.digest(me_dir)[-14:]
996 if not me_dir[0].isalpha():
997 me_dir = 'a' + me_dir[1:]
998
999 if len(self.submitted_ids) > self.maximum_submited_jobs:
1000 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish))
1001 me_dir = os.path.realpath(os.path.join(cwd,prog)).rsplit('/SubProcesses',1)[0]
1002 self.wait(me_dir, fct, self.maximum_submited_jobs)
1003
1004
1005 text = ""
1006 if cwd is None:
1007 cwd = os.getcwd()
1008 else:
1009 text = " cd %s;" % cwd
1010 if stdout is None:
1011 stdout = '/dev/null'
1012 if stderr is None:
1013 stderr = '/dev/null'
1014 elif stderr == -2:
1015 stderr = stdout
1016 if log is None:
1017 log = '/dev/null'
1018
1019 if not os.path.isabs(prog):
1020 text += "./%s" % prog
1021 else:
1022 text+= prog
1023
1024 if argument:
1025 text += ' ' + ' '.join(argument)
1026
1027 command = ['qsub','-o', stdout,
1028 '-N', me_dir,
1029 '-e', stderr,
1030 '-V']
1031
1032 if self.cluster_queue and self.cluster_queue != 'None':
1033 command.extend(['-q', self.cluster_queue])
1034
1035 a = misc.Popen(command, stdout=subprocess.PIPE,
1036 stderr=subprocess.STDOUT,
1037 stdin=subprocess.PIPE, cwd=cwd)
1038
1039 output = a.communicate(text)[0]
1040 id = output.split('.')[0]
1041 if not id.isdigit() or a.returncode !=0:
1042 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1043 % output
1044
1045 self.submitted += 1
1046 self.submitted_ids.append(id)
1047 return id
1048
    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        cmd = 'qstat ' + str(id)
1053 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1054 stderr=subprocess.STDOUT)
1055
1056 for line in status.stdout:
1057 line = line.strip()
1058 if 'cannot connect to server' in line or 'cannot read reply' in line:
1059 raise ClusterManagmentError, 'server disconnected'
1060 if 'Unknown' in line:
1061 return 'F'
1062 elif line.startswith(str(id)):
1063 jobstatus = line.split()[4]
1064 else:
1065 jobstatus=""
1066
1067 if status.returncode != 0 and status.returncode is not None:
1068 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode
1069 if jobstatus in self.idle_tag:
1070 return 'I'
1071 elif jobstatus in self.running_tag:
1072 return 'R'
1073 return 'F'
1074
1075
    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to directory me_dir. Return (idle, run, finish, fail)."""
        cmd = "qstat"
1080 status = misc.Popen([cmd], stdout=subprocess.PIPE)
1081
1082 if me_dir.endswith('/'):
1083 me_dir = me_dir[:-1]
1084 me_dir = misc.digest(me_dir)[-14:]
1085 if not me_dir[0].isalpha():
1086 me_dir = 'a' + me_dir[1:]
1087 ongoing = []
1088
1089 idle, run, fail = 0, 0, 0
1090 for line in status.stdout:
1091 if 'cannot connect to server' in line or 'cannot read reply' in line:
1092 raise ClusterManagmentError, 'server disconnected'
1093 if me_dir in line:
1094 ongoing.append(line.split()[0].split('.')[0])
1095 status2 = line.split()[4]
1096 if status2 in self.idle_tag:
1097 idle += 1
1098 elif status2 in self.running_tag:
1099 run += 1
1100 elif status2 in self.complete_tag:
1101 if not self.check_termination(line.split()[0].split('.')[0]):
1102 idle += 1
1103 else:
1104 fail += 1
1105
1106 if status.returncode != 0 and status.returncode is not None:
1107 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode
1108
1109 for id in list(self.submitted_ids):
1110 if id not in ongoing:
1111 status2 = self.check_termination(id)
1112 if status2 == 'wait':
1113 run += 1
1114 elif status2 == 'resubmit':
1115 idle += 1
1116
1117 return idle, run, self.submitted - (idle+run+fail), fail
1118
    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""
1122
1123 if not self.submitted_ids:
1124 return
1125 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1126 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1127
1130 """Basic class for dealing with cluster submission"""
1131
1132
1133 name = 'sge'
1134 job_id = 'JOB_ID'
1135 idle_tag = ['qw', 'hqw','hRqw','w']
1136 running_tag = ['r','t','Rr','Rt']
1137
1139 """replace string for path issues"""
1140 location = os.path.realpath(location)
1141 homePath = os.getenv("HOME")
1142 if homePath:
1143 location = location.replace(homePath,'$HOME')
1144 return location
1145
    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to an SGE cluster"""
1150
1151 me_dir = os.path.realpath(os.path.join(cwd,prog)).rsplit('/SubProcesses',1)[0]
1152 me_dir = misc.digest(me_dir)[-10:]
1153 if not me_dir[0].isalpha():
1154 me_dir = 'a' + me_dir[1:]
1155
1156 if cwd is None:
1157
1158 cwd = self.def_get_path(os.getcwd())
1159 cwd1 = self.def_get_path(cwd)
1160 text = " cd %s;" % cwd1
1161 if stdout is None:
1162 stdout = '/dev/null'
1163 else:
1164 stdout = self.def_get_path(stdout)
1165 if stderr is None:
1166 stderr = '/dev/null'
1167 elif stderr == -2:
1168 stderr = stdout
1169 else:
1170 stderr = self.def_get_path(stderr)
1171
1172 if log is None:
1173 log = '/dev/null'
1174 else:
1175 log = self.def_get_path(log)
1176
1177 text += prog
1178 if argument:
1179 text += ' ' + ' '.join(argument)
1180
1181
1182
1183
1184 homePath = os.getenv("HOME")
1185 if homePath:
1186 text = text.replace(homePath,'$HOME')
1187
1188 logger.debug("!=== input %s" % text)
1189 logger.debug("!=== output %s" % stdout)
1190 logger.debug("!=== error %s" % stderr)
1191 logger.debug("!=== logs %s" % log)
1192
1193 command = ['qsub','-o', stdout,
1194 '-N', me_dir,
1195 '-e', stderr,
1196 '-V']
1197
1198 if self.cluster_queue and self.cluster_queue != 'None':
1199 command.extend(['-q', self.cluster_queue])
1200
1201 a = misc.Popen(command, stdout=subprocess.PIPE,
1202 stderr=subprocess.STDOUT,
1203 stdin=subprocess.PIPE, cwd=cwd)
1204
1205 output = a.communicate(text)[0]
1206 id = output.split(' ')[2]
1207 if not id.isdigit():
1208 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1209 % output
1210 self.submitted += 1
1211 self.submitted_ids.append(id)
1212 logger.debug(output)
1213
1214 return id
1215
    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1219
1220 cmd = 'qstat '
1221 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1222 for line in status.stdout:
1223
1224
1225
1226
1227
1228
1229 if str(id) in line:
1230 status = line.split()[4]
1231
1232 if status in self.idle_tag:
1233 return 'I'
1234 elif status in self.running_tag:
1235 return 'R'
1236 return 'F'
1237
    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to directory me_dir. Return (idle, run, finish, fail)."""
        cmd = "qstat "
1242 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1243
1244 if me_dir.endswith('/'):
1245 me_dir = me_dir[:-1]
1246 me_dir = misc.digest(me_dir)[-10:]
1247 if not me_dir[0].isalpha():
1248 me_dir = 'a' + me_dir[1:]
1249
1250 idle, run, fail = 0, 0, 0
1251 for line in status.stdout:
1252 if me_dir in line:
1253 status = line.split()[4]
1254 if status in self.idle_tag:
1255 idle += 1
1256 elif status in self.running_tag:
1257 run += 1
1258 else:
1259 logger.debug(line)
1260 fail += 1
1261
1262 return idle, run, self.submitted - (idle+run+fail), fail
1263
1264
1265
    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""
1269
1270 if not self.submitted_ids:
1271 return
1272 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1273 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1274
1277 """Basic class for dealing with cluster submission"""
1278
1279 name = 'lsf'
1280 job_id = 'LSB_JOBID'
1281
    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit the job prog to an LSF cluster"""
1286
1287 me_dir = os.path.realpath(os.path.join(cwd,prog)).rsplit('/SubProcesses',1)[0]
1288 me_dir = misc.digest(me_dir)[-14:]
1289 if not me_dir[0].isalpha():
1290 me_dir = 'a' + me_dir[1:]
1291
1292 text = ""
1293 command = ['bsub', '-J', me_dir]
1294 if cwd is None:
1295 cwd = os.getcwd()
1296 else:
1297 text = " cd %s;" % cwd
        if stdout and isinstance(stdout, str):
            command.extend(['-o', stdout])
        if stderr and isinstance(stderr, str):
            command.extend(['-e', stderr])
        elif stderr == -2:
            pass
1304 if log is None:
1305 log = '/dev/null'
1306
1307 text += prog
1308 if argument:
1309 text += ' ' + ' '.join(argument)
1310
1311 if self.cluster_queue and self.cluster_queue != 'None':
1312 command.extend(['-q', self.cluster_queue])
1313
1314 a = misc.Popen(command, stdout=subprocess.PIPE,
1315 stderr=subprocess.STDOUT,
1316 stdin=subprocess.PIPE, cwd=cwd)
1317
1318 output = a.communicate(text)[0]
1319
1320 try:
1321 id = output.split('>',1)[0].split('<')[1]
1322 except:
1323 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1324 % output
1325 if not id.isdigit():
1326 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1327 % output
1328 self.submitted += 1
1329 self.submitted_ids.append(id)
1330 return id
1331
1332
    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1336
1337 cmd = 'bjobs '+str(id)
1338 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1339
1340 for line in status.stdout:
1341 line = line.strip().upper()
1342 if 'JOBID' in line:
1343 continue
1344 elif str(id) not in line:
1345 continue
1346 status = line.split()[2]
1347 if status == 'RUN':
1348 return 'R'
1349 elif status == 'PEND':
1350 return 'I'
1351 elif status == 'DONE':
1352 return 'F'
1353 else:
1354 return 'H'
1355 return 'F'
1356
    @multiple_try()
    def control(self, me_dir):
        """Check the status of the submitted jobs. Return (idle, run, finish, fail)."""
1360
1361 if not self.submitted_ids:
1362 return 0, 0, 0, 0
1363
1364 cmd = "bjobs " + ' '.join(self.submitted_ids)
1365 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1366
1367 idle, run, fail = 0, 0, 0
1368 for line in status.stdout:
1369 line = line.strip()
1370 if 'JOBID' in line:
1371 continue
1372 splitline = line.split()
1373 id = splitline[0]
1374 if id not in self.submitted_ids:
1375 continue
1376 status = splitline[2]
1377 if status == 'RUN':
1378 run += 1
1379 elif status == 'PEND':
1380 idle += 1
1381 elif status == 'DONE':
1382 status = self.check_termination(id)
1383 if status == 'wait':
1384 run += 1
1385 elif status == 'resubmit':
1386 idle += 1
1387 else:
1388 fail += 1
1389
1390 return idle, run, self.submitted - (idle+run+fail), fail
1391
    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""
1395
1396 if not self.submitted_ids:
1397 return
1398 cmd = "bkill %s" % ' '.join(self.submitted_ids)
1399 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1400
1402 """Class for dealing with cluster submission on a GE cluster"""
1403
1404 name = 'ge'
1405 job_id = 'JOB_ID'
1406 idle_tag = ['qw']
1407 running_tag = ['r']
1408
    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a GE cluster"""
1413
1414 text = ""
1415 if cwd is None:
1416 cwd = os.getcwd()
1417 else:
1418 text = " cd %s; bash " % cwd
1419 if stdout is None:
1420 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1])
1421 if stderr is None:
1422 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1])
1423 elif stderr == -2:
1424 stderr = stdout
1425 if log is None:
1426 log = '/dev/null'
1427
1428 text += prog
1429 if argument:
1430 text += ' ' + ' '.join(argument)
1431 text += '\n'
1432 tmp_submit = os.path.join(cwd, 'tmp_submit')
1433 open(tmp_submit,'w').write(text)
1434
1435 a = misc.Popen(['qsub','-o', stdout,
1436 '-e', stderr,
1437 tmp_submit],
1438 stdout=subprocess.PIPE,
1439 stderr=subprocess.STDOUT,
1440 stdin=subprocess.PIPE, cwd=cwd)
1441
1442 output = a.communicate()[0]
1443
1444 pat = re.compile("Your job (\d*) \(",re.MULTILINE)
1445 try:
1446 id = pat.search(output).groups()[0]
1447 except:
1448 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1449 % output
1450 self.submitted += 1
1451 self.submitted_ids.append(id)
1452 return id
1453
    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1457 cmd = 'qstat | grep '+str(id)
1458 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1459 if not status:
1460 return 'F'
1461
1462 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s")
1463 stat = ''
1464 for line in status.stdout.read().split('\n'):
1465 if not line:
1466 continue
1467 line = line.strip()
1468 try:
1469 groups = pat.search(line).groups()
1470 except:
1471 raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line
1472 if groups[0] != id: continue
1473 stat = groups[1]
1474 if not stat:
1475 return 'F'
1476 if stat in self.idle_tag:
1477 return 'I'
1478 if stat in self.running_tag:
1479 return 'R'
1480
    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to directory me_dir. Return (idle, run, finish, fail)."""
1484 if not self.submitted_ids:
1485 return 0, 0, 0, 0
1486 idle, run, fail = 0, 0, 0
1487 ongoing = []
1488 for statusflag in ['p', 'r', 'sh']:
1489 cmd = 'qstat -s %s' % statusflag
1490 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1491
1492 pat = re.compile("^(\d+)")
1493 for line in status.stdout.read().split('\n'):
1494 line = line.strip()
1495 try:
1496 id = pat.search(line).groups()[0]
1497 except Exception:
1498 pass
1499 else:
1500 if id not in self.submitted_ids:
1501 continue
1502 ongoing.append(id)
1503 if statusflag == 'p':
1504 idle += 1
1505 if statusflag == 'r':
1506 run += 1
1507 if statusflag == 'sh':
1508 fail += 1
1509 for id in list(self.submitted_ids):
1510 if id not in ongoing:
1511 self.check_termination(id)
1512
1513
1514 return idle, run, self.submitted - idle - run - fail, fail
1515
    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""
1519
1520 if not self.submitted_ids:
1521 return
1522 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1523 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1524
1526 """start a computation and not wait for it to finish.
1527 this fonction returns a lock which is locked as long as the job is
1528 running."""
1529
1530 mc = MultiCore(1)
1531 mc.submit(exe, argument, cwd, stdout, **opt)
1532 mc.need_waiting = True
1533 mc.lock.acquire()
1534 return mc.lock
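
# Usage sketch (illustrative only): the lock returned above stays acquired while
# the background job runs, so a caller can poll it to detect completion.
# The executable name './my_script.sh' is a hypothetical placeholder.
def _example_asyncrone_poll():
    lock = asyncrone_launch('./my_script.sh', cwd='/tmp', stdout='/tmp/out.log')
    while lock.locked():
        time.sleep(5)  # the job is still running
    logger.info('background job finished')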
1535
1538 """Basic class for dealing with cluster submission"""
1539
1540 name = 'slurm'
1541 job_id = 'SLURM_JOBID'
1542 idle_tag = ['Q','PD','S','CF']
1543 running_tag = ['R', 'CG']
1544 complete_tag = ['C']
1545
    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a SLURM cluster"""
1550
1551 me_dir = os.path.realpath(os.path.join(cwd,prog)).rsplit('/SubProcesses',1)[0]
1552 me_dir = misc.digest(me_dir)[-8:]
1553
1554 if not me_dir[0].isalpha():
1555 me_dir = 'a' + me_dir[1:]
1556
1557 if cwd is None:
1558 cwd = os.getcwd()
1559 if stdout is None:
1560 stdout = '/dev/null'
1561 if stderr is None:
1562 stderr = '/dev/null'
1563 elif stderr == -2:
1564 stderr = stdout
1565 if log is None:
1566 log = '/dev/null'
1567
1568 command = ['sbatch', '-o', stdout,
1569 '-J', me_dir,
1570 '-e', stderr, prog] + argument
1571
1572 if self.cluster_queue and self.cluster_queue != 'None':
1573 command.insert(1, '-p')
1574 command.insert(2, self.cluster_queue)
1575
1576 a = misc.Popen(command, stdout=subprocess.PIPE,
1577 stderr=subprocess.STDOUT,
1578 stdin=subprocess.PIPE, cwd=cwd)
1579
1580 output = a.communicate()
1581 output_arr = output[0].split(' ')
1582 id = output_arr[3].rstrip()
1583
        if not id.isdigit():
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                  % output
1587 self.submitted += 1
1588 self.submitted_ids.append(id)
1589 return id
1590
    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        cmd = 'squeue -j ' + str(id)
1595 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1596 stderr=open(os.devnull,'w'))
1597
1598 for line in status.stdout:
1599 line = line.strip()
1600 if 'Invalid' in line:
1601 return 'F'
1602 elif line.startswith(str(id)):
1603 status = line.split()[4]
1604 if status in self.idle_tag:
1605 return 'I'
1606 elif status in self.running_tag:
1607 return 'R'
1608 return 'F'
1609
    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to directory me_dir. Return (idle, run, finish, fail)."""
        cmd = "squeue"
1614 status = misc.Popen([cmd], stdout=subprocess.PIPE)
1615
1616 if me_dir.endswith('/'):
1617 me_dir = me_dir[:-1]
1618 me_dir = misc.digest(me_dir)[-8:]
1619 if not me_dir[0].isalpha():
1620 me_dir = 'a' + me_dir[1:]
1621
1622 idle, run, fail = 0, 0, 0
1623 ongoing=[]
1624 for line in status.stdout:
1625 if me_dir in line:
1626 id, _, _,_ , status,_ = line.split(None,5)
1627 ongoing.append(id)
1628 if status in self.idle_tag:
1629 idle += 1
1630 elif status in self.running_tag:
1631 run += 1
1632 elif status in self.complete_tag:
1633 status = self.check_termination(id)
1634 if status == 'wait':
1635 run += 1
1636 elif status == 'resubmit':
1637 idle += 1
1638 else:
1639 fail += 1
1640
1641
1642 for id in list(self.submitted_ids):
1643 if id not in ongoing:
1644 status = self.check_termination(id)
1645 if status == 'wait':
1646 run += 1
1647 elif status == 'resubmit':
1648 idle += 1
1649
1650
1651 return idle, run, self.submitted - (idle+run+fail), fail
1652
    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""
1656
1657 if not self.submitted_ids:
1658 return
1659 cmd = "scancel %s" % ' '.join(self.submitted_ids)
1660 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1661
1663 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """
1664
1665 name= 'htcaas'
1666 job_id = 'HTCAAS_JOBID'
1667
    @store_input()
    @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the HTCaaS job on the cluster with NO SHARE DISK.
        input/output files should be given relative to cwd.
        """
1676
1677 if 'ajob' in prog:
1678 prog_num = prog.rsplit("ajob",1)[1]
1679 else:
1680 prog_num = '0'
1681
1682 cur_usr = os.getenv('USER')
1683
1684 if cwd is None:
1685 cwd = os.getcwd()
1686
1687 cwd_cp = cwd.rsplit("/",2)
1688
1689
1690 if not stdout is None:
1691 print "stdout: %s" % stdout
1692
1693 if not os.path.exists(prog):
1694 prog = os.path.join(cwd, prog)
1695
1696 if not required_output and output_files:
1697 required_output = output_files
1698
1699
        if 'combine' not in prog and 'pythia' not in prog:
            cwd_arg = cwd + "/arguments"
            temp = ' '.join([str(a) for a in argument])
            arg_cmd = "echo '" + temp + "' > " + cwd_arg
1704
1705
1706 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)]
1707 if argument :
1708 command.extend(['-a ', '='.join([str(a) for a in argument])])
1709 print command
1710 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1711 id = a.stdout.read().strip()
1712
1713 else:
1714 cwd_arg = cwd+"/arguments"
1715 temp = ' '.join([str(a) for a in argument])
1716
1717
1718
1719
1720 temp_file_name = "sub." + os.path.basename(prog)
1721 text = """#!/bin/bash
1722 MYPWD=%(cwd)s
1723 cd $MYPWD
1724 input_files=(%(input_files)s )
1725 for i in ${input_files[@]}
1726 do
1727 chmod -f +x $i
1728 done
1729 /bin/bash %(prog)s %(arguments)s > %(stdout)s
1730 """
1731 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog,
1732 'arguments': ' '.join([str(a) for a in argument]),
1733 'program': ' ' if '.py' in prog else 'bash'}
1734
1735
1736 new_prog = pjoin(cwd, temp_file_name)
1737 open(new_prog, 'w').write(text % dico)
1738 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
1739 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name]
1740 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1741 id = a.stdout.read().strip()
1742
1743 nb_try=0
1744 nb_limit=5
1745 if not id.isdigit() :
1746 print "[ID is not digit]:" + id
1747
        while not id.isdigit():
            nb_try += 1
            print "[fail_retry]: %s" % nb_try
            a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
            id = a.stdout.read().strip()
            if nb_try > nb_limit:
                raise ClusterManagmentError, 'fail to submit to the HTCaaS cluster: \n %s' % id
1756
1757 self.submitted += 1
1758 self.submitted_ids.append(id)
1759
1760 return id
1761
    @multiple_try(nb_try=10, sleep=10)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1765
1766 if id == 0 :
1767 status_out ='C'
1768 else :
1769 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status "
1770 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE,
1771 stderr=subprocess.PIPE)
1772 error = status.stderr.read()
1773 if status.returncode or error:
1774 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error
1775 status_out= status.stdout.read().strip()
1776 status_out= status_out.split(":",1)[1]
1777 if status_out == 'waiting':
1778 status_out='I'
1779 elif status_out == 'preparing' or status_out == 'running':
1780 status_out = 'R'
1781 elif status_out != 'done':
1782 status_out = 'F'
1783 elif status_out == 'done':
1784 status_out = 'C'
1785
1786 return status_out
1787
    @multiple_try(nb_try=15, sleep=1)
    def control(self, me_dir):
        """Check the status of the submitted jobs. Return (idle, run, finish, fail)."""
1791
1792 if not self.submitted_ids:
1793 return 0, 0, 0, 0
1794
1795 ongoing = []
1796 idle, run, fail = 0, 0, 0
1797
1798 if id == 0 :
1799 return 0 , 0, 0, 0
1800 else :
1801 for i in range(len(self.submitted_ids)):
1802 ongoing.append(int(self.submitted_ids[i]))
1803 cmd = "htcaas-job-status -m " + self.submitted_ids[i] + " -s | grep Status "
1804 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1805 status_out= status.stdout.read().strip()
1806 status_out= status_out.split(":",1)[1]
1807 if status_out == 'waiting':
1808 idle += 1
1809 elif status_out == 'preparing':
1810 run += 1
1811 elif status_out == 'running':
1812 run += 1
1813 elif status_out != 'done':
1814 fail += 1
1815
1816 if status_out != 'done':
1817 print "["+ self.submitted_ids[i] + "] " + status_out
1818 '''
1819 for i in range(len(self.submitted_ids)):
1820 if int(self.submitted_ids[i]) not in ongoing:
1821 status = self.check_termination(int(self.submitted_ids[i]))
1822 if status = 'waiting':
1823 idle += 1
1824 elif status == 'resubmit':
1825 idle += 1
1826 elif status == 'failed':
1827 fail += 1
1828 '''
1829
1830 return idle, run, self.submitted - (idle+run+fail), fail
1831
    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        for i in range(len(self.submitted_ids)):
            cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i]
            status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
1841
1844 """Class for dealing with cluster submission on a HTCaaS cluster"""
1845
1846 name= 'htcaas2'
1847 job_id = 'HTCAAS2_JOBID'
1848
    @store_input()
    @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the job on the cluster with NO SHARE DISK.
        input/output files should be given relative to cwd.
        """
1857
1858 if 'ajob' in prog:
1859 prog_num = prog.rsplit("ajob",1)[1]
1860 elif 'run_combine' in prog:
1861 prog_num = '0'
1862 else:
1863 prog_num = prog
1864
1865 cur_usr = os.getenv('USER')
1866
1867 import uuid
1868 dir = str(uuid.uuid4().hex)
1869
1870 prog_dir = '_run%s'% prog_num
1871 prog_dir = dir+prog_dir
1872
1873 if cwd is None:
1874 cwd = os.getcwd()
1875
1876 cwd_cp = cwd.rsplit("/",2)
1877
1878 if stdout is None:
1879 stdout='/dev/null'
1880
1881 if not os.path.exists(prog):
1882 prog = os.path.join(cwd, prog)
1883
1884 if not required_output and output_files:
1885 required_output = output_files
1886
1887 if '/' in argument :
1888 temp_file_name = "sub." + os.path.basename(prog)
1889 else :
1890 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument)
1891
1892
1893 if 'combine' in prog or 'pythia' in prog :
1894 text = """#!/bin/bash
1895 MYPWD=%(cwd)s
1896 cd $MYPWD
1897 script=%(script)s
1898 input_files=(%(input_files)s )
1899 if [ $# -ge 1 ]; then
1900 arg1=$1
1901 else
1902 arg1=''
1903 fi
1904 args=' %(arguments)s'
1905 for i in ${input_files[@]}; do
1906 if [[ "$i" == *$script* ]]; then
1907 script=$i
1908 fi
1909 chmod -f +x $i
1910 done
1911 /bin/bash ${script} ${args} > %(stdout)s
1912 """
1913
1914 elif 'shower' in prog :
1915 text = """#!/bin/bash
1916 MYPWD=%(cwd)s
1917 cd $MYPWD
1918 args=' %(arguments)s'
1919 input_files=( %(input_files)s )
1920 for i in ${input_files[@]}
1921 do
1922 chmod -f +x $i
1923 done
1924 /bin/bash %(script)s ${args} > $MYPWD/done
1925 """
1926
1927 else :
1928 text = """#!/bin/bash
1929 MYPWD=%(cwd)s
1930 #mkdir -p $MYTMP
1931 cd $MYPWD
1932 input_files=( %(input_files)s )
1933 for i in ${input_files[@]}
1934 do
1935 if [[ $i != */*/* ]]; then
1936 i=$PWD"/"$i
1937 fi
1938 echo $i
1939 if [ -d $i ]; then
1940 htcaas-file-put -l $i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -i %(cur_usr)s
1941 else
1942 htcaas-file-put -f $i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -i %(cur_usr)s
1943 fi
1944 done
1945 """
1946
1947 dico = {'cur_usr' : cur_usr, 'script': os.path.basename(prog),
1948 'cwd': cwd, 'job_id': self.job_id, 'prog_dir': prog_dir,
1949 'input_files': ' '.join(input_files + [prog]),
1950 'output_files': ' '.join(output_files), 'stdout': stdout,
1951 'arguments': ' '.join([str(a) for a in argument]),
1952 'program': ' ' if '.py' in prog else 'bash'}
1953
1954
1955 new_prog = pjoin(cwd, temp_file_name)
1956 open(new_prog, 'w').write(text % dico)
1957 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
1958
1959
1960 cmd1='/bin/bash '+ cwd+'/'+temp_file_name
1961 status1 = misc.Popen([cmd1], shell=True, stdout=subprocess.PIPE,
1962 stderr=subprocess.PIPE)
1963
1964
1965
1966 if not 'combine' in prog and not 'shower' in prog and not 'pythia' in prog:
1967
1968 cmd3 = """htcaas-mgjob-submit -d /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -e %(script)s %(arguments)s"""
1969 dico3 = {'cur_usr' : cur_usr, 'script': os.path.basename(prog),
1970 'arguments': ' ' if not argument else "-a " + '='.join([str(a) for a in argument]) ,
1971 'prog_dir': prog_dir }
1972 status3 = misc.Popen([cmd3 % dico3], shell=True, stdout=subprocess.PIPE,
1973 stderr=subprocess.PIPE)
1974 id = status3.stdout.read().strip()
1975
1976 nb_try=0
1977 nb_limit=5
1978 while not id.isdigit() :
1979 nb_try+=1
1980 a=misc.Popen( [cmd3 % dico3], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1981 id = a.stdout.read().strip()
1982 if nb_try > nb_limit :
1983 raise ClusterManagmentError, 'Fail to submit to the HTCaaS cluster: \n %s' % id
1984 break
1985
1986 temp_file_name2 = "sub." +id
1987 text2 = """#!/bin/bash
1988 MYPWD=%(cwd)s
1989 output_files=( %(output_files)s )
1990 result=done
1991 if [ ! -e ${MYPWD}/done.%(job_id)s ]; then
1992 for i in ${output_files[@]}
1993 do
1994 htcaas-file-get -l ${MYPWD}/$i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/$i -i %(cur_usr)s
1995 chmod -Rf 777 ${MYPWD}/$i
1996 done
1997 for i in ${output_files[@]}; do
1998 if [[ -e ${MYPWD}/$i ]]; then
1999 result=done
2000 else
2001 result=running
2002 echo $result
2003 exit 0
2004 fi
2005 done
2006 echo $result
2007 touch ${MYPWD}/done.%(job_id)s
2008 else
2009 for i in ${output_files[@]}; do
2010 if [ -e ${MYPWD}/$i ]; then
2011 result=done
2012 else
2013 rm -f ${MYPWD}/done.%(job_id)s
2014 result=running
2015 echo $result
2016 exit 0
2017 fi
2018 done
2019 echo $result
2020
2021 fi
2022
2023 """
2024 dico2 = {'cur_usr' : cur_usr, 'script': os.path.basename(prog),
2025 'cwd': cwd, 'job_id': self.job_id, 'prog_dir': prog_dir,
2026 'output_files': ' '.join(output_files), 'job_id': id,
2027 'program': ' ' if '.py' in prog else 'bash'}
2028
2029 homePath = os.getenv("HOME")
2030 outPath = homePath +"/MG5"
2031
2032 new_prog2 = pjoin(outPath, temp_file_name2)
2033 open(new_prog2, 'w').write(text2 % dico2)
2034 misc.Popen(['chmod','+x',new_prog2],cwd=cwd)
2035
2036
2037 self.submitted += 1
2038 self.submitted_ids.append(id)
2039
2040 elif 'combine' in prog or 'shower' in prog or 'pythia' in prog:
2041 if '/dev/null' in stdout :
2042 stdout=''
2043
2044 temp_file_shower = "sub.out"
2045 text_shower = """#!/bin/bash
2046 MYPWD=%(cwd)s
2047 result=done
2048 output_files=(%(output_files)s)
2049 for i in ${output_files[@]}; do
2050 if [ -e $MYPWD/$i -o -e $i ]; then
2051 result=done
2052 else
2053 result=running
2054 echo $result
2055 exit 0
2056 fi
2057 done
2058 echo $result
2059 """
2060 dico_shower = { 'cwd': cwd, 'output_files': ' '.join([stdout]+output_files),
2061 'program': ' ' if '.py' in prog else 'bash'}
2062 homePath = os.getenv("HOME")
2063 outPath = homePath +"/MG5"
2064 new_prog_shower = pjoin(outPath, temp_file_shower)
2065 open(new_prog_shower, 'w').write(text_shower % dico_shower)
2066 misc.Popen(['chmod','+x',new_prog_shower],cwd=cwd)
2067
2068 id='-1'
2069 self.submitted += 1
2070 self.submitted_ids.append(id)
2071
2072 else :
2073 id='-2'
2074 self.submitted += 1
2075 self.submitted_ids.append(id)
2076
2077 return id
2078
    @multiple_try(nb_try=10, sleep=10)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
2082
2083 homePath = os.getenv("HOME")
2084 outPath = homePath +"/MG5"
2085
2086
2087 if id == '0' or id=='-2' :
2088 status_out ='done'
2089 elif id == '-1' :
2090 cmd='/bin/bash ' +outPath+'/sub.out'
2091 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE)
2092 status_out=status.stdout.read().strip()
2093 print "["+id+"]" + status_out
2094 if status_out == 'waiting':
2095 status_out='wait'
2096 elif status_out == 'preparing' or status_out == 'running':
2097 status_out = 'R'
2098 elif status_out != 'done':
2099 status_out = 'F'
2100 elif status_out == 'done':
2101 status_out = 'C'
2102
2103 print "["+id+"]" + status_out
2104 else :
2105 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status "
2106 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
2107 stderr=subprocess.PIPE)
2108 error = status.stderr.read()
2109 if status.returncode or error:
2110 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error
2111 status_out= status.stdout.read().strip()
2112 status_out= status_out.split(":",1)[1]
2113 print "["+id+"]" + status_out
2114 if status_out == 'waiting':
2115 status_out='wait'
2116 elif status_out == 'preparing' or status_out == 'running':
2117 status_out = 'R'
2118 elif status_out == 'failed' :
2119 args = self.retry_args[id]
2120 id_temp = self.submit2(**args)
2121 del self.retry_args[id]
2122 self.submitted_ids.remove(id)
2123 status_out = 'I'
2124 elif status_out != 'done':
2125 status_out = 'F'
2126 elif status_out == 'done':
2127 status_out = 'C'
2128
2129 return status_out
2130
2131
    @check_interupt()
    @multiple_try(nb_try=15, sleep=10)
    def control(self, me_dir):
        """Check the status of the submitted jobs. Return (idle, run, finish, fail)."""
2136
2137 if not self.submitted_ids:
2138 return 0, 0, 0, 0
2139
2140 ongoing = []
2141 idle, run, fail = 0, 0, 0
2142
2143 homePath = os.getenv("HOME")
2144 outPath = homePath +"/MG5"
2145
2146 for i in range(len(self.submitted_ids)):
2147 ongoing.append(self.submitted_ids[i])
2148 if self.submitted_ids[i] == '-2' :
2149 return 0,0,0,0
2150 if self.submitted_ids[i] == '0' :
2151
2152 status_out='done'
2153 elif self.submitted_ids[i] == '-1' :
2154 cmd='/bin/bash ' +outPath+'/sub.out'
2155 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE)
2156 status_out=status.stdout.read().strip()
2157 if status_out == 'waiting':
2158 idle += 1
2159 elif status_out == 'preparing':
2160 run += 1
2161 elif status_out == 'running':
2162 run += 1
2163 elif status_out != 'done':
2164 fail += 1
2165 else :
2166 args = self.retry_args[str(self.submitted_ids[i])]
2167 if 'required_output'in args and not args['required_output']:
2168 args['required_output'] = args['output_files']
2169 self.retry_args[str(self.submitted_ids[i])] = args
2170
2171 cmd = "htcaas-job-status -m " + self.submitted_ids[i] + " -s | grep Status "
2172 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2173 status_out= status.stdout.read().strip()
2174 status_out= status_out.split(":",1)[1]
2175 if status_out == 'waiting':
2176 idle += 1
2177 elif status_out == 'preparing':
2178 run += 1
2179 elif status_out == 'running':
2180 run += 1
2181 elif status_out == 'failed' or status_out == 'canceled':
2182 id = self.submit2(**args)
2183
2184 del self.retry_args[self.submitted_ids[i]]
2185 self.submitted_ids.remove(self.submitted_ids[i])
2186 self.submitted-=1
2187 idle += 1
2188 elif status_out != 'done':
2189 fail += 1
2190 if status_out == 'done':
2191 cmd2='/bin/bash '+ outPath+'/sub.'+self.submitted_ids[i]
2192 status2 = misc.Popen([cmd2], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE)
2193 aa= status2.stdout.read().strip()
2194
2195
2196
2197
2198
2199
2200
2201 for path in args['required_output']:
2202 if args['cwd']:
2203 path = pjoin(args['cwd'], path)
2204
2207 if not (os.path.exists(path) and os.stat(path).st_size != 0) :
2208 status2 = misc.Popen([cmd2], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE)
2209 aa= status2.stdout.read().strip()
2210 if aa == 'done':
2211 self.submitted_ids[i] = '0'
2212 elif aa == 'running':
2213 run += 1
2214 else :
2215 self.submitted_ids[i]='0'
2216
2217
2218 for i in range(len(self.submitted_ids)):
2219 if str(self.submitted_ids[i]) not in ongoing:
2220 status2= self.check_termination(str(self.submitted_ids[i]))
2221 if status2 == 'wait':
2222 run += 1
2223 elif status2 == 'resubmit':
2224 idle += 1
2225
2226 return idle, run, self.submitted - (idle+run+fail), fail
2227
    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        for i in range(len(self.submitted_ids)):
            cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i]
            status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
2237
2238
from_name = {'condor': CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
             'lsf': LSFCluster, 'ge': GECluster, 'slurm': SLURMCluster,
             'htcaas': HTCaaSCluster, 'htcaas2': HTCaaS2Cluster}
2242
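
# Minimal usage sketch (illustrative only, not part of the original module):
# pick a backend from the from_name map, submit a script and wait for it.
# The queue settings, script path and run directory below are hypothetical.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    def print_status(idle, run, finish):
        # callback of the form expected by Cluster.wait()
        logger.info('idle: %s  running: %s  finished: %s' % (idle, run, finish))

    cluster = from_name['condor'](cluster_queue=None,
                                  cluster_nb_retry=1,
                                  cluster_retry_wait=300,
                                  cluster_status_update=(600, 30))
    cluster.submit('./ajob1', cwd='/tmp/run_dir')
    cluster.wait('/tmp/run_dir', print_status)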