import subprocess
import logging
import os
import time
import re
import glob
import inspect

logger = logging.getLogger('madgraph.cluster')

try:
    from madgraph import MadGraph5Error
    import madgraph.various.misc as misc
except Exception, error:
    if __debug__:
        print str(error)
    from internal import MadGraph5Error
    import internal.misc as misc

pjoin = os.path.join

class ClusterManagmentError(MadGraph5Error):
    pass

class NotImplemented(MadGraph5Error):
    pass


multiple_try = misc.multiple_try

def check_interupt(error=KeyboardInterrupt):
    """Decorator: if the wrapped call is interrupted (KeyboardInterrupt by
    default), try to remove the submitted jobs before re-raising."""
    def deco_interupt(f):
        def deco_f_interupt(self, *args, **opt):
            try:
                return f(self, *args, **opt)
            except error:
                try:
                    self.remove(*args, **opt)
                except Exception:
                    pass
                raise error
        return deco_f_interupt
    return deco_interupt

def store_input(arg=''):
    """Decorator: record the arguments of a submission call in
    self.retry_args, keyed by the returned job id, so that the job can be
    resubmitted later."""
    def deco_store(f):
        def deco_f_store(self, prog, argument=[], cwd=None, stdout=None,
                         stderr=None, log=None, input_files=[], output_files=[],
                         required_output=[], nb_submit=0):
            # collect the named arguments of this call via introspection
            frame = inspect.currentframe()
            args, _, _, values = inspect.getargvalues(frame)
            args = dict([(i, values[i]) for i in args if i != 'self'])
            id = f(self, **args)
            if id:
                self.retry_args[id] = args
            return id
        return deco_f_store
    return deco_store
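
# Illustration (not part of MadGraph itself): how the two decorators above are
# meant to be used by the back-ends below. `DemoCluster` and the fake id
# '12345' are hypothetical stand-ins for a real submission class.
def _demo_store_input():
    class DemoCluster(Cluster):
        name = 'demo'
        @store_input()
        def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                    log=None, input_files=[], output_files=[], required_output=[],
                    nb_submit=0):
            # a real back-end would call condor_submit/qsub/... here
            return '12345'
    demo = DemoCluster()
    demo.submit2('ajob1', argument=['1'])
    # the decorator has recorded everything needed for a resubmission
    assert demo.retry_args['12345']['prog'] == 'ajob1'
    return demo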


class Cluster(object):
    """Basic Class for all cluster type submission"""
    name = 'mother class'

    def __init__(self, *args, **opts):
        """Init the cluster"""

        self.submitted = 0
        self.submitted_ids = []
        self.finish = 0
        if 'cluster_queue' in opts:
            self.cluster_queue = opts['cluster_queue']
        else:
            self.cluster_queue = 'madgraph'
        if 'cluster_temp_path' in opts:
            self.temp_dir = opts['cluster_temp_path']
        else:
            self.temp_dir = None
        self.options = {'cluster_status_update': (600, 30)}
        for key, value in opts.items():
            self.options[key] = value
        self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0
        self.cluster_retry_wait = opts['cluster_retry_wait'] if 'cluster_retry_wait' in opts else 300
        # self.options now holds both the defaults and the user options;
        # overwriting it with a plain dict(opts) would lose the
        # 'cluster_status_update' default that wait() relies on.
        self.retry_args = {}

    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        """How to make one submission. Return status id on the cluster."""
        raise NotImplemented, 'No implementation of how to submit a job to cluster \'%s\'' % self.name

    @store_input()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """How to make one submission. Return status id on the cluster.
        NO SHARE DISK"""

        if cwd is None:
            cwd = os.getcwd()
        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)

        if not required_output and output_files:
            required_output = output_files

        # without a cluster temp directory (or without files to move around)
        # a plain submission is enough
        if not hasattr(self, 'temp_dir') or not self.temp_dir or \
                (input_files == [] == output_files):
            return self.submit(prog, argument, cwd, stdout, stderr, log,
                               required_output=required_output, nb_submit=nb_submit)

        temp_file_name = "sub." + os.path.basename(prog) + \
                         '.'.join([str(a) for a in argument])

        text = """#!/bin/bash
MYTMP=%(tmpdir)s/run$%(job_id)s
MYPWD=%(cwd)s
mkdir -p $MYTMP
cd $MYPWD
input_files=( %(input_files)s )
for i in ${input_files[@]}
do
cp -R -L $i $MYTMP
done
cd $MYTMP
echo '%(arguments)s' > arguments
chmod +x ./%(script)s
%(program)s ./%(script)s %(arguments)s
output_files=( %(output_files)s )
for i in ${output_files[@]}
do
cp -r $MYTMP/$i $MYPWD
done
rm -rf $MYTMP
"""
        dico = {'tmpdir': self.temp_dir, 'script': os.path.basename(prog),
                'cwd': cwd, 'job_id': self.job_id,
                'input_files': ' '.join(input_files + [prog]),
                'output_files': ' '.join(output_files),
                'arguments': ' '.join([str(a) for a in argument]),
                'program': ' ' if '.py' in prog else 'bash'}

        # write the sandbox wrapper and make it executable
        new_prog = pjoin(cwd, temp_file_name)
        open(new_prog, 'w').write(text % dico)
        misc.Popen(['chmod', '+x', new_prog], cwd=cwd)

        return self.submit(new_prog, argument, cwd, stdout, stderr, log,
                           required_output=required_output, nb_submit=nb_submit)

    def control(self, me_dir=None):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
        if not self.submitted_ids:
            raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
        idle, run, fail = 0, 0, 0
        for pid in self.submitted_ids[:]:
            status = self.control_one_job(pid)
            if status == 'I':
                idle += 1
            elif status == 'R':
                run += 1
            elif status == 'F':
                self.finish += 1
                self.submitted_ids.remove(pid)
            else:
                fail += 1

        return idle, run, self.finish, fail

    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name

    @check_interupt()
    def wait(self, me_dir, fct, minimal_job=0):
        """Wait for all jobs to finish.
        If minimal_job is set, return as soon as idle + run drops below that
        number."""

        mode = 1  # 1 = short update interval, 0 = long one
        nb_iter = 0
        nb_short = 0
        change_at = 5  # number of iterations before considering the long mode

        longtime, shorttime = self.options['cluster_status_update']

        while 1:
            old_mode = mode
            nb_iter += 1
            idle, run, finish, fail = self.control(me_dir)
            if fail:
                raise ClusterManagmentError('Some jobs are in a Hold/... state. Please try to investigate or contact the IT team')
            if idle + run == 0:
                logger.info('All jobs finished')
                break
            if idle + run < minimal_job:
                return
            fct(idle, run, finish)

            # decide whether to poll at the short or the long interval
            if nb_iter < change_at:
                mode = 1
            elif idle < run:
                if old_mode == 0:
                    if nb_short:
                        mode = 0
                elif idle:
                    if nb_iter > change_at + int(longtime) // shorttime:
                        mode = 0
                    else:
                        mode = 1
                        nb_short = 0
                else:
                    mode = 1
                    nb_short = 0
            elif old_mode == 1:
                nb_short += 1
                if nb_short > 3 * max(change_at, int(longtime) // shorttime):
                    mode = 0
            else:
                mode = 0

            if old_mode > mode:
                logger.info('''Start to wait %ss between checking status.
Note that you can change this time in the configuration file.
Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0])

            if mode == 0:
                try:
                    time.sleep(self.options['cluster_status_update'][0])
                except KeyboardInterrupt:
                    logger.info('start to update the status')
                    nb_iter = min(0, change_at - 2)
                    nb_short = 0
            else:
                time.sleep(self.options['cluster_status_update'][1])

        self.submitted = 0
        self.submitted_ids = []

    def check_termination(self, job_id):
        """Check the termination of the job with job_id and relaunch it if needed."""

        if job_id not in self.retry_args:
            return True

        args = self.retry_args[job_id]
        if 'time_check' in args:
            time_check = args['time_check']
        else:
            time_check = 0

        for path in args['required_output']:
            if args['cwd']:
                path = pjoin(args['cwd'], path)
            # a missing or empty output file means the job is not done yet
            if not (os.path.exists(path) and os.stat(path).st_size != 0):
                break
        else:
            # all required output files are present and non-empty
            if time_check > 0:
                logger.info('Job %s finally found the missing output.' % (job_id))
            del self.retry_args[job_id]
            self.submitted_ids.remove(job_id)
            return 'done'

        if time_check == 0:
            logger.debug('''Job %s: missing output: %s''' % (job_id, path))
            args['time_check'] = time.time()
            return 'wait'
        elif self.cluster_retry_wait > time.time() - time_check:
            return 'wait'

        if self.nb_retry < 0:
            logger.critical('''Failed to run job %s correctly.
            with option: %s
            file missing: %s''' % (job_id, args, path))
            raw_input('press enter to continue.')
        elif self.nb_retry == 0:
            logger.critical('''Failed to run job %s correctly.
            with option: %s
            file missing: %s.
            Stopping all runs.''' % (job_id, args, path))
        elif args['nb_submit'] >= self.nb_retry:
            logger.critical('''Failed to run job %s correctly.
            with option: %s
            file missing: %s
            Fails %s times.
            No resubmission.''' % (job_id, args, path, args['nb_submit']))
        else:
            args['nb_submit'] += 1
            logger.warning('resubmitting job (attempt %s)' % args['nb_submit'])
            del self.retry_args[job_id]
            self.submitted_ids.remove(job_id)
            if 'time_check' in args:
                del args['time_check']
            self.submit2(**args)
            return 'resubmit'

        return 'done'


    @check_interupt()
    def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
                        stderr=None, log=None, required_output=[], nb_submit=0,
                        input_files=[], output_files=[]):
        """Launch one job on the cluster and wait for it."""

        special_output = False  # tag for concatenating stderr to stdout
        if stderr == -2 and stdout:
            special_output = True
            stderr = stdout + '.err'

        id = self.submit2(prog, argument, cwd, stdout, stderr, log,
                          required_output=required_output, input_files=input_files,
                          output_files=output_files)

        # store the arguments of this call so that the job can be resubmitted
        frame = inspect.currentframe()
        args, _, _, values = inspect.getargvalues(frame)
        args = dict([(i, values[i]) for i in args if i != 'self'])
        self.retry_args[id] = args

        nb_wait = 0
        while 1:
            nb_wait += 1
            status = self.control_one_job(id)
            if status not in ['R', 'I']:
                status = self.check_termination(id)
                if status in ['wait']:
                    time.sleep(30)
                    continue
                elif status in ['resubmit']:
                    id = self.submitted_ids[0]
                    time.sleep(30)
                    continue
                # the job left the queue: leave it some time to flush output
                time.sleep(30)
                break
            time.sleep(self.options['cluster_status_update'][1])

        if required_output:
            status = self.check_termination(id)
            while status == 'wait':
                # the job has left the queue but its output is not there yet
                time.sleep(30)
                status = self.check_termination(id)
            if status == 'resubmit':
                logger.warning('Job %s has been resubmitted: its output is not available yet.' % id)

        if special_output:
            # combine the stdout and the stderr: wait up to 50 s for the
            # stderr file to appear
            for i in range(5):
                if os.path.exists(stdout):
                    if not os.path.exists(stderr):
                        time.sleep(5)
                    if os.path.exists(stderr):
                        err_text = open(stderr).read()
                        if not err_text:
                            return
                        logger.warning(err_text)
                        text = open(stdout).read()
                        open(stdout, 'w').write(text + err_text)
                    else:
                        return
                time.sleep(10)

    def remove(self, *args, **opts):
        """Default implementation: job removal is not supported."""
        logger.warning("""This cluster does not support job removal,
        the jobs are still running on the cluster.""")

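
# Illustration (not part of MadGraph itself): the `fct` callback handed to
# Cluster.wait() simply receives the current (idle, run, finish) counters.
# `my_cluster` and `me_dir` are placeholders for a real back-end instance and
# a real process directory.
def _demo_wait_callback(my_cluster, me_dir):
    def report(idle, run, finish):
        logger.info('status: %s idle, %s running, %s finished' % (idle, run, finish))
    # polls control(me_dir) until no job is idle or running
    my_cluster.wait(me_dir, report)
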

class MultiCore(Cluster):
    """Class for dealing with the submission on a multicore machine."""

    job_id = '$'

    def __init__(self, *args, **opt):
        """Init the cluster"""
        import thread
        super(MultiCore, self).__init__(*args, **opt)

        self.submitted = 0
        self.finish = 0
        if 'nb_core' in opt:
            self.nb_core = opt['nb_core']
        elif args and isinstance(args[0], int):
            self.nb_core = args[0]
        else:
            self.nb_core = 1
        self.update_fct = None

        self.need_waiting = False
        self.nb_used = 0
        self.lock = thread.allocate_lock()  # used to signal the end of a job
        self.done = 0
        self.waiting_submission = []  # jobs waiting for a free core
        self.pids = []  # pids of the running jobs
        self.fail_msg = None

    def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
                        stderr=None, log=None, **opts):
        """Launch one job and wait for it."""
        if isinstance(stdout, str):
            stdout = open(stdout, 'w')
        if isinstance(stderr, str):
            stderr = open(stderr, 'w')
        return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)


    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        """Submit a job on a multicore machine."""

        self.submitted += 1
        if cwd is None:
            cwd = os.getcwd()
        if isinstance(prog, str):
            if not os.path.exists(prog) and not misc.which(prog):
                prog = os.path.join(cwd, prog)

        import thread
        if self.waiting_submission or self.nb_used == self.nb_core:
            self.waiting_submission.append((prog, argument, cwd, stdout))
            # check whether some running submission is already finished
            while self.nb_used < self.nb_core and self.waiting_submission:
                arg = self.waiting_submission.pop(0)
                self.nb_used += 1  # update the number of running threads
                thread.start_new_thread(self.launch, arg)
        elif self.nb_used < self.nb_core:
            self.nb_used += 1  # update the number of running threads
            thread.start_new_thread(self.launch, (prog, argument, cwd, stdout))


    def launch(self, exe, argument, cwd, stdout):
        """Way to launch for multicore. If exe is a string then treat it as
        an executable. Otherwise treat it as a function."""
        import thread

        def end(self, pid):
            self.nb_used -= 1
            self.done += 1
            try:
                self.pids.remove(pid)
            except ValueError:
                pass

        fail_msg = None
        try:
            if isinstance(exe, str):
                if os.path.exists(exe) and not exe.startswith('/'):
                    exe = './' + exe
                proc = misc.Popen([exe] + argument, cwd=cwd, stdout=stdout,
                                  stderr=subprocess.STDOUT)
                pid = proc.pid
                self.pids.append(pid)
                proc.wait()
                # 143 and -15 correspond to SIGTERM (sent by remove())
                if proc.returncode not in [0, 143, -15]:
                    fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \
                               (' '.join([exe] + argument), proc.returncode)
                    logger.warning(fail_msg)
                    try:
                        log = open(glob.glob(pjoin(cwd, '*', 'log.txt'))[0]).read()
                        logger.warning('Last 15 lines of logfile %s:\n%s\n' % \
                                       (pjoin(cwd, '*', 'log.txt'), '\n'.join(log.split('\n')[-15:-1]) + '\n'))
                    except (IOError, AttributeError, IndexError):
                        logger.warning('Please look for possible logfiles in %s' % cwd)
                    self.remove(fail_msg)
            else:
                pid = tuple([id(o) for o in [exe] + argument])
                self.pids.append(pid)
                # the function should return 0 if everything is fine
                returncode = exe(argument)
                if returncode != 0:
                    logger.warning(returncode)
                    self.remove()

            # release the lock so that wait() can proceed with the next job
            security = 0
            while 1:
                while not self.lock.locked():
                    if not self.need_waiting:
                        # nobody is waiting on the lock: no need to release it
                        end(self, pid)
                        return
                    elif security > 60:
                        end(self, pid)
                        return
                    security += 1
                    time.sleep(1)
                try:
                    self.lock.release()
                except thread.error:
                    continue
                break
            end(self, pid)

        except Exception, error:
            self.remove()
            raise

    def wait(self, me_dir, update_status):
        """Wait for all threads to finish.
        self.nb_used and self.done are updated by each job (thread and local).
        self.submitted is the number of times submit has been called (local).
        remaining is the number of jobs that we still have to wait for (local).
        self.pids is the list of the BASH pids of the submitted jobs (thread).

        WARNING: In principle all those values are coherent, but since some
        are modified by various threads, the data can be corrupted (not the
        local ones). nb_used in particular shouldn't be trusted too much.
        This code checks in different ways that all jobs have finished.

        In principle, the statements related to '#security #X' are not used.
        In practice they are, from time to time.
        """

        import thread

        remaining = self.submitted - self.done

        # fill all the available cores from the waiting list
        while self.nb_used < self.nb_core:
            if self.waiting_submission:
                arg = self.waiting_submission.pop(0)
                thread.start_new_thread(self.launch, arg)
                self.nb_used += 1
            else:
                break

        try:
            self.need_waiting = True
            self.lock.acquire()
            no_in_queue = 0
            secure_mode = False  # when True, stop trusting the nb_used counter
            while self.waiting_submission or self.nb_used:
                if self.fail_msg:
                    msg, self.fail_msg = self.fail_msg, None
                    self.remove()
                    raise Exception, msg
                if update_status:
                    update_status(len(self.waiting_submission), self.nb_used, self.done)

                #security #1: check whether all jobs are finished
                if len(self.waiting_submission) == 0 == remaining:
                    self.done = self.submitted
                    break

                #security #2: nothing in the queue and no remaining pid
                if len(self.waiting_submission) == 0 and len(self.pids) == 0:
                    if self.submitted == self.done:
                        break
                    logger.debug('Found too many jobs. Recovering')
                    no_in_queue += 1
                    time.sleep(min(180, 5 * no_in_queue))
                    if no_in_queue > 3:
                        logger.debug('Still too many jobs. Continue')
                        break
                    continue

                #security #3: the accounting looks inconsistent
                if not secure_mode and len(self.waiting_submission) != 0:
                    if self.nb_used != self.nb_core:
                        if self.nb_used != len(self.pids):
                            secure_mode = True

                if secure_mode and not self.waiting_submission:
                    self.need_waiting = False
                    if self.lock.locked():
                        self.lock.release()
                    break

                # wait for a thread to release the lock (i.e. for a job to end)
                self.lock.acquire()
                remaining -= 1

                if self.waiting_submission:
                    arg = self.waiting_submission.pop(0)
                    thread.start_new_thread(self.launch, arg)
                    self.nb_used += 1

            if self.fail_msg:
                msg, self.fail_msg = self.fail_msg, None
                self.remove()
                raise Exception, msg

            #security #4: jobs still marked as not done
            no_in_queue = 0
            while self.submitted > self.done:
                if self.fail_msg:
                    msg, self.fail_msg = self.fail_msg, None
                    self.remove()
                    raise Exception, msg
                if no_in_queue == 0:
                    logger.debug('Some jobs have been lost. Try to recover')
                if not len(self.pids):
                    # no remaining pid: the jobs are gone for good
                    logger.critical('Some jobs have been lost in the multicore treatment.')
                    logger.critical('The results might be incomplete. (Trying to continue anyway)')
                    break
                elif update_status:
                    update_status(len(self.waiting_submission), len(self.pids),
                                  self.done)

                if not secure_mode:
                    self.lock.acquire()
                else:
                    no_in_queue += 1
                    try:
                        time.sleep(min(180, 5 * no_in_queue))
                        if no_in_queue > 5 * 3600.0 / 162:
                            break
                    except KeyboardInterrupt:
                        logger.warning('CTRL-C assumes that all jobs are done. Continue the code')
                        self.pids = []
                        break

            #security #5: some pids are still registered
            no_in_queue = 0
            while len(self.pids):
                if self.fail_msg:
                    msg, self.fail_msg = self.fail_msg, None
                    self.remove()
                    raise Exception, msg
                self.need_waiting = False
                if self.lock.locked():
                    self.lock.release()
                secure_mode = True
                if no_in_queue == 0:
                    logger.warning('Some jobs have been lost. Try to recover.')
                    logger.warning('Hitting ctrl-c will consider that all jobs are done and continue the code.')
                try:
                    if update_status:
                        update_status(len(self.waiting_submission), len(self.pids),
                                      self.done)
                    time.sleep(min(5 * no_in_queue, 180))
                    no_in_queue += 1
                    if no_in_queue > 5 * 3600.0 / 162:
                        break
                except KeyboardInterrupt:
                    break

            if update_status:
                self.next_update = 0
                update_status(len(self.waiting_submission), 0, self.done)

            # reset the lock and the counters to their initial state
            self.need_waiting = False
            security = 0
            while not self.lock.locked() and security < 10:
                # ensure that the lock is back in the acquired state
                if secure_mode:
                    security = 10
                security += 1
                time.sleep(1)
            if security < 10:
                self.lock.release()
            self.done = 0
            self.nb_used = 0
            self.submitted = 0
            self.pids = []

        except KeyboardInterrupt:
            self.remove()
            raise
        if self.fail_msg:
            msg, self.fail_msg = self.fail_msg, None
            self.remove()
            raise Exception, msg


    def remove(self, error=None):
        """Ensure that all threads are killed."""
        logger.info('remove job currently running')
        self.waiting_submission = []
        if error:
            self.fail_msg = error
        # kill the children of each process first, then the process itself
        for pid in list(self.pids):
            if isinstance(pid, tuple):
                continue
            out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \
                            % {'pid': pid})
            out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid': pid})
            if out == 0:
                try:
                    self.pids.remove(pid)
                except ValueError:
                    pass

        time.sleep(1)  # leave time for jobs submitted in the meantime
        for pid in list(self.pids):
            if isinstance(pid, tuple):
                continue
            out = os.system('CPIDS=$(pgrep -P %s); kill -15 $CPIDS > /dev/null 2>&1' % pid)
            out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid': pid})
            if out == 0:
                try:
                    self.pids.remove(pid)
                except ValueError:
                    pass

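
# Illustration (not part of MadGraph itself): running two local commands on
# two cores and waiting for both. The shell commands are arbitrary examples.
def _demo_multicore():
    cluster = MultiCore(nb_core=2)
    cluster.submit('/bin/sleep', ['1'])
    cluster.submit('/bin/sleep', ['2'])
    def report(nb_waiting, nb_running, nb_done):
        logger.info('%s waiting, %s running, %s done' % (nb_waiting, nb_running, nb_done))
    cluster.wait(os.getcwd(), report)
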

class CondorCluster(Cluster):
    """Basic class for dealing with cluster submission"""

    name = 'condor'
    job_id = 'CONDOR_ID'

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a Condor cluster"""

        text = """Executable = %(prog)s
output = %(stdout)s
error = %(stderr)s
log = %(log)s
%(argument)s
environment = CONDOR_ID=$(Cluster).$(Process)
Universe = vanilla
notification = Error
Initialdir = %(cwd)s
%(requirement)s
getenv=True
queue 1
"""

        if self.cluster_queue not in ['None', None]:
            requirement = 'Requirements = %s=?=True' % self.cluster_queue
        else:
            requirement = ''

        if cwd is None:
            cwd = os.getcwd()
        if stdout is None:
            stdout = '/dev/null'
        if stderr is None:
            stderr = '/dev/null'
        if log is None:
            log = '/dev/null'
        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)
        if argument:
            argument = 'Arguments = %s' % ' '.join(argument)
        else:
            argument = ''

        dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
                'stderr': stderr, 'log': log, 'argument': argument,
                'requirement': requirement}

        # write the submission file and hand it over to condor_submit
        open('submit_condor', 'w').write(text % dico)
        a = misc.Popen(['condor_submit', 'submit_condor'], stdout=subprocess.PIPE)
        output = a.stdout.read()
        # target line: "1 job(s) submitted to cluster 2045."
        pat = re.compile(r"submitted to cluster (\d*)", re.MULTILINE)
        try:
            id = pat.search(output).groups()[0]
        except Exception:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                  % output
        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @store_input()
    @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the job on the cluster. NO SHARE DISK:
        input/output files should be given relative to cwd.
        """

        if not required_output and output_files:
            required_output = output_files

        if (input_files == [] == output_files):
            return self.submit(prog, argument, cwd, stdout, stderr, log,
                               required_output=required_output, nb_submit=nb_submit)

        text = """Executable = %(prog)s
output = %(stdout)s
error = %(stderr)s
log = %(log)s
%(argument)s
should_transfer_files = YES
when_to_transfer_output = ON_EXIT
transfer_input_files = %(input_files)s
%(output_files)s
Universe = vanilla
notification = Error
Initialdir = %(cwd)s
%(requirement)s
getenv=True
queue 1
"""

        if self.cluster_queue not in ['None', None]:
            requirement = 'Requirements = %s=?=True' % self.cluster_queue
        else:
            requirement = ''

        if cwd is None:
            cwd = os.getcwd()
        if stdout is None:
            stdout = '/dev/null'
        if stderr is None:
            stderr = '/dev/null'
        if log is None:
            log = '/dev/null'
        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)
        if argument:
            argument = 'Arguments = %s' % ' '.join([str(a) for a in argument])
        else:
            argument = ''

        if input_files:
            input_files = ','.join(input_files)
        else:
            input_files = ''
        if output_files:
            output_files = 'transfer_output_files = %s' % ','.join(output_files)
        else:
            output_files = ''

        dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
                'stderr': stderr, 'log': log, 'argument': argument,
                'requirement': requirement, 'input_files': input_files,
                'output_files': output_files}

        open('submit_condor', 'w').write(text % dico)
        a = subprocess.Popen(['condor_submit', 'submit_condor'], stdout=subprocess.PIPE)
        output = a.stdout.read()
        # target line: "1 job(s) submitted to cluster 2045."
        pat = re.compile(r"submitted to cluster (\d*)", re.MULTILINE)
        try:
            id = pat.search(output).groups()[0]
        except Exception:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                  % output
        self.submitted += 1
        self.submitted_ids.append(id)
        return id


    @multiple_try(nb_try=10, sleep=10)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)

        error = status.stderr.read()
        if status.returncode or error:
            raise ClusterManagmentError, 'condor_q returns error: %s' % error

        return status.stdout.readline().strip()

    @check_interupt()
    @multiple_try(nb_try=10, sleep=10)
    def control(self, me_dir=None):
        """Check the status of the submitted jobs.
        Return (idle, run, finish, fail)."""

        if not self.submitted_ids:
            return 0, 0, 0, 0

        packet = 15000
        idle, run, fail = 0, 0, 0
        ongoing = []
        for i in range(1 + (len(self.submitted_ids) - 1) // packet):
            start = i * packet
            stop = (i + 1) * packet
            cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \
                  " -format \'%-2s \' \'ClusterId\' " + \
                  " -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"

            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
            error = status.stderr.read()
            if status.returncode or error:
                raise ClusterManagmentError, 'condor_q returns error: %s' % error

            for line in status.stdout:
                id, status = line.strip().split()
                ongoing.append(int(id))
                if status in ['I', 'U']:
                    idle += 1
                elif status == 'R':
                    run += 1
                elif status != 'C':
                    fail += 1

        for id in list(self.submitted_ids):
            if int(id) not in ongoing:
                status = self.check_termination(id)
                if status == 'wait':
                    run += 1
                elif status == 'resubmit':
                    idle += 1

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster."""

        if not self.submitted_ids:
            return
        cmd = "condor_rm %s" % ' '.join(self.submitted_ids)

        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))


class PBSCluster(Cluster):
    """Basic class for dealing with cluster submission"""

    name = 'pbs'
    job_id = 'PBS_JOBID'
    idle_tag = ['Q']
    running_tag = ['T', 'E', 'R']
    complete_tag = ['C']

    maximum_submited_jobs = 2500

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a PBS cluster"""

        # build an alphanumeric job name out of the process directory
        me_dir = os.path.realpath(os.path.join(cwd, prog)).rsplit('/SubProcesses', 1)[0]
        me_dir = misc.digest(me_dir)[-14:]
        if not me_dir[0].isalpha():
            me_dir = 'a' + me_dir[1:]

        if len(self.submitted_ids) > self.maximum_submited_jobs:
            fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish))
            # wait() needs the raw directory (control() digests it itself)
            raw_dir = os.path.realpath(os.path.join(cwd, prog)).rsplit('/SubProcesses', 1)[0]
            self.wait(raw_dir, fct, self.maximum_submited_jobs)

        text = ""
        if cwd is None:
            cwd = os.getcwd()
        else:
            text = " cd %s;" % cwd
        if stdout is None:
            stdout = '/dev/null'
        if stderr is None:
            stderr = '/dev/null'
        elif stderr == -2:  # -2 is the subprocess.STDOUT convention
            stderr = stdout
        if log is None:
            log = '/dev/null'

        if not os.path.isabs(prog):
            text += "./%s" % prog
        else:
            text += prog

        if argument:
            text += ' ' + ' '.join(argument)

        command = ['qsub', '-o', stdout,
                   '-N', me_dir,
                   '-e', stderr,
                   '-V']

        if self.cluster_queue and self.cluster_queue != 'None':
            command.extend(['-q', self.cluster_queue])

        a = misc.Popen(command, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate(text)[0]
        id = output.split('.')[0]
        if not id.isdigit() or a.returncode != 0:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                  % output

        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        cmd = 'qstat ' + str(id)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)

        jobstatus = ""
        for line in status.stdout:
            line = line.strip()
            if 'cannot connect to server' in line or 'cannot read reply' in line:
                raise ClusterManagmentError, 'server disconnected'
            if 'Unknown' in line:
                return 'F'
            elif line.startswith(str(id)):
                jobstatus = line.split()[4]

        if status.returncode != 0 and status.returncode is not None:
            raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode
        if jobstatus in self.idle_tag:
            return 'I'
        elif jobstatus in self.running_tag:
            return 'R'
        return 'F'

    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
        cmd = "qstat"
        status = misc.Popen([cmd], stdout=subprocess.PIPE)

        if me_dir.endswith('/'):
            me_dir = me_dir[:-1]
        me_dir = misc.digest(me_dir)[-14:]
        if not me_dir[0].isalpha():
            me_dir = 'a' + me_dir[1:]
        ongoing = []

        idle, run, fail = 0, 0, 0
        for line in status.stdout:
            if 'cannot connect to server' in line or 'cannot read reply' in line:
                raise ClusterManagmentError, 'server disconnected'
            if me_dir in line:
                ongoing.append(line.split()[0].split('.')[0])
                status2 = line.split()[4]
                if status2 in self.idle_tag:
                    idle += 1
                elif status2 in self.running_tag:
                    run += 1
                elif status2 in self.complete_tag:
                    if not self.check_termination(line.split()[0].split('.')[0]):
                        idle += 1
                else:
                    fail += 1

        if status.returncode != 0 and status.returncode is not None:
            raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode

        for id in list(self.submitted_ids):
            if id not in ongoing:
                status2 = self.check_termination(id)
                if status2 == 'wait':
                    run += 1
                elif status2 == 'resubmit':
                    idle += 1

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster."""

        if not self.submitted_ids:
            return
        cmd = "qdel %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))


class SGECluster(Cluster):
    """Basic class for dealing with cluster submission"""

    name = 'sge'
    job_id = 'JOB_ID'
    idle_tag = ['qw', 'hqw', 'hRqw', 'w']
    running_tag = ['r', 't', 'Rr', 'Rt']

    def def_get_path(self, location):
        """Replace paths below $HOME by the literal '$HOME' string."""
        location = os.path.realpath(location)
        homePath = os.getenv("HOME")
        if homePath:
            location = location.replace(homePath, '$HOME')
        return location

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to an SGE cluster"""

        me_dir = os.path.realpath(os.path.join(cwd, prog)).rsplit('/SubProcesses', 1)[0]
        me_dir = misc.digest(me_dir)[-10:]
        if not me_dir[0].isalpha():
            me_dir = 'a' + me_dir[1:]

        if cwd is None:
            cwd = self.def_get_path(os.getcwd())
        cwd1 = self.def_get_path(cwd)
        text = " cd %s;" % cwd1
        if stdout is None:
            stdout = '/dev/null'
        else:
            stdout = self.def_get_path(stdout)
        if stderr is None:
            stderr = '/dev/null'
        elif stderr == -2:  # -2 is the subprocess.STDOUT convention
            stderr = stdout
        else:
            stderr = self.def_get_path(stderr)

        if log is None:
            log = '/dev/null'
        else:
            log = self.def_get_path(log)

        text += prog
        if argument:
            text += ' ' + ' '.join(argument)

        homePath = os.getenv("HOME")
        if homePath:
            text = text.replace(homePath, '$HOME')

        logger.debug("!=== input %s" % text)
        logger.debug("!=== output %s" % stdout)
        logger.debug("!=== error %s" % stderr)
        logger.debug("!=== logs %s" % log)

        command = ['qsub', '-o', stdout,
                   '-N', me_dir,
                   '-e', stderr,
                   '-V']

        if self.cluster_queue and self.cluster_queue != 'None':
            command.extend(['-q', self.cluster_queue])

        a = misc.Popen(command, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate(text)[0]
        id = output.split(' ')[2]
        if not id.isdigit():
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                  % output
        self.submitted += 1
        self.submitted_ids.append(id)
        logger.debug(output)

        return id

    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""

        cmd = 'qstat '
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
        jobstatus = ''
        for line in status.stdout:
            # qstat lines look like:
            # job-ID  prior  name  user  state  submit/start-at  queue  slots
            if str(id) in line:
                jobstatus = line.split()[4]

        if jobstatus in self.idle_tag:
            return 'I'
        elif jobstatus in self.running_tag:
            return 'R'
        return 'F'

    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
        cmd = "qstat "
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

        if me_dir.endswith('/'):
            me_dir = me_dir[:-1]
        me_dir = misc.digest(me_dir)[-10:]
        if not me_dir[0].isalpha():
            me_dir = 'a' + me_dir[1:]

        idle, run, fail = 0, 0, 0
        for line in status.stdout:
            if me_dir in line:
                jobstatus = line.split()[4]
                if jobstatus in self.idle_tag:
                    idle += 1
                elif jobstatus in self.running_tag:
                    run += 1
                else:
                    logger.debug(line)
                    fail += 1

        return idle, run, self.submitted - (idle + run + fail), fail


    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster."""

        if not self.submitted_ids:
            return
        cmd = "qdel %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))


class LSFCluster(Cluster):
    """Basic class for dealing with cluster submission"""

    name = 'lsf'
    job_id = 'LSB_JOBID'

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit the job prog to an LSF cluster"""

        me_dir = os.path.realpath(os.path.join(cwd, prog)).rsplit('/SubProcesses', 1)[0]
        me_dir = misc.digest(me_dir)[-14:]
        if not me_dir[0].isalpha():
            me_dir = 'a' + me_dir[1:]

        text = ""
        command = ['bsub', '-C0', '-J', me_dir]
        if cwd is None:
            cwd = os.getcwd()
        else:
            text = " cd %s;" % cwd
        if stdout and isinstance(stdout, str):
            command.extend(['-o', stdout])
        if stderr and isinstance(stderr, str):
            command.extend(['-e', stderr])
        elif stderr == -2:  # -2 is the subprocess.STDOUT convention
            pass
        if log is None:
            log = '/dev/null'

        text += prog
        if argument:
            text += ' ' + ' '.join(argument)

        if self.cluster_queue and self.cluster_queue != 'None':
            command.extend(['-q', self.cluster_queue])

        a = misc.Popen(command, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate(text)[0]
        # answer form: "Job <nnnn> is submitted to default queue <normal>."
        try:
            id = output.split('>', 1)[0].split('<')[1]
        except Exception:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                  % output
        if not id.isdigit():
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                  % output
        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""

        cmd = 'bjobs ' + str(id)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

        for line in status.stdout:
            line = line.strip().upper()
            if 'JOBID' in line:
                continue
            elif str(id) not in line:
                continue
            status = line.split()[2]
            if status == 'RUN':
                return 'R'
            elif status == 'PEND':
                return 'I'
            elif status == 'DONE':
                return 'F'
            else:
                return 'H'
        return 'F'

    @multiple_try()
    def control(self, me_dir):
        """Check the status of the submitted jobs.
        Return (idle, run, finish, fail)."""

        if not self.submitted_ids:
            return 0, 0, 0, 0

        cmd = "bjobs " + ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

        jobstatus = {}
        for line in status.stdout:
            line = line.strip()
            if 'JOBID' in line:
                continue
            splitline = line.split()
            id = splitline[0]
            if id not in self.submitted_ids:
                continue
            jobstatus[id] = splitline[2]

        idle, run, fail = 0, 0, 0
        for id in self.submitted_ids[:]:
            if id in jobstatus:
                status = jobstatus[id]
            else:
                status = 'MISSING'
            if status == 'RUN':
                run += 1
            elif status == 'PEND':
                idle += 1
            else:
                status = self.check_termination(id)
                if status == 'wait':
                    run += 1
                elif status == 'resubmit':
                    idle += 1

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster."""

        if not self.submitted_ids:
            return
        cmd = "bkill %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))


class GECluster(Cluster):
    """Class for dealing with cluster submission on a GE cluster"""

    name = 'ge'
    job_id = 'JOB_ID'
    idle_tag = ['qw']
    running_tag = ['r']

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a GE cluster"""

        text = ""
        if cwd is None:
            cwd = os.getcwd()
        else:
            text = " cd %s; bash " % cwd
        if stdout is None:
            stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1])
        if stderr is None:
            stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1])
        elif stderr == -2:  # -2 is the subprocess.STDOUT convention
            stderr = stdout
        if log is None:
            log = '/dev/null'

        text += prog
        if argument:
            text += ' ' + ' '.join(argument)
        text += '\n'
        tmp_submit = os.path.join(cwd, 'tmp_submit')
        open(tmp_submit, 'w').write(text)

        a = misc.Popen(['qsub', '-o', stdout,
                        '-e', stderr,
                        tmp_submit],
                       stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate()[0]
        # answer form: 'Your job 874511 ("tmp_submit") has been submitted'
        pat = re.compile(r"Your job (\d*) \(", re.MULTILINE)
        try:
            id = pat.search(output).groups()[0]
        except Exception:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                  % output
        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        cmd = 'qstat | grep ' + str(id)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
        if not status:
            return 'F'
        # qstat lines look like:
        # job-ID  prior  name  user  state  ...
        pat = re.compile(r"^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s")
        stat = ''
        for line in status.stdout.read().split('\n'):
            if not line:
                continue
            line = line.strip()
            try:
                groups = pat.search(line).groups()
            except Exception:
                raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line
            if groups[0] != id:
                continue
            stat = groups[1]
        if not stat:
            return 'F'
        if stat in self.idle_tag:
            return 'I'
        if stat in self.running_tag:
            return 'R'

    @multiple_try()
    def control(self, me_dir=None):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
        if not self.submitted_ids:
            return 0, 0, 0, 0
        idle, run, fail = 0, 0, 0
        ongoing = []
        for statusflag in ['p', 'r', 'sh']:
            cmd = 'qstat -s %s' % statusflag
            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
            # only keep the job id at the start of each line
            pat = re.compile(r"^(\d+)")
            for line in status.stdout.read().split('\n'):
                line = line.strip()
                try:
                    id = pat.search(line).groups()[0]
                except Exception:
                    pass
                else:
                    if id not in self.submitted_ids:
                        continue
                    ongoing.append(id)
                    if statusflag == 'p':
                        idle += 1
                    if statusflag == 'r':
                        run += 1
                    if statusflag == 'sh':
                        fail += 1
        for id in list(self.submitted_ids):
            if id not in ongoing:
                self.check_termination(id)

        return idle, run, self.submitted - idle - run - fail, fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster."""

        if not self.submitted_ids:
            return
        cmd = "qdel %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))


def asyncrone_launch(exe, cwd=None, stdout=None, argument=[], **opt):
    """Start a computation and do not wait for it to finish.
    This function returns a lock which stays locked as long as the job is
    running."""

    mc = MultiCore(1)
    mc.submit(exe, argument, cwd, stdout, **opt)
    mc.need_waiting = True
    mc.lock.acquire()
    return mc.lock

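
# Illustration (not part of MadGraph itself): the returned lock stays acquired
# while the background job runs, so lock.locked() can be polled to detect the
# end. The sleep command is an arbitrary example.
def _demo_asyncrone_launch():
    lock = asyncrone_launch('/bin/sleep', argument=['5'])
    while lock.locked():
        time.sleep(1)
    logger.info('background job finished')
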

class SLURMCluster(Cluster):
    """Basic class for dealing with cluster submission"""

    name = 'slurm'
    job_id = 'SLURM_JOBID'
    idle_tag = ['Q', 'PD', 'S', 'CF']
    running_tag = ['R', 'CG']
    complete_tag = ['C']

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a SLURM cluster"""

        me_dir = os.path.realpath(os.path.join(cwd, prog)).rsplit('/SubProcesses', 1)[0]
        me_dir = misc.digest(me_dir)[-8:]

        if not me_dir[0].isalpha():
            me_dir = 'a' + me_dir[1:]

        if cwd is None:
            cwd = os.getcwd()
        if stdout is None:
            stdout = '/dev/null'
        if stderr is None:
            stderr = '/dev/null'
        elif stderr == -2:  # -2 is the subprocess.STDOUT convention
            stderr = stdout
        if log is None:
            log = '/dev/null'

        command = ['sbatch', '-o', stdout,
                   '-J', me_dir,
                   '-e', stderr, prog] + argument

        if self.cluster_queue and self.cluster_queue != 'None':
            command.insert(1, '-p')
            command.insert(2, self.cluster_queue)

        a = misc.Popen(command, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate()
        output_arr = output[0].split(' ')
        id = output_arr[3].rstrip()

        if not id.isdigit():
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                  % output

        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        cmd = 'squeue -j ' + str(id)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                            stderr=open(os.devnull, 'w'))

        for line in status.stdout:
            line = line.strip()
            if 'Invalid' in line:
                return 'F'
            elif line.startswith(str(id)):
                status = line.split()[4]
                if status in self.idle_tag:
                    return 'I'
                elif status in self.running_tag:
                    return 'R'
        return 'F'

    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
        cmd = "squeue"
        status = misc.Popen([cmd], stdout=subprocess.PIPE)

        if me_dir.endswith('/'):
            me_dir = me_dir[:-1]
        me_dir = misc.digest(me_dir)[-8:]
        if not me_dir[0].isalpha():
            me_dir = 'a' + me_dir[1:]

        idle, run, fail = 0, 0, 0
        ongoing = []
        for line in status.stdout:
            if me_dir in line:
                id, _, _, _, status, _ = line.split(None, 5)
                ongoing.append(id)
                if status in self.idle_tag:
                    idle += 1
                elif status in self.running_tag:
                    run += 1
                elif status in self.complete_tag:
                    status = self.check_termination(id)
                    if status == 'wait':
                        run += 1
                    elif status == 'resubmit':
                        idle += 1
                else:
                    fail += 1

        for id in list(self.submitted_ids):
            if id not in ongoing:
                status = self.check_termination(id)
                if status == 'wait':
                    run += 1
                elif status == 'resubmit':
                    idle += 1

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster."""

        if not self.submitted_ids:
            return
        cmd = "scancel %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))


class HTCaaSCluster(Cluster):
    """Class for dealing with cluster submission on a HTCaaS cluster using GPFS"""

    name = 'htcaas'
    job_id = 'HTCAAS_JOBID'

    @store_input()
    @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the HTCaaS job on the cluster with NO SHARE DISK:
        input/output files should be given relative to cwd.
        """
        # extract the job number from the executable name (ajobXXX)
        if 'ajob' in prog:
            prog_num = prog.rsplit("ajob", 1)[1]
        else:
            prog_num = '0'

        cur_usr = os.getenv('USER')

        if cwd is None:
            cwd = os.getcwd()

        cwd_cp = cwd.rsplit("/", 2)

        if stdout is not None:
            print "stdout: %s" % stdout

        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)

        if not required_output and output_files:
            required_output = output_files

        if 'combine' not in prog and 'pythia' not in prog:
            cwd_arg = cwd + "/arguments"
            temp = ' '.join([str(a) for a in argument])
            arg_cmd = "echo '" + temp + "' > " + cwd_arg

            command = ['htcaas-mgjob-submit', '-d', cwd, '-e', os.path.basename(prog)]
            if argument:
                command.extend(['-a ', '='.join([str(a) for a in argument])])
            print command
            a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
            id = a.stdout.read().strip()

        else:
            cwd_arg = cwd + "/arguments"
            temp = ' '.join([str(a) for a in argument])

            # wrap the program in a small script that makes the input files
            # executable and redirects stdout
            temp_file_name = "sub." + os.path.basename(prog)
            text = """#!/bin/bash
MYPWD=%(cwd)s
cd $MYPWD
input_files=(%(input_files)s )
for i in ${input_files[@]}
do
chmod -f +x $i
done
/bin/bash %(prog)s %(arguments)s > %(stdout)s
"""
            dico = {'cwd': cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog': prog,
                    'arguments': ' '.join([str(a) for a in argument]),
                    'program': ' ' if '.py' in prog else 'bash'}

            new_prog = pjoin(cwd, temp_file_name)
            open(new_prog, 'w').write(text % dico)
            misc.Popen(['chmod', '+x', new_prog], cwd=cwd)
            command = ['htcaas-mgjob-submit', '-d', cwd, '-e', temp_file_name]
            a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
            id = a.stdout.read().strip()

        nb_try = 0
        nb_limit = 5
        if not id.isdigit():
            print "[ID is not digit]:" + id

        while not id.isdigit():
            nb_try += 1
            print "[fail_retry]: %s" % nb_try
            a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
            id = a.stdout.read().strip()
            if nb_try > nb_limit:
                raise ClusterManagmentError, 'fail to submit to the HTCaaS cluster: \n %s' % id

        self.submitted += 1
        self.submitted_ids.append(id)

        return id

    @multiple_try(nb_try=10, sleep=10)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""

        if id == 0:
            status_out = 'C'
        else:
            cmd = 'htcaas-job-status -m ' + str(id) + " -s | grep Status "
            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
            error = status.stderr.read()
            if status.returncode or error:
                raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error
            status_out = status.stdout.read().strip()
            status_out = status_out.split(":", 1)[1]
            if status_out == 'waiting':
                status_out = 'I'
            elif status_out == 'preparing' or status_out == 'running':
                status_out = 'R'
            elif status_out != 'done':
                status_out = 'F'
            elif status_out == 'done':
                status_out = 'C'

        return status_out

    @multiple_try(nb_try=15, sleep=1)
    def control(self, me_dir=None):
        """Check the status of the submitted jobs.
        Return (idle, run, finish, fail)."""

        if not self.submitted_ids:
            return 0, 0, 0, 0

        ongoing = []
        idle, run, fail = 0, 0, 0

        for i in range(len(self.submitted_ids)):
            ongoing.append(int(self.submitted_ids[i]))
            cmd = "htcaas-job-status -m " + self.submitted_ids[i] + " -s | grep Status "
            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            status_out = status.stdout.read().strip()
            status_out = status_out.split(":", 1)[1]
            if status_out == 'waiting':
                idle += 1
            elif status_out == 'preparing':
                run += 1
            elif status_out == 'running':
                run += 1
            elif status_out != 'done':
                fail += 1

            if status_out != 'done':
                print "[" + self.submitted_ids[i] + "] " + status_out

        # disabled recovery pass kept for reference:
        '''
        for i in range(len(self.submitted_ids)):
            if int(self.submitted_ids[i]) not in ongoing:
                status = self.check_termination(int(self.submitted_ids[i]))
                if status == 'waiting':
                    idle += 1
                elif status == 'resubmit':
                    idle += 1
                elif status == 'failed':
                    fail += 1
        '''

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster."""

        if not self.submitted_ids:
            return
        for i in range(len(self.submitted_ids)):
            cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i]
            status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))


class HTCaaS2Cluster(Cluster):
    """Class for dealing with cluster submission on a HTCaaS cluster"""

    name = 'htcaas2'
    job_id = 'HTCAAS2_JOBID'

    @store_input()
    @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the job on the cluster. NO SHARE DISK:
        input/output files should be given relative to cwd.
        """

        if 'ajob' in prog:
            prog_num = prog.rsplit("ajob", 1)[1]
        elif 'run_combine' in prog:
            prog_num = '0'
        else:
            prog_num = prog

        cur_usr = os.getenv('USER')

        # unique remote working directory for this job
        import uuid
        dir = str(uuid.uuid4().hex)

        prog_dir = '_run%s' % prog_num
        prog_dir = dir + prog_dir

        if cwd is None:
            cwd = os.getcwd()

        cwd_cp = cwd.rsplit("/", 2)

        if stdout is None:
            stdout = '/dev/null'

        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)

        if not required_output and output_files:
            required_output = output_files

        if '/' in argument:
            temp_file_name = "sub." + os.path.basename(prog)
        else:
            temp_file_name = "sub." + os.path.basename(prog) + '.'.join([str(a) for a in argument])

        if 'combine' in prog or 'pythia' in prog:
            text = """#!/bin/bash
MYPWD=%(cwd)s
cd $MYPWD
script=%(script)s
input_files=(%(input_files)s )
if [ $# -ge 1 ]; then
arg1=$1
else
arg1=''
fi
args=' %(arguments)s'
for i in ${input_files[@]}; do
if [[ "$i" == *$script* ]]; then
script=$i
fi
chmod -f +x $i
done
/bin/bash ${script} ${args} > %(stdout)s
"""
        elif 'shower' in prog:
            text = """#!/bin/bash
MYPWD=%(cwd)s
cd $MYPWD
args=' %(arguments)s'
input_files=( %(input_files)s )
for i in ${input_files[@]}
do
chmod -f +x $i
done
/bin/bash %(script)s ${args} > $MYPWD/done
"""
        else:
            text = """#!/bin/bash
MYPWD=%(cwd)s
#mkdir -p $MYTMP
cd $MYPWD
input_files=( %(input_files)s )
for i in ${input_files[@]}
do
if [[ $i != */*/* ]]; then
i=$PWD"/"$i
fi
echo $i
if [ -d $i ]; then
htcaas-file-put -l $i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -i %(cur_usr)s
else
htcaas-file-put -f $i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -i %(cur_usr)s
fi
done
"""

        dico = {'cur_usr': cur_usr, 'script': os.path.basename(prog),
                'cwd': cwd, 'job_id': self.job_id, 'prog_dir': prog_dir,
                'input_files': ' '.join(input_files + [prog]),
                'output_files': ' '.join(output_files), 'stdout': stdout,
                'arguments': ' '.join([str(a) for a in argument]),
                'program': ' ' if '.py' in prog else 'bash'}

        new_prog = pjoin(cwd, temp_file_name)
        open(new_prog, 'w').write(text % dico)
        misc.Popen(['chmod', '+x', new_prog], cwd=cwd)

        # stage the input files / run the wrapper
        cmd1 = '/bin/bash ' + cwd + '/' + temp_file_name
        status1 = misc.Popen([cmd1], shell=True, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)

        if 'combine' not in prog and 'shower' not in prog and 'pythia' not in prog:
            # submit the job itself
            cmd3 = """htcaas-mgjob-submit -d /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -e %(script)s %(arguments)s"""
            dico3 = {'cur_usr': cur_usr, 'script': os.path.basename(prog),
                     'arguments': ' ' if not argument else "-a " + '='.join([str(a) for a in argument]),
                     'prog_dir': prog_dir}
            status3 = misc.Popen([cmd3 % dico3], shell=True, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            id = status3.stdout.read().strip()

            nb_try = 0
            nb_limit = 5
            while not id.isdigit():
                nb_try += 1
                a = misc.Popen([cmd3 % dico3], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
                id = a.stdout.read().strip()
                if nb_try > nb_limit:
                    raise ClusterManagmentError, 'Fail to submit to the HTCaaS cluster: \n %s' % id

            # script used by control() to fetch the output back
            temp_file_name2 = "sub." + id
            text2 = """#!/bin/bash
MYPWD=%(cwd)s
output_files=( %(output_files)s )
result=done
if [ ! -e ${MYPWD}/done.%(job_id)s ]; then
for i in ${output_files[@]}
do
htcaas-file-get -l ${MYPWD}/$i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/$i -i %(cur_usr)s
chmod -Rf 777 ${MYPWD}/$i
done
for i in ${output_files[@]}; do
if [[ -e ${MYPWD}/$i ]]; then
result=done
else
result=running
echo $result
exit 0
fi
done
echo $result
touch ${MYPWD}/done.%(job_id)s
else
for i in ${output_files[@]}; do
if [ -e ${MYPWD}/$i ]; then
result=done
else
rm -f ${MYPWD}/done.%(job_id)s
result=running
echo $result
exit 0
fi
done
echo $result
fi
"""
            dico2 = {'cur_usr': cur_usr, 'script': os.path.basename(prog),
                     'cwd': cwd, 'prog_dir': prog_dir,
                     'output_files': ' '.join(output_files), 'job_id': id,
                     'program': ' ' if '.py' in prog else 'bash'}

            homePath = os.getenv("HOME")
            outPath = homePath + "/MG5"

            new_prog2 = pjoin(outPath, temp_file_name2)
            open(new_prog2, 'w').write(text2 % dico2)
            misc.Popen(['chmod', '+x', new_prog2], cwd=cwd)

            self.submitted += 1
            self.submitted_ids.append(id)

        elif 'combine' in prog or 'shower' in prog or 'pythia' in prog:
            if '/dev/null' in stdout:
                stdout = ''

            # status-check script for local (combine/shower/pythia) steps
            temp_file_shower = "sub.out"
            text_shower = """#!/bin/bash
MYPWD=%(cwd)s
result=done
output_files=(%(output_files)s)
for i in ${output_files[@]}; do
if [ -e $MYPWD/$i -o -e $i ]; then
result=done
else
result=running
echo $result
exit 0
fi
done
echo $result
"""
            dico_shower = {'cwd': cwd, 'output_files': ' '.join([stdout] + output_files),
                           'program': ' ' if '.py' in prog else 'bash'}
            homePath = os.getenv("HOME")
            outPath = homePath + "/MG5"
            new_prog_shower = pjoin(outPath, temp_file_shower)
            open(new_prog_shower, 'w').write(text_shower % dico_shower)
            misc.Popen(['chmod', '+x', new_prog_shower], cwd=cwd)

            id = '-1'
            self.submitted += 1
            self.submitted_ids.append(id)

        else:
            id = '-2'
            self.submitted += 1
            self.submitted_ids.append(id)

        return id

    @multiple_try(nb_try=10, sleep=10)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""

        homePath = os.getenv("HOME")
        outPath = homePath + "/MG5"

        if id == '0' or id == '-2':
            status_out = 'done'
        elif id == '-1':
            cmd = '/bin/bash ' + outPath + '/sub.out'
            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            status_out = status.stdout.read().strip()
            print "[" + id + "]" + status_out
            if status_out == 'waiting':
                status_out = 'wait'
            elif status_out == 'preparing' or status_out == 'running':
                status_out = 'R'
            elif status_out != 'done':
                status_out = 'F'
            elif status_out == 'done':
                status_out = 'C'

            print "[" + id + "]" + status_out
        else:
            cmd = 'htcaas-job-status -m ' + str(id) + " -s | grep Status "
            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
            error = status.stderr.read()
            if status.returncode or error:
                raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error
            status_out = status.stdout.read().strip()
            status_out = status_out.split(":", 1)[1]
            print "[" + id + "]" + status_out
            if status_out == 'waiting':
                status_out = 'wait'
            elif status_out == 'preparing' or status_out == 'running':
                status_out = 'R'
            elif status_out == 'failed':
                # resubmit the job and forget the failed id
                args = self.retry_args[id]
                id_temp = self.submit2(**args)
                del self.retry_args[id]
                self.submitted_ids.remove(id)
                status_out = 'I'
            elif status_out != 'done':
                status_out = 'F'
            elif status_out == 'done':
                status_out = 'C'

        return status_out


    @check_interupt()
    @multiple_try(nb_try=15, sleep=10)
    def control(self, me_dir=None):
        """Check the status of the submitted jobs.
        Return (idle, run, finish, fail)."""

        if not self.submitted_ids:
            return 0, 0, 0, 0

        ongoing = []
        idle, run, fail = 0, 0, 0

        homePath = os.getenv("HOME")
        outPath = homePath + "/MG5"

        for i in range(len(self.submitted_ids)):
            ongoing.append(self.submitted_ids[i])
            if self.submitted_ids[i] == '-2':
                return 0, 0, 0, 0
            if self.submitted_ids[i] == '0':
                # this job is already done
                status_out = 'done'
            elif self.submitted_ids[i] == '-1':
                cmd = '/bin/bash ' + outPath + '/sub.out'
                status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                status_out = status.stdout.read().strip()
                if status_out == 'waiting':
                    idle += 1
                elif status_out == 'preparing':
                    run += 1
                elif status_out == 'running':
                    run += 1
                elif status_out != 'done':
                    fail += 1
            else:
                args = self.retry_args[str(self.submitted_ids[i])]
                if 'required_output' in args and not args['required_output']:
                    args['required_output'] = args['output_files']
                    self.retry_args[str(self.submitted_ids[i])] = args

                cmd = "htcaas-job-status -m " + self.submitted_ids[i] + " -s | grep Status "
                status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                status_out = status.stdout.read().strip()
                status_out = status_out.split(":", 1)[1]
                if status_out == 'waiting':
                    idle += 1
                elif status_out == 'preparing':
                    run += 1
                elif status_out == 'running':
                    run += 1
                elif status_out == 'failed' or status_out == 'canceled':
                    id = self.submit2(**args)
                    # resubmission gives a new id: forget the old one
                    del self.retry_args[self.submitted_ids[i]]
                    self.submitted_ids.remove(self.submitted_ids[i])
                    self.submitted -= 1
                    idle += 1
                elif status_out != 'done':
                    fail += 1

                if status_out == 'done':
                    # fetch the output back and check that it is complete
                    cmd2 = '/bin/bash ' + outPath + '/sub.' + self.submitted_ids[i]
                    status2 = misc.Popen([cmd2], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    aa = status2.stdout.read().strip()
                    for path in args['required_output']:
                        if args['cwd']:
                            path = pjoin(args['cwd'], path)
                        if not (os.path.exists(path) and os.stat(path).st_size != 0):
                            status2 = misc.Popen([cmd2], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                            aa = status2.stdout.read().strip()
                            if aa == 'done':
                                self.submitted_ids[i] = '0'
                            elif aa == 'running':
                                run += 1
                        else:
                            self.submitted_ids[i] = '0'

        for i in range(len(self.submitted_ids)):
            if str(self.submitted_ids[i]) not in ongoing:
                status2 = self.check_termination(str(self.submitted_ids[i]))
                if status2 == 'wait':
                    run += 1
                elif status2 == 'resubmit':
                    idle += 1

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster."""

        if not self.submitted_ids:
            return
        for i in range(len(self.submitted_ids)):
            cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i]
            status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))


from_name = {'condor': CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
             'lsf': LSFCluster, 'ge': GECluster, 'slurm': SLURMCluster,
             'htcaas': HTCaaSCluster, 'htcaas2': HTCaaS2Cluster}

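
# Illustration (not part of MadGraph itself): picking a back-end by name, the
# way a configuration entry such as cluster_type = 'condor' would be resolved.
# The queue and retry options shown are examples, not required settings.
def _demo_from_name(cluster_type='condor'):
    cluster = from_name[cluster_type](cluster_queue='madgraph',
                                      cluster_nb_retry=1,
                                      cluster_retry_wait=300)
    return cluster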