
Source Code for Module madgraph.various.cluster

   1  ################################################################################ 
   2  # Copyright (c) 2009 The MadGraph5_aMC@NLO Development team and Contributors              
   3  # 
   4  # This file is a part of the MadGraph5_aMC@NLO project, an application which            
   5  # automatically generates Feynman diagrams and matrix elements for arbitrary     
   6  # high-energy processes in the Standard Model and beyond.                        
   7  # 
   8  # It is subject to the MadGraph5_aMC@NLO license which should accompany this              
   9  # distribution.                                                                  
  10  #                                                                                
  11  # For more information, visit madgraph.phys.ucl.ac.be and amcatnlo.web.cern.ch             
  12  #                                                                                
  13  ################################################################################ 
  14  import subprocess 
  15  import logging 
  16  import os 
  17  import time 
  18  import re 
  19  import glob 
  20  import inspect 
  21   
  22  logger = logging.getLogger('madgraph.cluster')  
  23   
  24  try: 
  25      from madgraph import MadGraph5Error 
  26      import madgraph.various.misc as misc 
  27  except Exception, error: 
  28      if __debug__: 
  29          print  str(error) 
  30      from internal import MadGraph5Error 
  31      import internal.misc as misc 
  32   
  33  pjoin = os.path.join 
  34  
  35 -class ClusterManagmentError(MadGraph5Error): 
  36      pass 
  37  
  38 -class NotImplemented(MadGraph5Error): 
  39      pass 
  40  
  41  
  42  multiple_try = misc.multiple_try 
  43  pjoin = os.path.join 
  44  
  45  
  46 -def check_interupt(error=KeyboardInterrupt): 
  47  
  48      def deco_interupt(f): 
  49          def deco_f_interupt(self, *args, **opt): 
  50              try: 
  51                  return f(self, *args, **opt) 
  52              except error: 
  53                  try: 
  54                      self.remove(*args, **opt) 
  55                  except Exception: 
  56                      pass 
  57                  raise error 
  58          return deco_f_interupt 
  59      return deco_interupt 
  60  
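# Editor's note (not part of the original module): check_interupt is meant to
# decorate methods of a Cluster subclass so that a Ctrl-C during a long call
# first tries to clean up the submitted jobs before the exception propagates.
# A minimal sketch of its intended use:
#
#     class MyCluster(Cluster):
#         @check_interupt()
#         def wait(self, me_dir, fct, minimal_job=0):
#             ...   # a KeyboardInterrupt here triggers self.remove(...) first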
  61 -def store_input(arg=''): 
  62  
  63      def deco_store(f): 
  64          def deco_f_store(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 
  65                           input_files=[], output_files=[], required_output=[], nb_submit=0): 
  66              frame = inspect.currentframe() 
  67              args, _, _, values = inspect.getargvalues(frame) 
  68              args = dict([(i, values[i]) for i in args if i != 'self']) 
  69              id = f(self, **args) 
  70              if self.nb_retry > 0: 
  71                  self.retry_args[id] = args 
  72              return id 
  73          return deco_f_store 
  74      return deco_store 
  75  
76 77 -class Cluster(object):
78 """Basic Class for all cluster type submission""" 79 name = 'mother class' 80
81 - def __init__(self,*args, **opts):
82 """Init the cluster""" 83 84 self.submitted = 0 85 self.submitted_ids = [] 86 self.finish = 0 87 if 'cluster_queue' in opts: 88 self.cluster_queue = opts['cluster_queue'] 89 else: 90 self.cluster_queue = 'madgraph' 91 if 'cluster_temp_path' in opts: 92 self.temp_dir = opts['cluster_temp_path'] 93 else: 94 self.temp_dir = None 95 self.options = {'cluster_status_update': (600, 30)} 96 for key,value in opts.items(): 97 self.options[key] = value 98 self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0 99 self.cluster_retry_wait = opts['cluster_retry_wait'] if 'cluster_retry_wait' in opts else 300 100 self.options = dict(opts) 101 self.retry_args = {}
102 103
104 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 105 log=None, required_output=[], nb_submit=0):
106 """How to make one submission. Return status id on the cluster.""" 107 raise NotImplemented, 'No implementation of how to submit a job to cluster \'%s\'' % self.name
108 109 @store_input()
110 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 111 log=None, input_files=[], output_files=[], required_output=[],nb_submit=0):
112 """How to make one submission. Return status id on the cluster. 113 NO SHARE DISK""" 114 115 if cwd is None: 116 cwd = os.getcwd() 117 if not os.path.exists(prog): 118 prog = os.path.join(cwd, prog) 119 120 if not required_output and output_files: 121 required_output = output_files 122 123 if not hasattr(self, 'temp_dir') or not self.temp_dir or \ 124 (input_files == [] == output_files): 125 return self.submit(prog, argument, cwd, stdout, stderr, log, 126 required_output=required_output, nb_submit=nb_submit) 127 128 if not input_files and not output_files: 129 # not input/output so not using submit2 130 return self.submit(prog, argument, cwd, stdout, stderr, log, 131 required_output=required_output, nb_submit=nb_submit) 132 133 if cwd is None: 134 cwd = os.getcwd() 135 if not os.path.exists(prog): 136 prog = os.path.join(cwd, prog) 137 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument) 138 139 text = """#!/bin/bash 140 MYTMP=%(tmpdir)s/run$%(job_id)s 141 MYPWD=%(cwd)s 142 mkdir -p $MYTMP 143 cd $MYPWD 144 input_files=( %(input_files)s ) 145 for i in ${input_files[@]} 146 do 147 cp -R -L $i $MYTMP 148 done 149 cd $MYTMP 150 echo '%(arguments)s' > arguments 151 chmod +x ./%(script)s 152 %(program)s ./%(script)s %(arguments)s 153 output_files=( %(output_files)s ) 154 for i in ${output_files[@]} 155 do 156 cp -r $MYTMP/$i $MYPWD 157 done 158 rm -rf $MYTMP 159 """ 160 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog), 161 'cwd': cwd, 'job_id': self.job_id, 162 'input_files': ' '.join(input_files + [prog]), 163 'output_files': ' '.join(output_files), 164 'arguments': ' '.join([str(a) for a in argument]), 165 'program': ' ' if '.py' in prog else 'bash'} 166 167 # writing a new script for the submission 168 new_prog = pjoin(cwd, temp_file_name) 169 open(new_prog, 'w').write(text % dico) 170 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 171 172 return self.submit(new_prog, argument, cwd, stdout, stderr, log, 173 required_output=required_output, nb_submit=nb_submit)
174 175
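# Editor's note (illustrative sketch, file names are hypothetical): when
# cluster_temp_path is set, submit2 wraps the real executable in a small bash
# script that copies input_files to a scratch directory, runs the job there,
# and copies output_files back.  A call could look like:
#
#     cluster.submit2('ajob1', argument=['0'], cwd=subproc_dir,
#                     input_files=['madevent', 'input_app.txt'],
#                     output_files=['G1'])
#
# which writes and submits a wrapper named 'sub.ajob10' in cwd; without
# input/output files (or without a temp_dir) the call falls through to the
# plain submit() above.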
176 - def control(self, me_dir=None):
177 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 178 if not self.submitted_ids: 179 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name 180 idle, run, fail = 0, 0, 0 181 for pid in self.submitted_ids[:]: 182 status = self.control_one_job(id) 183 if status == 'I': 184 idle += 1 185 elif status == 'R': 186 run += 1 187 elif status == 'F': 188 self.finish +=1 189 self.submitted_ids.remove(pid) 190 else: 191 fail += 1 192 193 return idle, run, self.finish, fail
194
195 - def control_one_job(self, pid):
196 """ control the status of a single job with it's cluster id """ 197 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
198 199 @check_interupt()
200 - def wait(self, me_dir, fct, minimal_job=0):
201 """Wait that all job are finish. 202 if minimal_job set, then return if idle + run is lower than that number""" 203 204 205 mode = 1 # 0 is long waiting/ 1 is short waiting 206 nb_iter = 0 207 nb_short = 0 208 change_at = 5 # number of iteration from which we wait longer between update. 209 #usefull shortcut for readibility 210 longtime, shorttime = self.options['cluster_status_update'] 211 212 while 1: 213 old_mode = mode 214 nb_iter += 1 215 idle, run, finish, fail = self.control(me_dir) 216 if fail: 217 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team') 218 if idle + run == 0: 219 #time.sleep(20) #security to ensure that the file are really written on the disk 220 logger.info('All jobs finished') 221 break 222 if idle + run < minimal_job: 223 return 224 fct(idle, run, finish) 225 #Determine how much we have to wait (mode=0->long time, mode=1->short time) 226 if nb_iter < change_at: 227 mode = 1 228 elif idle < run: 229 if old_mode == 0: 230 if nb_short: 231 mode = 0 #we already be back from short to long so stay in long 232 #check if we need to go back to short mode 233 elif idle: 234 if nb_iter > change_at + int(longtime)//shorttime: 235 mode = 0 #stay in long waiting mode 236 else: 237 mode = 1 # pass in short waiting mode 238 nb_short =0 239 else: 240 mode = 1 # pass in short waiting mode 241 nb_short = 0 242 elif old_mode == 1: 243 nb_short +=1 244 if nb_short > 3* max(change_at, int(longtime)//shorttime): 245 mode = 0 #go back in slow waiting 246 else: 247 mode = 0 248 249 #if pass from fast(mode=1) to slow(mode=0) make a print statement: 250 if old_mode > mode: 251 logger.info('''Start to wait %ss between checking status. 252 Note that you can change this time in the configuration file. 253 Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0]) 254 255 #now Waiting! 256 if mode == 0: 257 try: 258 time.sleep(self.options['cluster_status_update'][0]) 259 except KeyboardInterrupt: 260 logger.info('start to update the status') 261 nb_iter = min(0, change_at -2) 262 nb_short = 0 263 else: 264 time.sleep(self.options['cluster_status_update'][1]) 265 266 267 self.submitted = 0 268 self.submitted_ids = []
269
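# Editor's note: the mode switching above amounts to two polling cadences taken
# from self.options['cluster_status_update'] = (long, short), e.g. (600, 30):
# the loop starts by checking every `short` seconds and, once most jobs are
# running rather than idle, falls back to checking every `long` seconds
# (Ctrl-C during the long sleep forces an immediate status update).  Roughly:
#
#     cluster.wait(me_dir, lambda idle, run, done: logger.info('%s %s %s' % (idle, run, done)))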
270 - def check_termination(self, job_id):
271 """Check the termination of the jobs with job_id and relaunch it if needed.""" 272 273 274 if job_id not in self.retry_args: 275 return True 276 277 args = self.retry_args[job_id] 278 if 'time_check' in args: 279 time_check = args['time_check'] 280 else: 281 time_check = 0 282 283 for path in args['required_output']: 284 if args['cwd']: 285 path = pjoin(args['cwd'], path) 286 # check that file exists and is not empty. 287 if not (os.path.exists(path) and os.stat(path).st_size != 0) : 288 break 289 else: 290 # all requested output are present 291 if time_check > 0: 292 logger.info('Job %s Finally found the missing output.' % (job_id)) 293 del self.retry_args[job_id] 294 self.submitted_ids.remove(job_id) 295 return 'done' 296 297 if time_check == 0: 298 logger.debug('''Job %s: missing output:%s''' % (job_id,path)) 299 args['time_check'] = time.time() 300 return 'wait' 301 elif self.cluster_retry_wait > time.time() - time_check: 302 return 'wait' 303 304 #jobs failed to be completed even after waiting time!! 305 if self.nb_retry < 0: 306 logger.critical('''Fail to run correctly job %s. 307 with option: %s 308 file missing: %s''' % (job_id, args, path)) 309 raw_input('press enter to continue.') 310 elif self.nb_retry == 0: 311 logger.critical('''Fail to run correctly job %s. 312 with option: %s 313 file missing: %s. 314 Stopping all runs.''' % (job_id, args, path)) 315 #self.remove() 316 elif args['nb_submit'] >= self.nb_retry: 317 logger.critical('''Fail to run correctly job %s. 318 with option: %s 319 file missing: %s 320 Fails %s times 321 No resubmition. ''' % (job_id, args, path, args['nb_submit'])) 322 #self.remove() 323 else: 324 args['nb_submit'] += 1 325 logger.warning('resubmit job (for the %s times)' % args['nb_submit']) 326 del self.retry_args[job_id] 327 self.submitted_ids.remove(job_id) 328 if 'time_check' in args: 329 del args['time_check'] 330 self.submit2(**args) 331 return 'resubmit' 332 return 'done'
333 334 335 336 @check_interupt()
337 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 338 stderr=None, log=None, required_output=[], nb_submit=0, 339 input_files=[], output_files=[]):
340 """launch one job on the cluster and wait for it""" 341 342 special_output = False # tag for concatenate the error with the output. 343 if stderr == -2 and stdout: 344 #We are suppose to send the output to stdout 345 special_output = True 346 stderr = stdout + '.err' 347 348 id = self.submit2(prog, argument, cwd, stdout, stderr, log, 349 required_output=required_output, input_files=input_files, 350 output_files=output_files) 351 352 frame = inspect.currentframe() 353 args, _, _, values = inspect.getargvalues(frame) 354 args = dict([(i, values[i]) for i in args if i != 'self']) 355 self.retry_args[id] = args 356 357 nb_wait=0 358 while 1: 359 nb_wait+=1 360 status = self.control_one_job(id) 361 if not status in ['R','I']: 362 status = self.check_termination(id) 363 if status in ['wait']: 364 time.sleep(30) 365 continue 366 elif status in ['resubmit']: 367 id = self.submitted_ids[0] 368 time.sleep(30) 369 continue 370 #really stop! 371 time.sleep(30) #security to ensure that the file are really written on the disk 372 break 373 time.sleep(self.options['cluster_status_update'][1]) 374 375 if required_output: 376 status = self.check_termination(id) 377 if status == 'wait': 378 run += 1 379 elif status == 'resubmit': 380 idle += 1 381 382 383 if special_output: 384 # combine the stdout and the stderr 385 #wait up to 50 s to see if those files exists 386 for i in range(5): 387 if os.path.exists(stdout): 388 if not os.path.exists(stderr): 389 time.sleep(5) 390 if os.path.exists(stderr): 391 err_text = open(stderr).read() 392 if not err_text: 393 return 394 logger.warning(err_text) 395 text = open(stdout).read() 396 open(stdout,'w').write(text + err_text) 397 else: 398 return 399 time.sleep(10)
400
401 - def remove(self, *args, **opts):
402 """ """ 403 logger.warning("""This cluster didn't support job removal, 404 the jobs are still running on the cluster.""")
405
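# Editor's note (illustrative sketch, directory and script names are
# hypothetical): every concrete scheduler below exposes the same small API,
# so a driver only needs:
#
#     cluster = PBSCluster(cluster_queue='madgraph', cluster_nb_retry=1)
#     cluster.submit('./ajob1', cwd=run_dir, stdout='/dev/null')
#     idle, run, finish, fail = cluster.control(run_dir)     # poll once
#     cluster.wait(run_dir, lambda idle, run, done: None)    # or block until done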
406 -class MultiCore(Cluster):
407 """ class for dealing with the submission in multiple node""" 408 409 job_id = '$' 410
411 - def __init__(self, *args, **opt):
412 """Init the cluster""" 413 import thread 414 super(MultiCore, self).__init__(self, *args, **opt) 415 416 417 self.submitted = 0 418 self.finish = 0 419 if 'nb_core' in opt: 420 self.nb_core = opt['nb_core'] 421 elif isinstance(args[0],int): 422 self.nb_core = args[0] 423 else: 424 self.nb_core = 1 425 self.update_fct = None 426 427 # initialize the thread controler 428 self.need_waiting = False 429 self.nb_used = 0 430 self.lock = thread.allocate_lock() 431 self.done = 0 432 self.waiting_submission = [] 433 self.pids = [] 434 self.fail_msg = None
435
436 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 437 stderr=None, log=None, **opts):
438 """launch one job and wait for it""" 439 if isinstance(stdout, str): 440 stdout = open(stdout, 'w') 441 if isinstance(stderr, str): 442 stdout = open(stderr, 'w') 443 return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)
444 445
446 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 447 log=None, required_output=[], nb_submit=0):
448 """submit a job on multicore machine""" 449 450 self.submitted +=1 451 if cwd is None: 452 cwd = os.getcwd() 453 if isinstance(prog, str): 454 if not os.path.exists(prog) and not misc.which(prog): 455 prog = os.path.join(cwd, prog) 456 457 import thread 458 if self.waiting_submission or self.nb_used == self.nb_core: 459 self.waiting_submission.append((prog, argument,cwd, stdout)) 460 # check that none submission is already finished 461 while self.nb_used < self.nb_core and self.waiting_submission: 462 arg = self.waiting_submission.pop(0) 463 self.nb_used += 1 # udpate the number of running thread 464 thread.start_new_thread(self.launch, arg) 465 elif self.nb_used < self.nb_core -1: 466 self.nb_used += 1 # upate the number of running thread 467 thread.start_new_thread(self.launch, (prog, argument, cwd, stdout)) 468 elif self.nb_used == self.nb_core -1: 469 self.nb_used += 1 # upate the number of running thread 470 thread.start_new_thread(self.launch, (prog, argument, cwd, stdout))
471 472
473 - def launch(self, exe, argument, cwd, stdout):
474 """ way to launch for multicore. If exe is a string then treat it as 475 an executable. Otherwise treat it as a function""" 476 import thread 477 def end(self, pid): 478 self.nb_used -= 1 479 self.done += 1 480 try: 481 self.pids.remove(pid) 482 except: 483 pass
484 485 fail_msg = None 486 try: 487 if isinstance(exe,str): 488 if os.path.exists(exe) and not exe.startswith('/'): 489 exe = './' + exe 490 proc = misc.Popen([exe] + argument, cwd=cwd, stdout=stdout, 491 stderr=subprocess.STDOUT) 492 pid = proc.pid 493 self.pids.append(pid) 494 proc.wait() 495 if proc.returncode not in [0, 143, -15]: 496 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \ 497 (' '.join([exe]+argument), proc.returncode) 498 #self.fail_msg = fail_msg 499 logger.warning(fail_msg) 500 try: 501 log = open(glob.glob(pjoin(cwd,'*','log.txt'))[0]).read() 502 logger.warning('Last 15 lines of logfile %s:\n%s\n' % \ 503 (pjoin(cwd,'*','log.txt'), '\n'.join(log.split('\n')[-15:-1]) + '\n')) 504 except IOError, AttributeError: 505 logger.warning('Please look for possible logfiles in %s' % cwd) 506 pass 507 self.remove(fail_msg) 508 else: 509 pid = tuple([id(o) for o in [exe] + argument]) 510 self.pids.append(pid) 511 # the function should return 0 if everything is fine 512 # the error message otherwise 513 returncode = exe(argument) 514 if returncode != 0: 515 logger.warning(returncode) 516 self.remove() 517 518 519 520 # release the lock for allowing to launch the next job 521 security = 0 522 # check that the status is locked to avoid coincidence unlock 523 while 1: 524 while not self.lock.locked(): 525 if not self.need_waiting: 526 # Main is not yet locked 527 end(self, pid) 528 return 529 elif security > 60: 530 end(self, pid) 531 return 532 security += 1 533 time.sleep(1) 534 try: 535 self.lock.release() 536 except thread.error: 537 continue 538 break 539 end(self, pid) 540 541 542 except Exception, error: 543 #logger.critical('one core fails with %s' % error) 544 self.remove() 545 raise
546 547 548 549
550 - def wait(self, me_dir, update_status):
551 """Wait that all thread finish 552 self.nb_used and self.done are update via each jobs (thread and local) 553 self.submitted is the nb of times that submitted has been call (local) 554 remaining is the nb of job that we still have to wait. (local) 555 self.pids is the list of the BASH pid of the submitted jobs. (thread) 556 557 WARNING: In principle all those value are coherent but since some are 558 modified in various thread, those data can be corrupted. (not the local 559 one). Nb_used in particular shouldn't be trusted too much. 560 This code check in different ways that all jobs have finished. 561 562 In principle, the statement related to '#security #X' are not used. 563 In practise they are times to times. 564 """ 565 566 import thread 567 568 remaining = self.submitted - self.done 569 570 while self.nb_used < self.nb_core: 571 if self.waiting_submission: 572 arg = self.waiting_submission.pop(0) 573 thread.start_new_thread(self.launch, arg) 574 self.nb_used += 1 # update the number of running thread 575 else: 576 break 577 578 try: 579 self.need_waiting = True 580 self.lock.acquire() 581 no_in_queue = 0 582 secure_mode = False # forbid final acauire if in securemode 583 while self.waiting_submission or self.nb_used: 584 if self.fail_msg: 585 msg, self.fail_msg = self.fail_msg, None 586 self.remove() 587 raise Exception, msg 588 if update_status: 589 update_status(len(self.waiting_submission), self.nb_used, self.done) 590 # security#1 that all job expected to be launched since 591 # we enter in this function are indeed launched. 592 if len(self.waiting_submission) == 0 == remaining : 593 self.done = self.submitted 594 break 595 596 # security #2: nb_used >0 but nothing remains as BASH PID 597 if len(self.waiting_submission) == 0 and len(self.pids) == 0: 598 if self.submitted == self.done: 599 break 600 logger.debug('Found too many jobs. Recovering') 601 no_in_queue += 1 602 time.sleep(min(180, 5 * no_in_queue)) 603 if no_in_queue > 3: 604 logger.debug('Still too many jobs. Continue') 605 break 606 continue 607 608 # security #3: if nb_used not reliable pass in secure mode 609 if not secure_mode and len(self.waiting_submission) != 0: 610 if self.nb_used != self.nb_core: 611 if self.nb_used != len(self.pids): 612 secure_mode = True 613 # security #4: nb_used not reliable use secure mode to finish the run 614 if secure_mode and not self.waiting_submission: 615 self.need_waiting = False 616 if self.lock.locked(): 617 self.lock.release() 618 break 619 620 # Wait for core to finish 621 self.lock.acquire() 622 remaining -=1 # update remaining job 623 #submit next one 624 if self.waiting_submission: 625 arg = self.waiting_submission.pop(0) 626 thread.start_new_thread(self.launch, arg) 627 self.nb_used += 1 # update the number of running thread 628 629 if self.fail_msg: 630 msg, self.fail_msg = self.fail_msg, None 631 self.remove() 632 raise Exception, msg 633 # security #5: checked that self.nb_used is not lower than expected 634 #This is the most current problem. 635 no_in_queue = 0 636 while self.submitted > self.done: 637 if self.fail_msg: 638 msg, self.fail_msg = self.fail_msg, None 639 self.remove() 640 raise Exception, msg 641 if no_in_queue == 0: 642 logger.debug('Some jobs have been lost. Try to recover') 643 #something bad happens 644 if not len(self.pids): 645 # The job is not running 646 logger.critical('Some jobs have been lost in the multicore treatment.') 647 logger.critical('The results might be incomplete. 
(Trying to continue anyway)') 648 break 649 elif update_status: 650 update_status(len(self.waiting_submission), len(self.pids) , 651 self.done) 652 # waiting that those jobs ends. 653 if not secure_mode: 654 self.lock.acquire() 655 else: 656 no_in_queue += 1 657 try: 658 time.sleep(min(180,5*no_in_queue)) 659 if no_in_queue > 5 * 3600.0 / 162: 660 break 661 except KeyboardInterrupt: 662 logger.warning('CTRL-C assumes that all jobs are done. Continue the code') 663 self.pids = [] # avoid security 6 664 break 665 666 # security #6. check that queue is empty. don't 667 no_in_queue = 0 668 while len(self.pids): 669 if self.fail_msg: 670 msg, self.fail_msg = self.fail_msg, None 671 self.remove() 672 raise Exception, msg 673 self.need_waiting = False 674 if self.lock.locked(): 675 self.lock.release() 676 secure_mode = True 677 if no_in_queue == 0 : 678 logger.warning('Some jobs have been lost. Try to recover.') 679 logger.warning('Hitting ctrl-c will consider that all jobs are done and continue the code.') 680 try: 681 #something very bad happens 682 if update_status: 683 update_status(len(self.waiting_submission), len(self.pids) , 684 self.done) 685 time.sleep(min(5*no_in_queue, 180)) 686 no_in_queue += 1 687 if no_in_queue > 5 * 3600.0 / 162: 688 break 689 except KeyboardInterrupt: 690 break 691 692 # print a last time the status (forcing 0 for the running) 693 if update_status: 694 self.next_update = 0 695 update_status(len(self.waiting_submission), 0, self.done) 696 697 # reset variable for next submission 698 self.need_waiting = False 699 security = 0 700 while not self.lock.locked() and security < 10: 701 # check that the status is locked to avoid coincidence unlock 702 if secure_mode: 703 security = 10 704 security +=1 705 time.sleep(1) 706 if security < 10: 707 self.lock.release() 708 self.done = 0 709 self.nb_used = 0 710 self.submitted = 0 711 self.pids = [] 712 713 except KeyboardInterrupt: 714 self.remove() 715 raise 716 if self.fail_msg: 717 msg, self.fail_msg = self.fail_msg, None 718 self.remove() 719 raise Exception, msg
720 721
722 - def remove(self, error=None):
723 """Ensure that all thread are killed""" 724 logger.info('remove job currently running') 725 self.waiting_submission = [] 726 if error: 727 self.fail_msg = error 728 for pid in list(self.pids): 729 if isinstance(pid, tuple): 730 continue 731 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \ 732 % {'pid':pid} ) 733 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} ) 734 if out == 0: 735 try: 736 self.pids.remove(pid) 737 except: 738 pass 739 #out = os.system('kill -9 %s &> /dev/null' % pid) 740 741 time.sleep(1) # waiting if some were submitting at the time of ctrl-c 742 for pid in list(self.pids): 743 if isinstance(pid, tuple): 744 continue 745 out = os.system('CPIDS=$(pgrep -P %s); kill -15 $CPIDS > /dev/null 2>&1' % pid ) 746 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} ) 747 if out == 0: 748 try: 749 self.pids.remove(pid) 750 except: 751 pass
752
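# Editor's note (illustrative sketch): MultiCore mimics the Cluster API on the
# local machine, dispatching each submitted job to a thread, at most nb_core
# at a time:
#
#     mc = MultiCore(nb_core=4)
#     mc.submit('./ajob1', ['0'], cwd=run_dir, stdout=open('log.txt', 'w'))
#     mc.wait(run_dir, update_status=None)   # blocks until every job is done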
753 -class CondorCluster(Cluster):
754 """Basic class for dealing with cluster submission""" 755 756 name = 'condor' 757 job_id = 'CONDOR_ID' 758 759 760 761 @multiple_try()
762 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 763 required_output=[], nb_submit=0):
764 """Submit a job prog to a Condor cluster""" 765 766 text = """Executable = %(prog)s 767 output = %(stdout)s 768 error = %(stderr)s 769 log = %(log)s 770 %(argument)s 771 environment = CONDOR_ID=$(Cluster).$(Process) 772 Universe = vanilla 773 notification = Error 774 Initialdir = %(cwd)s 775 %(requirement)s 776 getenv=True 777 queue 1 778 """ 779 780 if self.cluster_queue not in ['None', None]: 781 requirement = 'Requirements = %s=?=True' % self.cluster_queue 782 else: 783 requirement = '' 784 785 if cwd is None: 786 cwd = os.getcwd() 787 if stdout is None: 788 stdout = '/dev/null' 789 if stderr is None: 790 stderr = '/dev/null' 791 if log is None: 792 log = '/dev/null' 793 if not os.path.exists(prog): 794 prog = os.path.join(cwd, prog) 795 if argument: 796 argument = 'Arguments = %s' % ' '.join(argument) 797 else: 798 argument = '' 799 800 801 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 802 'stderr': stderr,'log': log,'argument': argument, 803 'requirement': requirement} 804 805 open('submit_condor','w').write(text % dico) 806 a = misc.Popen(['condor_submit','submit_condor'], stdout=subprocess.PIPE) 807 output = a.stdout.read() 808 #Submitting job(s). 809 #Logging submit event(s). 810 #1 job(s) submitted to cluster 2253622. 811 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 812 try: 813 id = pat.search(output).groups()[0] 814 except: 815 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 816 % output 817 self.submitted += 1 818 self.submitted_ids.append(id) 819 return id
820 821 @store_input() 822 @multiple_try()
823 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 824 log=None, input_files=[], output_files=[], required_output=[], 825 nb_submit=0):
826 """Submit the job on the cluster NO SHARE DISK 827 input/output file should be give relative to cwd 828 """ 829 830 if not required_output and output_files: 831 required_output = output_files 832 833 if (input_files == [] == output_files): 834 return self.submit(prog, argument, cwd, stdout, stderr, log, 835 required_output=required_output, nb_submit=nb_submit) 836 837 text = """Executable = %(prog)s 838 output = %(stdout)s 839 error = %(stderr)s 840 log = %(log)s 841 %(argument)s 842 should_transfer_files = YES 843 when_to_transfer_output = ON_EXIT 844 transfer_input_files = %(input_files)s 845 %(output_files)s 846 Universe = vanilla 847 notification = Error 848 Initialdir = %(cwd)s 849 %(requirement)s 850 getenv=True 851 queue 1 852 """ 853 854 if self.cluster_queue not in ['None', None]: 855 requirement = 'Requirements = %s=?=True' % self.cluster_queue 856 else: 857 requirement = '' 858 859 if cwd is None: 860 cwd = os.getcwd() 861 if stdout is None: 862 stdout = '/dev/null' 863 if stderr is None: 864 stderr = '/dev/null' 865 if log is None: 866 log = '/dev/null' 867 if not os.path.exists(prog): 868 prog = os.path.join(cwd, prog) 869 if argument: 870 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument]) 871 else: 872 argument = '' 873 # input/output file treatment 874 if input_files: 875 input_files = ','.join(input_files) 876 else: 877 input_files = '' 878 if output_files: 879 output_files = 'transfer_output_files = %s' % ','.join(output_files) 880 else: 881 output_files = '' 882 883 884 885 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 886 'stderr': stderr,'log': log,'argument': argument, 887 'requirement': requirement, 'input_files':input_files, 888 'output_files':output_files} 889 890 open('submit_condor','w').write(text % dico) 891 a = subprocess.Popen(['condor_submit','submit_condor'], stdout=subprocess.PIPE) 892 output = a.stdout.read() 893 #Submitting job(s). 894 #Logging submit event(s). 895 #1 job(s) submitted to cluster 2253622. 896 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 897 try: 898 id = pat.search(output).groups()[0] 899 except: 900 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 901 % output 902 self.submitted += 1 903 self.submitted_ids.append(id) 904 return id
905 906 907 908 909 910 @multiple_try(nb_try=10, sleep=10)
911 - def control_one_job(self, id):
912 """ control the status of a single job with it's cluster id """ 913 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 914 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 915 stderr=subprocess.PIPE) 916 917 error = status.stderr.read() 918 if status.returncode or error: 919 raise ClusterManagmentError, 'condor_q returns error: %s' % error 920 921 return status.stdout.readline().strip()
922 923 @check_interupt() 924 @multiple_try(nb_try=10, sleep=10)
925 - def control(self, me_dir):
926 """ control the status of a single job with it's cluster id """ 927 928 if not self.submitted_ids: 929 return 0, 0, 0, 0 930 931 packet = 15000 932 idle, run, fail = 0, 0, 0 933 ongoing = [] 934 for i in range(1+(len(self.submitted_ids)-1)//packet): 935 start = i * packet 936 stop = (i+1) * packet 937 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \ 938 " -format \'%-2s\ ' \'ClusterId\' " + \ 939 " -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 940 941 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 942 stderr=subprocess.PIPE) 943 error = status.stderr.read() 944 if status.returncode or error: 945 raise ClusterManagmentError, 'condor_q returns error: %s' % error 946 947 for line in status.stdout: 948 id, status = line.strip().split() 949 ongoing.append(int(id)) 950 if status in ['I','U']: 951 idle += 1 952 elif status == 'R': 953 run += 1 954 elif status != 'C': 955 fail += 1 956 957 for id in list(self.submitted_ids): 958 if int(id) not in ongoing: 959 status = self.check_termination(id) 960 if status == 'wait': 961 run += 1 962 elif status == 'resubmit': 963 idle += 1 964 965 return idle, run, self.submitted - (idle+run+fail), fail
966 967 @multiple_try()
968 - def remove(self, *args, **opts):
969 """Clean the jobson the cluster""" 970 971 if not self.submitted_ids: 972 return 973 cmd = "condor_rm %s" % ' '.join(self.submitted_ids) 974 975 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
976
977 -class PBSCluster(Cluster):
978 """Basic class for dealing with cluster submission""" 979 980 name = 'pbs' 981 job_id = 'PBS_JOBID' 982 idle_tag = ['Q'] 983 running_tag = ['T','E','R'] 984 complete_tag = ['C'] 985 986 maximum_submited_jobs = 2500 987 988 @multiple_try()
989 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 990 required_output=[], nb_submit=0):
991 """Submit a job prog to a PBS cluster""" 992 993 994 me_dir = os.path.realpath(os.path.join(cwd,prog)).rsplit('/SubProcesses',1)[0] 995 me_dir = misc.digest(me_dir)[-14:] 996 if not me_dir[0].isalpha(): 997 me_dir = 'a' + me_dir[1:] 998 999 if len(self.submitted_ids) > self.maximum_submited_jobs: 1000 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish)) 1001 me_dir = os.path.realpath(os.path.join(cwd,prog)).rsplit('/SubProcesses',1)[0] 1002 self.wait(me_dir, fct, self.maximum_submited_jobs) 1003 1004 1005 text = "" 1006 if cwd is None: 1007 cwd = os.getcwd() 1008 else: 1009 text = " cd %s;" % cwd 1010 if stdout is None: 1011 stdout = '/dev/null' 1012 if stderr is None: 1013 stderr = '/dev/null' 1014 elif stderr == -2: # -2 is subprocess.STDOUT 1015 stderr = stdout 1016 if log is None: 1017 log = '/dev/null' 1018 1019 if not os.path.isabs(prog): 1020 text += "./%s" % prog 1021 else: 1022 text+= prog 1023 1024 if argument: 1025 text += ' ' + ' '.join(argument) 1026 1027 command = ['qsub','-o', stdout, 1028 '-N', me_dir, 1029 '-e', stderr, 1030 '-V'] 1031 1032 if self.cluster_queue and self.cluster_queue != 'None': 1033 command.extend(['-q', self.cluster_queue]) 1034 1035 a = misc.Popen(command, stdout=subprocess.PIPE, 1036 stderr=subprocess.STDOUT, 1037 stdin=subprocess.PIPE, cwd=cwd) 1038 1039 output = a.communicate(text)[0] 1040 id = output.split('.')[0] 1041 if not id.isdigit() or a.returncode !=0: 1042 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1043 % output 1044 1045 self.submitted += 1 1046 self.submitted_ids.append(id) 1047 return id
1048 1049 @multiple_try()
1050 - def control_one_job(self, id):
1051 """ control the status of a single job with it's cluster id """ 1052 cmd = 'qstat '+str(id) 1053 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1054 stderr=subprocess.STDOUT) 1055 1056 for line in status.stdout: 1057 line = line.strip() 1058 if 'cannot connect to server' in line or 'cannot read reply' in line: 1059 raise ClusterManagmentError, 'server disconnected' 1060 if 'Unknown' in line: 1061 return 'F' 1062 elif line.startswith(str(id)): 1063 jobstatus = line.split()[4] 1064 else: 1065 jobstatus="" 1066 1067 if status.returncode != 0 and status.returncode is not None: 1068 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1069 if jobstatus in self.idle_tag: 1070 return 'I' 1071 elif jobstatus in self.running_tag: 1072 return 'R' 1073 return 'F'
1074 1075 1076 @multiple_try()
1077 - def control(self, me_dir):
1078 """ control the status of a single job with it's cluster id """ 1079 cmd = "qstat" 1080 status = misc.Popen([cmd], stdout=subprocess.PIPE) 1081 1082 if me_dir.endswith('/'): 1083 me_dir = me_dir[:-1] 1084 me_dir = misc.digest(me_dir)[-14:] 1085 if not me_dir[0].isalpha(): 1086 me_dir = 'a' + me_dir[1:] 1087 ongoing = [] 1088 1089 idle, run, fail = 0, 0, 0 1090 for line in status.stdout: 1091 if 'cannot connect to server' in line or 'cannot read reply' in line: 1092 raise ClusterManagmentError, 'server disconnected' 1093 if me_dir in line: 1094 ongoing.append(line.split()[0].split('.')[0]) 1095 status2 = line.split()[4] 1096 if status2 in self.idle_tag: 1097 idle += 1 1098 elif status2 in self.running_tag: 1099 run += 1 1100 elif status2 in self.complete_tag: 1101 if not self.check_termination(line.split()[0].split('.')[0]): 1102 idle += 1 1103 else: 1104 fail += 1 1105 1106 if status.returncode != 0 and status.returncode is not None: 1107 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1108 1109 for id in list(self.submitted_ids): 1110 if id not in ongoing: 1111 status2 = self.check_termination(id) 1112 if status2 == 'wait': 1113 run += 1 1114 elif status2 == 'resubmit': 1115 idle += 1 1116 1117 return idle, run, self.submitted - (idle+run+fail), fail
1118 1119 @multiple_try()
1120 - def remove(self, *args, **opts):
1121 """Clean the jobs on the cluster""" 1122 1123 if not self.submitted_ids: 1124 return 1125 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1126 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1127
1128 1129 -class SGECluster(Cluster):
1130 """Basic class for dealing with cluster submission""" 1131 # Class written by Arian Abrahantes. 1132 1133 name = 'sge' 1134 job_id = 'JOB_ID' 1135 idle_tag = ['qw', 'hqw','hRqw','w'] 1136 running_tag = ['r','t','Rr','Rt'] 1137
1138 - def def_get_path(self,location):
1139 """replace string for path issues""" 1140 location = os.path.realpath(location) 1141 homePath = os.getenv("HOME") 1142 if homePath: 1143 location = location.replace(homePath,'$HOME') 1144 return location
1145 1146 @multiple_try()
1147 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1148 required_output=[], nb_submit=0):
1149 """Submit a job prog to an SGE cluster""" 1150 1151 me_dir = os.path.realpath(os.path.join(cwd,prog)).rsplit('/SubProcesses',1)[0] 1152 me_dir = misc.digest(me_dir)[-10:] 1153 if not me_dir[0].isalpha(): 1154 me_dir = 'a' + me_dir[1:] 1155 1156 if cwd is None: 1157 #cwd = os.getcwd() 1158 cwd = self.def_get_path(os.getcwd()) 1159 cwd1 = self.def_get_path(cwd) 1160 text = " cd %s;" % cwd1 1161 if stdout is None: 1162 stdout = '/dev/null' 1163 else: 1164 stdout = self.def_get_path(stdout) 1165 if stderr is None: 1166 stderr = '/dev/null' 1167 elif stderr == -2: # -2 is subprocess.STDOUT 1168 stderr = stdout 1169 else: 1170 stderr = self.def_get_path(stderr) 1171 1172 if log is None: 1173 log = '/dev/null' 1174 else: 1175 log = self.def_get_path(log) 1176 1177 text += prog 1178 if argument: 1179 text += ' ' + ' '.join(argument) 1180 1181 #if anything slips through argument 1182 #print "!=== inteded change ",text.replace('/srv/nfs','') 1183 #text = text.replace('/srv/nfs','') 1184 homePath = os.getenv("HOME") 1185 if homePath: 1186 text = text.replace(homePath,'$HOME') 1187 1188 logger.debug("!=== input %s" % text) 1189 logger.debug("!=== output %s" % stdout) 1190 logger.debug("!=== error %s" % stderr) 1191 logger.debug("!=== logs %s" % log) 1192 1193 command = ['qsub','-o', stdout, 1194 '-N', me_dir, 1195 '-e', stderr, 1196 '-V'] 1197 1198 if self.cluster_queue and self.cluster_queue != 'None': 1199 command.extend(['-q', self.cluster_queue]) 1200 1201 a = misc.Popen(command, stdout=subprocess.PIPE, 1202 stderr=subprocess.STDOUT, 1203 stdin=subprocess.PIPE, cwd=cwd) 1204 1205 output = a.communicate(text)[0] 1206 id = output.split(' ')[2] 1207 if not id.isdigit(): 1208 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1209 % output 1210 self.submitted += 1 1211 self.submitted_ids.append(id) 1212 logger.debug(output) 1213 1214 return id
1215 1216 @multiple_try()
1217 - def control_one_job(self, id):
1218 """ control the status of a single job with it's cluster id """ 1219 #cmd = 'qstat '+str(id) 1220 cmd = 'qstat ' 1221 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1222 for line in status.stdout: 1223 #print "!==",line 1224 #line = line.strip() 1225 #if 'Unknown' in line: 1226 # return 'F' 1227 #elif line.startswith(str(id)): 1228 # status = line.split()[4] 1229 if str(id) in line: 1230 status = line.split()[4] 1231 #print "!=status", status 1232 if status in self.idle_tag: 1233 return 'I' 1234 elif status in self.running_tag: 1235 return 'R' 1236 return 'F'
1237 1238 @multiple_try()
1239 - def control(self, me_dir):
1240 """ control the status of a single job with it's cluster id """ 1241 cmd = "qstat " 1242 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1243 1244 if me_dir.endswith('/'): 1245 me_dir = me_dir[:-1] 1246 me_dir = misc.digest(me_dir)[-10:] 1247 if not me_dir[0].isalpha(): 1248 me_dir = 'a' + me_dir[1:] 1249 1250 idle, run, fail = 0, 0, 0 1251 for line in status.stdout: 1252 if me_dir in line: 1253 status = line.split()[4] 1254 if status in self.idle_tag: 1255 idle += 1 1256 elif status in self.running_tag: 1257 run += 1 1258 else: 1259 logger.debug(line) 1260 fail += 1 1261 1262 return idle, run, self.submitted - (idle+run+fail), fail
1263 1264 1265 1266 @multiple_try()
1267 - def remove(self, *args, **opts):
1268 """Clean the jobs on the cluster""" 1269 1270 if not self.submitted_ids: 1271 return 1272 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1273 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1274
1275 1276 -class LSFCluster(Cluster):
1277 """Basic class for dealing with cluster submission""" 1278 1279 name = 'lsf' 1280 job_id = 'LSB_JOBID' 1281 1282 @multiple_try()
1283 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1284 required_output=[], nb_submit=0):
1285 """Submit the job prog to an LSF cluster""" 1286 1287 me_dir = os.path.realpath(os.path.join(cwd,prog)).rsplit('/SubProcesses',1)[0] 1288 me_dir = misc.digest(me_dir)[-14:] 1289 if not me_dir[0].isalpha(): 1290 me_dir = 'a' + me_dir[1:] 1291 1292 text = "" 1293 command = ['bsub', '-C0', '-J', me_dir] 1294 if cwd is None: 1295 cwd = os.getcwd() 1296 else: 1297 text = " cd %s;" % cwd 1298 if stdout and isinstance(stdout, str): 1299 command.extend(['-o', stdout]) 1300 if stderr and isinstance(stdout, str): 1301 command.extend(['-e', stderr]) 1302 elif stderr == -2: # -2 is subprocess.STDOUT 1303 pass 1304 if log is None: 1305 log = '/dev/null' 1306 1307 text += prog 1308 if argument: 1309 text += ' ' + ' '.join(argument) 1310 1311 if self.cluster_queue and self.cluster_queue != 'None': 1312 command.extend(['-q', self.cluster_queue]) 1313 1314 a = misc.Popen(command, stdout=subprocess.PIPE, 1315 stderr=subprocess.STDOUT, 1316 stdin=subprocess.PIPE, cwd=cwd) 1317 1318 output = a.communicate(text)[0] 1319 #Job <nnnn> is submitted to default queue <normal>. 1320 try: 1321 id = output.split('>',1)[0].split('<')[1] 1322 except: 1323 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1324 % output 1325 if not id.isdigit(): 1326 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1327 % output 1328 self.submitted += 1 1329 self.submitted_ids.append(id) 1330 return id
1331 1332 1333 @multiple_try()
1334 - def control_one_job(self, id):
1335 """ control the status of a single job with it's cluster id """ 1336 1337 cmd = 'bjobs '+str(id) 1338 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1339 1340 for line in status.stdout: 1341 line = line.strip().upper() 1342 if 'JOBID' in line: 1343 continue 1344 elif str(id) not in line: 1345 continue 1346 status = line.split()[2] 1347 if status == 'RUN': 1348 return 'R' 1349 elif status == 'PEND': 1350 return 'I' 1351 elif status == 'DONE': 1352 return 'F' 1353 else: 1354 return 'H' 1355 return 'F'
1356 1357 @multiple_try()
1358 - def control(self, me_dir):
1359 """ control the status of a single job with it's cluster id """ 1360 1361 if not self.submitted_ids: 1362 return 0, 0, 0, 0 1363 1364 cmd = "bjobs " + ' '.join(self.submitted_ids) 1365 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1366 1367 jobstatus = {} 1368 for line in status.stdout: 1369 line = line.strip() 1370 if 'JOBID' in line: 1371 continue 1372 splitline = line.split() 1373 id = splitline[0] 1374 if id not in self.submitted_ids: 1375 continue 1376 jobstatus[id] = splitline[2] 1377 1378 idle, run, fail = 0, 0, 0 1379 for id in self.submitted_ids[:]: 1380 if id in jobstatus: 1381 status = jobstatus[id] 1382 else: 1383 status = 'MISSING' 1384 if status == 'RUN': 1385 run += 1 1386 elif status == 'PEND': 1387 idle += 1 1388 else: 1389 status = self.check_termination(id) 1390 if status == 'wait': 1391 run += 1 1392 elif status == 'resubmit': 1393 idle += 1 1394 1395 return idle, run, self.submitted - (idle+run+fail), fail
1396 1397 @multiple_try()
1398 - def remove(self, *args,**opts):
1399 """Clean the jobs on the cluster""" 1400 1401 if not self.submitted_ids: 1402 return 1403 cmd = "bkill %s" % ' '.join(self.submitted_ids) 1404 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1405
1406 -class GECluster(Cluster):
1407 """Class for dealing with cluster submission on a GE cluster""" 1408 1409 name = 'ge' 1410 job_id = 'JOB_ID' 1411 idle_tag = ['qw'] 1412 running_tag = ['r'] 1413 1414 @multiple_try()
1415 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1416 required_output=[], nb_submit=0):
1417 """Submit a job prog to a GE cluster""" 1418 1419 text = "" 1420 if cwd is None: 1421 cwd = os.getcwd() 1422 else: 1423 text = " cd %s; bash " % cwd 1424 if stdout is None: 1425 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1]) 1426 if stderr is None: 1427 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1]) 1428 elif stderr == -2: # -2 is subprocess.STDOUT 1429 stderr = stdout 1430 if log is None: 1431 log = '/dev/null' 1432 1433 text += prog 1434 if argument: 1435 text += ' ' + ' '.join(argument) 1436 text += '\n' 1437 tmp_submit = os.path.join(cwd, 'tmp_submit') 1438 open(tmp_submit,'w').write(text) 1439 1440 a = misc.Popen(['qsub','-o', stdout, 1441 '-e', stderr, 1442 tmp_submit], 1443 stdout=subprocess.PIPE, 1444 stderr=subprocess.STDOUT, 1445 stdin=subprocess.PIPE, cwd=cwd) 1446 1447 output = a.communicate()[0] 1448 #Your job 874511 ("test.sh") has been submitted 1449 pat = re.compile("Your job (\d*) \(",re.MULTILINE) 1450 try: 1451 id = pat.search(output).groups()[0] 1452 except: 1453 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1454 % output 1455 self.submitted += 1 1456 self.submitted_ids.append(id) 1457 return id
1458 1459 @multiple_try()
1460 - def control_one_job(self, id):
1461 """ control the status of a single job with it's cluster id """ 1462 cmd = 'qstat | grep '+str(id) 1463 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1464 if not status: 1465 return 'F' 1466 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1467 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s") 1468 stat = '' 1469 for line in status.stdout.read().split('\n'): 1470 if not line: 1471 continue 1472 line = line.strip() 1473 try: 1474 groups = pat.search(line).groups() 1475 except: 1476 raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line 1477 if groups[0] != id: continue 1478 stat = groups[1] 1479 if not stat: 1480 return 'F' 1481 if stat in self.idle_tag: 1482 return 'I' 1483 if stat in self.running_tag: 1484 return 'R'
1485 1486 @multiple_try()
1487 - def control(self, me_dir=None):
1488 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 1489 if not self.submitted_ids: 1490 return 0, 0, 0, 0 1491 idle, run, fail = 0, 0, 0 1492 ongoing = [] 1493 for statusflag in ['p', 'r', 'sh']: 1494 cmd = 'qstat -s %s' % statusflag 1495 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1496 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1497 pat = re.compile("^(\d+)") 1498 for line in status.stdout.read().split('\n'): 1499 line = line.strip() 1500 try: 1501 id = pat.search(line).groups()[0] 1502 except Exception: 1503 pass 1504 else: 1505 if id not in self.submitted_ids: 1506 continue 1507 ongoing.append(id) 1508 if statusflag == 'p': 1509 idle += 1 1510 if statusflag == 'r': 1511 run += 1 1512 if statusflag == 'sh': 1513 fail += 1 1514 for id in list(self.submitted_ids): 1515 if id not in ongoing: 1516 self.check_termination(id) 1517 #self.submitted_ids = ongoing 1518 1519 return idle, run, self.submitted - idle - run - fail, fail
1520 1521 @multiple_try()
1522 - def remove(self, *args, **opts):
1523 """Clean the jobs on the cluster""" 1524 1525 if not self.submitted_ids: 1526 return 1527 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1528 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1529
1530 -def asyncrone_launch(exe, cwd=None, stdout=None, argument = [], **opt):
1531 """start a computation and not wait for it to finish. 1532 this fonction returns a lock which is locked as long as the job is 1533 running.""" 1534 1535 mc = MultiCore(1) 1536 mc.submit(exe, argument, cwd, stdout, **opt) 1537 mc.need_waiting = True 1538 mc.lock.acquire() 1539 return mc.lock
1540
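# Editor's note (illustrative sketch, the script name is hypothetical):
# asyncrone_launch starts the job in the background and returns the MultiCore
# lock, which stays locked while the job runs, so a caller can poll it (or
# block on acquire()) to know when it has finished:
#
#     lock = asyncrone_launch('./combine_events', cwd=run_dir)
#     while lock.locked():      # job still running
#         time.sleep(1)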
1541 1542 -class SLURMCluster(Cluster):
1543 """Basic class for dealing with cluster submission""" 1544 1545 name = 'slurm' 1546 job_id = 'SLURM_JOBID' 1547 idle_tag = ['Q','PD','S','CF'] 1548 running_tag = ['R', 'CG'] 1549 complete_tag = ['C'] 1550 1551 @multiple_try()
1552 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1553 required_output=[], nb_submit=0):
1554 """Submit a job prog to a SLURM cluster""" 1555 1556 me_dir = os.path.realpath(os.path.join(cwd,prog)).rsplit('/SubProcesses',1)[0] 1557 me_dir = misc.digest(me_dir)[-8:] 1558 1559 if not me_dir[0].isalpha(): 1560 me_dir = 'a' + me_dir[1:] 1561 1562 if cwd is None: 1563 cwd = os.getcwd() 1564 if stdout is None: 1565 stdout = '/dev/null' 1566 if stderr is None: 1567 stderr = '/dev/null' 1568 elif stderr == -2: # -2 is subprocess.STDOUT 1569 stderr = stdout 1570 if log is None: 1571 log = '/dev/null' 1572 1573 command = ['sbatch', '-o', stdout, 1574 '-J', me_dir, 1575 '-e', stderr, prog] + argument 1576 1577 if self.cluster_queue and self.cluster_queue != 'None': 1578 command.insert(1, '-p') 1579 command.insert(2, self.cluster_queue) 1580 1581 a = misc.Popen(command, stdout=subprocess.PIPE, 1582 stderr=subprocess.STDOUT, 1583 stdin=subprocess.PIPE, cwd=cwd) 1584 1585 output = a.communicate() 1586 output_arr = output[0].split(' ') 1587 id = output_arr[3].rstrip() 1588 1589 if not id.isdigit(): 1590 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1591 1592 self.submitted += 1 1593 self.submitted_ids.append(id) 1594 return id
1595 1596 @multiple_try()
1597 - def control_one_job(self, id):
1598 """ control the status of a single job with it's cluster id """ 1599 cmd = 'squeue j'+str(id) 1600 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1601 stderr=open(os.devnull,'w')) 1602 1603 for line in status.stdout: 1604 line = line.strip() 1605 if 'Invalid' in line: 1606 return 'F' 1607 elif line.startswith(str(id)): 1608 status = line.split()[4] 1609 if status in self.idle_tag: 1610 return 'I' 1611 elif status in self.running_tag: 1612 return 'R' 1613 return 'F'
1614 1615 @multiple_try()
1616 - def control(self, me_dir):
1617 """ control the status of a single job with it's cluster id """ 1618 cmd = "squeue" 1619 status = misc.Popen([cmd], stdout=subprocess.PIPE) 1620 1621 if me_dir.endswith('/'): 1622 me_dir = me_dir[:-1] 1623 me_dir = misc.digest(me_dir)[-8:] 1624 if not me_dir[0].isalpha(): 1625 me_dir = 'a' + me_dir[1:] 1626 1627 idle, run, fail = 0, 0, 0 1628 ongoing=[] 1629 for line in status.stdout: 1630 if me_dir in line: 1631 id, _, _,_ , status,_ = line.split(None,5) 1632 ongoing.append(id) 1633 if status in self.idle_tag: 1634 idle += 1 1635 elif status in self.running_tag: 1636 run += 1 1637 elif status in self.complete_tag: 1638 status = self.check_termination(id) 1639 if status == 'wait': 1640 run += 1 1641 elif status == 'resubmit': 1642 idle += 1 1643 else: 1644 fail += 1 1645 1646 #control other finished job 1647 for id in list(self.submitted_ids): 1648 if id not in ongoing: 1649 status = self.check_termination(id) 1650 if status == 'wait': 1651 run += 1 1652 elif status == 'resubmit': 1653 idle += 1 1654 1655 1656 return idle, run, self.submitted - (idle+run+fail), fail
1657 1658 @multiple_try()
1659 - def remove(self, *args, **opts):
1660 """Clean the jobs on the cluster""" 1661 1662 if not self.submitted_ids: 1663 return 1664 cmd = "scancel %s" % ' '.join(self.submitted_ids) 1665 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1666
1667 -class HTCaaSCluster(Cluster):
1668 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """ 1669 1670 name= 'htcaas' 1671 job_id = 'HTCAAS_JOBID' 1672 1673 @store_input() 1674 @multiple_try()
1675 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1676 log=None, input_files=[], output_files=[], required_output=[], 1677 nb_submit=0):
1678 """Submit the HTCaaS job on the cluster with NO SHARE DISK 1679 input/output file should be give relative to cwd 1680 """ 1681 # To make workspace name(temp) 1682 if 'ajob' in prog: 1683 prog_num = prog.rsplit("ajob",1)[1] 1684 else: 1685 prog_num = '0' 1686 1687 cur_usr = os.getenv('USER') 1688 1689 if cwd is None: 1690 cwd = os.getcwd() 1691 1692 cwd_cp = cwd.rsplit("/",2) 1693 #print 'This is HTCaaS Mode' 1694 1695 if not stdout is None: 1696 print "stdout: %s" % stdout 1697 1698 if not os.path.exists(prog): 1699 prog = os.path.join(cwd, prog) 1700 1701 if not required_output and output_files: 1702 required_output = output_files 1703 1704 1705 if not 'combine' and not 'pythia' in prog : 1706 cwd_arg = cwd+"/arguments" 1707 temp = ' '.join([str(a) for a in argument]) 1708 arg_cmd="echo '"+temp+"' > " + cwd_arg 1709 #print arg_cmd 1710 #aa = misc.Popen([arg_cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 1711 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)] 1712 if argument : 1713 command.extend(['-a ', '='.join([str(a) for a in argument])]) 1714 print command 1715 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1716 id = a.stdout.read().strip() 1717 1718 else: 1719 cwd_arg = cwd+"/arguments" 1720 temp = ' '.join([str(a) for a in argument]) 1721 #arg_cmd="echo '"+temp+"' > " + cwd_arg 1722 #print arg_cmd 1723 #aa = misc.Popen([arg_cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 1724 #print os.path.basename(prog) 1725 temp_file_name = "sub." + os.path.basename(prog) 1726 text = """#!/bin/bash 1727 MYPWD=%(cwd)s 1728 cd $MYPWD 1729 input_files=(%(input_files)s ) 1730 for i in ${input_files[@]} 1731 do 1732 chmod -f +x $i 1733 done 1734 /bin/bash %(prog)s %(arguments)s > %(stdout)s 1735 """ 1736 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog, 1737 'arguments': ' '.join([str(a) for a in argument]), 1738 'program': ' ' if '.py' in prog else 'bash'} 1739 1740 # writing a new script for the submission 1741 new_prog = pjoin(cwd, temp_file_name) 1742 open(new_prog, 'w').write(text % dico) 1743 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 1744 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name] 1745 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1746 id = a.stdout.read().strip() 1747 1748 nb_try=0 1749 nb_limit=5 1750 if not id.isdigit() : 1751 print "[ID is not digit]:" + id 1752 1753 while not id.isdigit() : 1754 nb_try+=1 1755 print "[fail_retry]:"+ nb_try 1756 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1757 id = a.stdout.read().strip() 1758 if nb_try > nb_limit : 1759 raise ClusterManagementError, 'fail to submit to the HTCaaS cluster: \n %s' % id 1760 break 1761 1762 self.submitted += 1 1763 self.submitted_ids.append(id) 1764 1765 return id
1766 1767 @multiple_try(nb_try=10, sleep=10)
1768 - def control_one_job(self, id):
1769 """ control the status of a single job with it's cluster id """ 1770 1771 if id == 0 : 1772 status_out ='C' 1773 else : 1774 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status " 1775 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE, 1776 stderr=subprocess.PIPE) 1777 error = status.stderr.read() 1778 if status.returncode or error: 1779 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error 1780 status_out= status.stdout.read().strip() 1781 status_out= status_out.split(":",1)[1] 1782 if status_out == 'waiting': 1783 status_out='I' 1784 elif status_out == 'preparing' or status_out == 'running': 1785 status_out = 'R' 1786 elif status_out != 'done': 1787 status_out = 'F' 1788 elif status_out == 'done': 1789 status_out = 'C' 1790 1791 return status_out
1792 1793 @multiple_try(nb_try=15, sleep=1)
1794 - def control(self, me_dir):
1795 """ control the status of a single job with it's cluster id """ 1796 #print "HTCaaS2 Control" 1797 if not self.submitted_ids: 1798 return 0, 0, 0, 0 1799 1800 ongoing = [] 1801 idle, run, fail = 0, 0, 0 1802 1803 if id == 0 : 1804 return 0 , 0, 0, 0 1805 else : 1806 for i in range(len(self.submitted_ids)): 1807 ongoing.append(int(self.submitted_ids[i])) 1808 cmd = "htcaas-job-status -m " + self.submitted_ids[i] + " -s | grep Status " 1809 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 1810 status_out= status.stdout.read().strip() 1811 status_out= status_out.split(":",1)[1] 1812 if status_out == 'waiting': 1813 idle += 1 1814 elif status_out == 'preparing': 1815 run += 1 1816 elif status_out == 'running': 1817 run += 1 1818 elif status_out != 'done': 1819 fail += 1 1820 1821 if status_out != 'done': 1822 print "["+ self.submitted_ids[i] + "] " + status_out 1823 ''' 1824 for i in range(len(self.submitted_ids)): 1825 if int(self.submitted_ids[i]) not in ongoing: 1826 status = self.check_termination(int(self.submitted_ids[i])) 1827 if status = 'waiting': 1828 idle += 1 1829 elif status == 'resubmit': 1830 idle += 1 1831 elif status == 'failed': 1832 fail += 1 1833 ''' 1834 1835 return idle, run, self.submitted - (idle+run+fail), fail
1836 1837 @multiple_try()
1838 - def remove(self, *args, **opts):
1839 """Clean the jobson the cluster""" 1840 1841 if not self.submitted_ids: 1842 return 1843 for i in range(len(self.submitted_ids)): 1844 cmd = "htcaas-job-cancel -m %s" % ' '.join(self.submitted_ids[i]) 1845 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1846
1847   
1848 -class HTCaaS2Cluster(Cluster):
1849 """Class for dealing with cluster submission on a HTCaaS cluster""" 1850 1851 name= 'htcaas2' 1852 job_id = 'HTCAAS2_JOBID' 1853 1854 @store_input() 1855 @multiple_try()
1856 -    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 
1857                  log=None, input_files=[], output_files=[], required_output=[], 
1858                  nb_submit=0):
1859 """Submit the job on the cluster NO SHARE DISK 1860 input/output file should be give relative to cwd 1861 """ 1862 # To make workspace name(temp) 1863 if 'ajob' in prog: 1864 prog_num = prog.rsplit("ajob",1)[1] 1865 elif 'run_combine' in prog: 1866 prog_num = '0' 1867 else: 1868 prog_num = prog 1869 1870 cur_usr = os.getenv('USER') 1871 1872 import uuid 1873 dir = str(uuid.uuid4().hex) 1874 #dir = str(int(time())) 1875 prog_dir = '_run%s'% prog_num 1876 prog_dir = dir+prog_dir 1877 1878 if cwd is None: 1879 cwd = os.getcwd() 1880 1881 cwd_cp = cwd.rsplit("/",2) 1882 1883 if stdout is None: 1884 stdout='/dev/null' 1885 1886 if not os.path.exists(prog): 1887 prog = os.path.join(cwd, prog) 1888 1889 if not required_output and output_files: 1890 required_output = output_files 1891 1892 if '/' in argument : 1893 temp_file_name = "sub." + os.path.basename(prog) 1894 else : 1895 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument) 1896 1897 1898 if 'combine' in prog or 'pythia' in prog : 1899 text = """#!/bin/bash 1900 MYPWD=%(cwd)s 1901 cd $MYPWD 1902 script=%(script)s 1903 input_files=(%(input_files)s ) 1904 if [ $# -ge 1 ]; then 1905 arg1=$1 1906 else 1907 arg1='' 1908 fi 1909 args=' %(arguments)s' 1910 for i in ${input_files[@]}; do 1911 if [[ "$i" == *$script* ]]; then 1912 script=$i 1913 fi 1914 chmod -f +x $i 1915 done 1916 /bin/bash ${script} ${args} > %(stdout)s 1917 """ 1918 1919 elif 'shower' in prog : 1920 text = """#!/bin/bash 1921 MYPWD=%(cwd)s 1922 cd $MYPWD 1923 args=' %(arguments)s' 1924 input_files=( %(input_files)s ) 1925 for i in ${input_files[@]} 1926 do 1927 chmod -f +x $i 1928 done 1929 /bin/bash %(script)s ${args} > $MYPWD/done 1930 """ 1931 1932 else : 1933 text = """#!/bin/bash 1934 MYPWD=%(cwd)s 1935 #mkdir -p $MYTMP 1936 cd $MYPWD 1937 input_files=( %(input_files)s ) 1938 for i in ${input_files[@]} 1939 do 1940 if [[ $i != */*/* ]]; then 1941 i=$PWD"/"$i 1942 fi 1943 echo $i 1944 if [ -d $i ]; then 1945 htcaas-file-put -l $i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -i %(cur_usr)s 1946 else 1947 htcaas-file-put -f $i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -i %(cur_usr)s 1948 fi 1949 done 1950 """ 1951 1952 dico = {'cur_usr' : cur_usr, 'script': os.path.basename(prog), 1953 'cwd': cwd, 'job_id': self.job_id, 'prog_dir': prog_dir, 1954 'input_files': ' '.join(input_files + [prog]), 1955 'output_files': ' '.join(output_files), 'stdout': stdout, 1956 'arguments': ' '.join([str(a) for a in argument]), 1957 'program': ' ' if '.py' in prog else 'bash'} 1958 1959 # writing a new script for the submission 1960 new_prog = pjoin(cwd, temp_file_name) 1961 open(new_prog, 'w').write(text % dico) 1962 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 1963 1964 # print temp_file_name 1965 cmd1='/bin/bash '+ cwd+'/'+temp_file_name 1966 status1 = misc.Popen([cmd1], shell=True, stdout=subprocess.PIPE, 1967 stderr=subprocess.PIPE) 1968 #print '%s' % status1.stdout.read() 1969 1970 1971 if not 'combine' in prog and not 'shower' in prog and not 'pythia' in prog: 1972 1973 cmd3 = """htcaas-mgjob-submit -d /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -e %(script)s %(arguments)s""" 1974 dico3 = {'cur_usr' : cur_usr, 'script': os.path.basename(prog), 1975 'arguments': ' ' if not argument else "-a " + '='.join([str(a) for a in argument]) , 1976 'prog_dir': prog_dir } 1977 status3 = misc.Popen([cmd3 % dico3], shell=True, stdout=subprocess.PIPE, 1978 stderr=subprocess.PIPE) 1979 id = status3.stdout.read().strip() 1980 ## exception 1981 nb_try=0 1982 
1983              while not id.isdigit() : 
1984                  nb_try+=1 
1985                  a=misc.Popen( [cmd3 % dico3], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 
1986                  id = a.stdout.read().strip() 
1987                  if nb_try > nb_limit : 
1988                      raise ClusterManagmentError, 'Failed to submit to the HTCaaS cluster: \n %s' % id 
1989                      break 
1990   
1991              temp_file_name2 = "sub." +id 
1992              text2 = """#!/bin/bash 
1993  MYPWD=%(cwd)s 
1994  output_files=( %(output_files)s ) 
1995  result=done 
1996  if [ ! -e ${MYPWD}/done.%(job_id)s ]; then 
1997      for i in ${output_files[@]} 
1998      do 
1999          htcaas-file-get -l ${MYPWD}/$i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/$i -i %(cur_usr)s 
2000          chmod -Rf 777 ${MYPWD}/$i 
2001      done 
2002      for i in ${output_files[@]}; do 
2003          if [[ -e ${MYPWD}/$i ]]; then 
2004              result=done 
2005          else 
2006              result=running 
2007              echo $result 
2008              exit 0 
2009          fi 
2010      done 
2011      echo $result 
2012      touch ${MYPWD}/done.%(job_id)s 
2013  else 
2014      for i in ${output_files[@]}; do 
2015          if [ -e ${MYPWD}/$i ]; then 
2016              result=done 
2017          else 
2018              rm -f ${MYPWD}/done.%(job_id)s 
2019              result=running 
2020              echo $result 
2021              exit 0 
2022          fi 
2023      done 
2024      echo $result 
2025   
2026  fi 
2027   
2028  """ 
2029              dico2 = {'cur_usr' : cur_usr, 'script': os.path.basename(prog), 
2030                       'cwd': cwd, 'prog_dir': prog_dir, 
2031                       'output_files': ' '.join(output_files), 'job_id': id, 
2032                       'program': ' ' if '.py' in prog else 'bash'} 
2033   
2034              homePath = os.getenv("HOME") 
2035              outPath = homePath +"/MG5" 
2036   
2037              new_prog2 = pjoin(outPath, temp_file_name2) 
2038              open(new_prog2, 'w').write(text2 % dico2) 
2039              misc.Popen(['chmod','+x',new_prog2],cwd=cwd) 
2040   
2041   
2042              self.submitted += 1 
2043              self.submitted_ids.append(id) 
2044   
2045          elif 'combine' in prog or 'shower' in prog or 'pythia' in prog: 
2046              if '/dev/null' in stdout : 
2047                  stdout='' 
2048   
2049              temp_file_shower = "sub.out" 
2050              text_shower = """#!/bin/bash 
2051  MYPWD=%(cwd)s 
2052  result=done 
2053  output_files=(%(output_files)s) 
2054  for i in ${output_files[@]}; do 
2055      if [ -e $MYPWD/$i -o -e $i ]; then 
2056          result=done 
2057      else 
2058          result=running 
2059          echo $result 
2060          exit 0 
2061      fi 
2062  done 
2063  echo $result 
2064  """ 
2065              dico_shower = { 'cwd': cwd, 'output_files': ' '.join([stdout]+output_files), 
2066                              'program': ' ' if '.py' in prog else 'bash'} 
2067              homePath = os.getenv("HOME") 
2068              outPath = homePath +"/MG5" 
2069              new_prog_shower = pjoin(outPath, temp_file_shower) 
2070              open(new_prog_shower, 'w').write(text_shower % dico_shower) 
2071              misc.Popen(['chmod','+x',new_prog_shower],cwd=cwd) 
2072   
2073              id='-1' 
2074              self.submitted += 1 
2075              self.submitted_ids.append(id) 
2076   
2077          else : 
2078              id='-2' 
2079              self.submitted += 1 
2080              self.submitted_ids.append(id) 
2081   
2082          return id
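    # Summary comment (added for clarity, not in the original source): submit2
    # writes up to three helper scripts per job.  A local 'sub.<prog>' script
    # stages the input files into the remote /pwork01 workspace with
    # htcaas-file-put and is executed immediately; the job itself is then
    # launched in that workspace with htcaas-mgjob-submit; finally a 'sub.<id>'
    # script written under $HOME/MG5 fetches the output files back with
    # htcaas-file-get and is run later by control() to decide whether the job
    # has really finished.  Combine/shower/pythia jobs skip the remote
    # submission and only get a local 'sub.out' completion check (id '-1'),
    # while any other case is marked '-2' and treated as already done.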
2083   
2084      @multiple_try(nb_try=10, sleep=10) 
2085 - def control_one_job(self, id):
2086 """ control the status of a single job with it's cluster id """ 2087 2088 homePath = os.getenv("HOME") 2089 outPath = homePath +"/MG5" 2090 2091 2092 if id == '0' or id=='-2' : 2093 status_out ='done' 2094 elif id == '-1' : 2095 cmd='/bin/bash ' +outPath+'/sub.out' 2096 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 2097 status_out=status.stdout.read().strip() 2098 print "["+id+"]" + status_out 2099 if status_out == 'waiting': 2100 status_out='wait' 2101 elif status_out == 'preparing' or status_out == 'running': 2102 status_out = 'R' 2103 elif status_out != 'done': 2104 status_out = 'F' 2105 elif status_out == 'done': 2106 status_out = 'C' 2107 2108 print "["+id+"]" + status_out 2109 else : 2110 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status " 2111 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 2112 stderr=subprocess.PIPE) 2113 error = status.stderr.read() 2114 if status.returncode or error: 2115 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error 2116 status_out= status.stdout.read().strip() 2117 status_out= status_out.split(":",1)[1] 2118 print "["+id+"]" + status_out 2119 if status_out == 'waiting': 2120 status_out='wait' 2121 elif status_out == 'preparing' or status_out == 'running': 2122 status_out = 'R' 2123 elif status_out == 'failed' : 2124 args = self.retry_args[id] 2125 id_temp = self.submit2(**args) 2126 del self.retry_args[id] 2127 self.submitted_ids.remove(id) 2128 status_out = 'I' 2129 elif status_out != 'done': 2130 status_out = 'F' 2131 elif status_out == 'done': 2132 status_out = 'C' 2133 2134 return status_out
2135   
2136   
2137      @check_interupt() 
2138      @multiple_try(nb_try=15, sleep=10) 
2139 - def control(self, me_dir):
2140 """ control the status of a single job with it's cluster id """ 2141 2142 if not self.submitted_ids: 2143 return 0, 0, 0, 0 2144 2145 ongoing = [] 2146 idle, run, fail = 0, 0, 0 2147 2148 homePath = os.getenv("HOME") 2149 outPath = homePath +"/MG5" 2150 2151 for i in range(len(self.submitted_ids)): 2152 ongoing.append(self.submitted_ids[i]) 2153 if self.submitted_ids[i] == '-2' : 2154 return 0,0,0,0 2155 if self.submitted_ids[i] == '0' : 2156 # ongoing.append('0') 2157 status_out='done' 2158 elif self.submitted_ids[i] == '-1' : 2159 cmd='/bin/bash ' +outPath+'/sub.out' 2160 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 2161 status_out=status.stdout.read().strip() 2162 if status_out == 'waiting': 2163 idle += 1 2164 elif status_out == 'preparing': 2165 run += 1 2166 elif status_out == 'running': 2167 run += 1 2168 elif status_out != 'done': 2169 fail += 1 2170 else : 2171 args = self.retry_args[str(self.submitted_ids[i])] 2172 if 'required_output'in args and not args['required_output']: 2173 args['required_output'] = args['output_files'] 2174 self.retry_args[str(self.submitted_ids[i])] = args 2175 2176 cmd = "htcaas-job-status -m " + self.submitted_ids[i] + " -s | grep Status " 2177 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 2178 status_out= status.stdout.read().strip() 2179 status_out= status_out.split(":",1)[1] 2180 if status_out == 'waiting': 2181 idle += 1 2182 elif status_out == 'preparing': 2183 run += 1 2184 elif status_out == 'running': 2185 run += 1 2186 elif status_out == 'failed' or status_out == 'canceled': 2187 id = self.submit2(**args) 2188 #self.submitted_ids[i]=id 2189 del self.retry_args[self.submitted_ids[i]] 2190 self.submitted_ids.remove(self.submitted_ids[i]) 2191 self.submitted-=1 2192 idle += 1 2193 elif status_out != 'done': 2194 fail += 1 2195 if status_out == 'done': 2196 cmd2='/bin/bash '+ outPath+'/sub.'+self.submitted_ids[i] 2197 status2 = misc.Popen([cmd2], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 2198 aa= status2.stdout.read().strip() 2199 #result= self.check_termination(str(self.submitted_ids[i])) 2200 #print result 2201 #if not result : 2202 #if not self.check_termination(str(self.submitted_ids[i])): 2203 # print "not_self" + self.submitted_ids[i] 2204 # idle += 1 2205 #else : 2206 for path in args['required_output']: 2207 if args['cwd']: 2208 path = pjoin(args['cwd'], path) 2209 # check that file exists and is not empty. 2210 temp1=os.path.exists(path) 2211 temp2=os.stat(path).st_size 2212 if not (os.path.exists(path) and os.stat(path).st_size != 0) : 2213 status2 = misc.Popen([cmd2], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 2214 aa= status2.stdout.read().strip() 2215 if aa == 'done': 2216 self.submitted_ids[i] = '0' 2217 elif aa == 'running': 2218 run += 1 2219 else : 2220 self.submitted_ids[i]='0' 2221 2222 2223 for i in range(len(self.submitted_ids)): 2224 if str(self.submitted_ids[i]) not in ongoing: 2225 status2= self.check_termination(str(self.submitted_ids[i])) 2226 if status2 == 'wait': 2227 run += 1 2228 elif status2 == 'resubmit': 2229 idle += 1 2230 2231 return idle, run, self.submitted - (idle+run+fail), fail
2232   
2233      @multiple_try() 
2234 - def remove(self, *args, **opts):
2235 """Clean the jobson the cluster""" 2236 2237 if not self.submitted_ids: 2238 return 2239 for i in range(len(self.submitted_ids)): 2240 cmd = "htcaas-job-cancel -m %s" % ' '.join(self.submitted_ids[i]) 2241 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
2242   
2243   
2244  from_name = {'condor':CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster, 
2245               'lsf': LSFCluster, 'ge':GECluster, 'slurm': SLURMCluster, 
2246               'htcaas':HTCaaSCluster, 'htcaas2':HTCaaS2Cluster} 
2247   
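# Hedged usage sketch (not part of the original module): a backend class is
# normally looked up by its name in from_name and instantiated with the
# cluster options; the option names below follow the generic Cluster
# constructor keys and the values are illustrative only.
#
#   cluster_cls = from_name['htcaas2']
#   mycluster = cluster_cls(cluster_queue='madgraph',
#                           cluster_nb_retry=3,
#                           cluster_retry_wait=300)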