
Source Code for Module madgraph.various.cluster

   1  ################################################################################ 
   2  # Copyright (c) 2009 The MadGraph5_aMC@NLO Development team and Contributors              
   3  # 
   4  # This file is a part of the MadGraph5_aMC@NLO project, an application which            
   5  # automatically generates Feynman diagrams and matrix elements for arbitrary     
   6  # high-energy processes in the Standard Model and beyond.                        
   7  # 
   8  # It is subject to the MadGraph5_aMC@NLO license which should accompany this              
   9  # distribution.                                                                  
  10  #                                                                                
  11  # For more information, visit madgraph.phys.ucl.ac.be and amcatnlo.web.cern.ch             
  12  #                                                                                
  13  ################################################################################ 
  14  import subprocess 
  15  import logging 
  16  import os 
  17  import time 
  18  import re 
  19  import glob 
  20  import inspect 
  21   
  22  logger = logging.getLogger('madgraph.cluster')  
  23   
  24  try: 
  25      from madgraph import MadGraph5Error 
  26      import madgraph.various.misc as misc 
  27  except Exception, error: 
  28      if __debug__: 
  29          print  str(error) 
  30      from internal import MadGraph5Error 
  31      import internal.misc as misc 
  32   
  33  pjoin = os.path.join 
34 35 -class ClusterManagmentError(MadGraph5Error):
36 pass
37
38 -class NotImplemented(MadGraph5Error):
39 pass
40 41 42 multiple_try = misc.multiple_try 43 pjoin = os.path.join
44 45 46 -def check_interupt(error=KeyboardInterrupt):
47 48 def deco_interupt(f): 49 def deco_f_interupt(self, *args, **opt): 50 try: 51 return f(self, *args, **opt) 52 except error: 53 try: 54 self.remove(*args, **opt) 55 except Exception: 56 pass 57 raise error
58 return deco_f_interupt 59 return deco_interupt 60
61 -def store_input(arg=''):
62 63 def deco_store(f): 64 def deco_f_store(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 65 input_files=[], output_files=[], required_output=[], nb_submit=0): 66 frame = inspect.currentframe() 67 args, _, _, values = inspect.getargvalues(frame) 68 args = dict([(i, values[i]) for i in args if i != 'self']) 69 id = f(self, **args) 70 if self.nb_retry > 0: 71 self.retry_args[id] = args 72 return id
73 return deco_f_store 74 return deco_store 75
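The two decorators above are applied to the submission methods of the Cluster class that follows: check_interupt turns a Ctrl-C (or other chosen error) raised inside the wrapped call into a call to self.remove() before re-raising, and store_input records the arguments of a submission so that a failed job can later be resubmitted with the same settings. A minimal sketch of the resulting behaviour, using a hypothetical DummyCluster that is not part of this module, could look like this (assuming the default scheduling options):

# Hypothetical illustration only; DummyCluster is not shipped with this module.
class DummyCluster(Cluster):
    name = 'dummy'
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        # pretend the scheduler returned an id
        self.submitted += 1
        self.submitted_ids.append(str(self.submitted))
        return str(self.submitted)

cluster = DummyCluster(cluster_nb_retry=2)
job_id = cluster.submit2('./run.sh', argument=['1'])
# Because submit2 is wrapped by store_input() and nb_retry > 0,
# the submission arguments are kept for a possible resubmission:
assert job_id in cluster.retry_args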
76 77 -class Cluster(object):
78 """Basic Class for all cluster type submission""" 79 name = 'mother class' 80
81 - def __init__(self,*args, **opts):
82 """Init the cluster""" 83 84 self.submitted = 0 85 self.submitted_ids = [] 86 self.finish = 0 87 if 'cluster_queue' in opts: 88 self.cluster_queue = opts['cluster_queue'] 89 else: 90 self.cluster_queue = 'madgraph' 91 if 'cluster_temp_path' in opts: 92 self.temp_dir = opts['cluster_temp_path'] 93 else: 94 self.temp_dir = None 95 self.options = {'cluster_status_update': (600, 30)} 96 for key,value in opts.items(): 97 self.options[key] = value 98 self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0 99 self.cluster_retry_wait = opts['cluster_retry_wait'] if 'cluster_retry_wait' in opts else 300 100 self.options = dict(opts) 101 self.retry_args = {}
102 103
104 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 105 log=None, required_output=[], nb_submit=0):
106 """How to make one submission. Return status id on the cluster.""" 107 raise NotImplemented, 'No implementation of how to submit a job to cluster \'%s\'' % self.name
108 109 @store_input()
110 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 111 log=None, input_files=[], output_files=[], required_output=[],nb_submit=0):
112 """How to make one submission. Return status id on the cluster. 113 NO SHARE DISK""" 114 115 if cwd is None: 116 cwd = os.getcwd() 117 if not os.path.exists(prog): 118 prog = os.path.join(cwd, prog) 119 120 if not required_output and output_files: 121 required_output = output_files 122 123 if not hasattr(self, 'temp_dir') or not self.temp_dir or \ 124 (input_files == [] == output_files): 125 return self.submit(prog, argument, cwd, stdout, stderr, log, 126 required_output=required_output, nb_submit=nb_submit) 127 128 if not input_files and not output_files: 129 # not input/output so not using submit2 130 return self.submit(prog, argument, cwd, stdout, stderr, log, 131 required_output=required_output, nb_submit=nb_submit) 132 133 if cwd is None: 134 cwd = os.getcwd() 135 if not os.path.exists(prog): 136 prog = os.path.join(cwd, prog) 137 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument) 138 139 text = """#!/bin/bash 140 MYTMP=%(tmpdir)s/run$%(job_id)s 141 MYPWD=%(cwd)s 142 mkdir -p $MYTMP 143 cd $MYPWD 144 input_files=( %(input_files)s ) 145 for i in ${input_files[@]} 146 do 147 cp -R -L $i $MYTMP 148 done 149 cd $MYTMP 150 echo '%(arguments)s' > arguments 151 chmod +x ./%(script)s 152 %(program)s ./%(script)s %(arguments)s 153 output_files=( %(output_files)s ) 154 for i in ${output_files[@]} 155 do 156 cp -r $MYTMP/$i $MYPWD 157 done 158 rm -rf $MYTMP 159 """ 160 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog), 161 'cwd': cwd, 'job_id': self.job_id, 162 'input_files': ' '.join(input_files + [prog]), 163 'output_files': ' '.join(output_files), 164 'arguments': ' '.join([str(a) for a in argument]), 165 'program': ' ' if '.py' in prog else 'bash'} 166 167 # writing a new script for the submission 168 new_prog = pjoin(cwd, temp_file_name) 169 open(new_prog, 'w').write(text % dico) 170 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 171 172 return self.submit(new_prog, argument, cwd, stdout, stderr, log, 173 required_output=required_output, nb_submit=nb_submit)
174 175
176 - def control(self, me_dir=None):
177 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 178 if not self.submitted_ids: 179 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name 180 idle, run, fail = 0, 0, 0 181 for pid in self.submitted_ids[:]: 182 status = self.control_one_job(id) 183 if status == 'I': 184 idle += 1 185 elif status == 'R': 186 run += 1 187 elif status == 'F': 188 self.finish +=1 189 self.submitted_ids.remove(pid) 190 else: 191 fail += 1 192 193 return idle, run, self.finish, fail
194
195 - def control_one_job(self, pid):
196 """ control the status of a single job with it's cluster id """ 197 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
198 199 @check_interupt()
200 - def wait(self, me_dir, fct, minimal_job=0):
201 """Wait that all job are finish. 202 if minimal_job set, then return if idle + run is lower than that number""" 203 204 205 mode = 1 # 0 is long waiting/ 1 is short waiting 206 nb_iter = 0 207 nb_short = 0 208 change_at = 5 # number of iteration from which we wait longer between update. 209 #usefull shortcut for readibility 210 longtime, shorttime = self.options['cluster_status_update'] 211 212 while 1: 213 old_mode = mode 214 nb_iter += 1 215 idle, run, finish, fail = self.control(me_dir) 216 if fail: 217 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team') 218 if idle + run == 0: 219 #time.sleep(20) #security to ensure that the file are really written on the disk 220 logger.info('All jobs finished') 221 break 222 if idle + run < minimal_job: 223 return 224 fct(idle, run, finish) 225 #Determine how much we have to wait (mode=0->long time, mode=1->short time) 226 if nb_iter < change_at: 227 mode = 1 228 elif idle < run: 229 if old_mode == 0: 230 if nb_short: 231 mode = 0 #we already be back from short to long so stay in long 232 #check if we need to go back to short mode 233 elif idle: 234 if nb_iter > change_at + int(longtime)//shorttime: 235 mode = 0 #stay in long waiting mode 236 else: 237 mode = 1 # pass in short waiting mode 238 nb_short =0 239 else: 240 mode = 1 # pass in short waiting mode 241 nb_short = 0 242 elif old_mode == 1: 243 nb_short +=1 244 if nb_short > 3* max(change_at, int(longtime)//shorttime): 245 mode = 0 #go back in slow waiting 246 else: 247 mode = 0 248 249 #if pass from fast(mode=1) to slow(mode=0) make a print statement: 250 if old_mode > mode: 251 logger.info('''Start to wait %ss between checking status. 252 Note that you can change this time in the configuration file. 253 Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0]) 254 255 #now Waiting! 256 if mode == 0: 257 try: 258 time.sleep(self.options['cluster_status_update'][0]) 259 except KeyboardInterrupt: 260 logger.info('start to update the status') 261 nb_iter = min(0, change_at -2) 262 nb_short = 0 263 else: 264 time.sleep(self.options['cluster_status_update'][1]) 265 266 267 self.submitted = 0 268 self.submitted_ids = []
269
270 - def check_termination(self, job_id):
271 """Check the termination of the jobs with job_id and relaunch it if needed.""" 272 273 274 if job_id not in self.retry_args: 275 return True 276 277 args = self.retry_args[job_id] 278 if 'time_check' in args: 279 time_check = args['time_check'] 280 else: 281 time_check = 0 282 283 for path in args['required_output']: 284 if args['cwd']: 285 path = pjoin(args['cwd'], path) 286 # check that file exists and is not empty. 287 if not (os.path.exists(path) and os.stat(path).st_size != 0) : 288 break 289 else: 290 # all requested output are present 291 if time_check > 0: 292 logger.info('Job %s Finally found the missing output.' % (job_id)) 293 del self.retry_args[job_id] 294 self.submitted_ids.remove(job_id) 295 return 'done' 296 297 if time_check == 0: 298 logger.debug('''Job %s: missing output:%s''' % (job_id,path)) 299 args['time_check'] = time.time() 300 return 'wait' 301 elif self.cluster_retry_wait > time.time() - time_check: 302 return 'wait' 303 304 #jobs failed to be completed even after waiting time!! 305 if self.nb_retry < 0: 306 logger.critical('''Fail to run correctly job %s. 307 with option: %s 308 file missing: %s''' % (job_id, args, path)) 309 raw_input('press enter to continue.') 310 elif self.nb_retry == 0: 311 logger.critical('''Fail to run correctly job %s. 312 with option: %s 313 file missing: %s. 314 Stopping all runs.''' % (job_id, args, path)) 315 #self.remove() 316 elif args['nb_submit'] >= self.nb_retry: 317 logger.critical('''Fail to run correctly job %s. 318 with option: %s 319 file missing: %s 320 Fails %s times 321 No resubmition. ''' % (job_id, args, path, args['nb_submit'])) 322 #self.remove() 323 else: 324 args['nb_submit'] += 1 325 logger.warning('resubmit job (for the %s times)' % args['nb_submit']) 326 del self.retry_args[job_id] 327 self.submitted_ids.remove(job_id) 328 if 'time_check' in args: 329 del args['time_check'] 330 self.submit2(**args) 331 return 'resubmit' 332 return 'done'
333 334 335 336 @check_interupt()
337 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 338 stderr=None, log=None, required_output=[], nb_submit=0, 339 input_files=[], output_files=[]):
340 """launch one job on the cluster and wait for it""" 341 342 special_output = False # tag for concatenate the error with the output. 343 if stderr == -2 and stdout: 344 #We are suppose to send the output to stdout 345 special_output = True 346 stderr = stdout + '.err' 347 348 id = self.submit2(prog, argument, cwd, stdout, stderr, log, 349 required_output=required_output, input_files=input_files, 350 output_files=output_files) 351 352 frame = inspect.currentframe() 353 args, _, _, values = inspect.getargvalues(frame) 354 args = dict([(i, values[i]) for i in args if i != 'self']) 355 self.retry_args[id] = args 356 357 nb_wait=0 358 while 1: 359 nb_wait+=1 360 status = self.control_one_job(id) 361 if not status in ['R','I']: 362 status = self.check_termination(id) 363 if status in ['wait']: 364 time.sleep(30) 365 continue 366 elif status in ['resubmit']: 367 id = self.submitted_ids[0] 368 time.sleep(30) 369 continue 370 #really stop! 371 time.sleep(30) #security to ensure that the file are really written on the disk 372 break 373 time.sleep(self.options['cluster_status_update'][1]) 374 375 if required_output: 376 status = self.check_termination(id) 377 if status == 'wait': 378 run += 1 379 elif status == 'resubmit': 380 idle += 1 381 382 383 if special_output: 384 # combine the stdout and the stderr 385 #wait up to 50 s to see if those files exists 386 for i in range(5): 387 if os.path.exists(stdout): 388 if not os.path.exists(stderr): 389 time.sleep(5) 390 if os.path.exists(stderr): 391 err_text = open(stderr).read() 392 if not err_text: 393 return 394 logger.warning(err_text) 395 text = open(stdout).read() 396 open(stdout,'w').write(text + err_text) 397 else: 398 return 399 time.sleep(10)
400
401 - def remove(self, *args, **opts):
402 """ """ 403 logger.warning("""This cluster didn't support job removal, 404 the jobs are still running on the cluster.""")
405
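The concrete back-ends below (MultiCore, CondorCluster, PBSCluster, SGECluster, LSFCluster, GECluster, SLURMCluster, HTCaaS) specialise this base class by providing at least submit(), control_one_job() and remove(). A purely illustrative subclass that runs every job synchronously on the local machine, shown only to make the contract explicit (it is not one of the back-ends shipped with the module), might look like:

class LocalCluster(Cluster):
    """Hypothetical back-end: run each job synchronously with misc.call."""
    name = 'local'
    job_id = 'LOCAL_ID'

    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        # stdout/stderr redirection omitted for brevity in this sketch
        misc.call([prog] + argument, cwd=cwd)
        self.submitted += 1
        self.submitted_ids.append(str(self.submitted))
        return str(self.submitted)

    def control_one_job(self, pid):
        return 'F'   # the job already ran to completion inside submit()

    def remove(self, *args, **opts):
        pass         # nothing to clean up for local synchronous jobs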
406 -class MultiCore(Cluster):
407 """ class for dealing with the submission in multiple node""" 408 409 job_id = '$' 410
411 - def __init__(self, *args, **opt):
412 """Init the cluster""" 413 import thread 414 super(MultiCore, self).__init__(self, *args, **opt) 415 416 417 self.submitted = 0 418 self.finish = 0 419 if 'nb_core' in opt: 420 self.nb_core = opt['nb_core'] 421 elif isinstance(args[0],int): 422 self.nb_core = args[0] 423 else: 424 self.nb_core = 1 425 self.update_fct = None 426 427 # initialize the thread controler 428 self.need_waiting = False 429 self.nb_used = 0 430 self.lock = thread.allocate_lock() 431 self.done = 0 432 self.waiting_submission = [] 433 self.pids = [] 434 self.fail_msg = None
435
436 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 437 stderr=None, log=None, **opts):
438 """launch one job and wait for it""" 439 if isinstance(stdout, str): 440 stdout = open(stdout, 'w') 441 if isinstance(stderr, str): 442 stdout = open(stderr, 'w') 443 return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)
444 445
446 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 447 log=None, required_output=[], nb_submit=0):
448 """submit a job on multicore machine""" 449 450 self.submitted +=1 451 if cwd is None: 452 cwd = os.getcwd() 453 if isinstance(prog, str): 454 if not os.path.exists(prog) and not misc.which(prog): 455 prog = os.path.join(cwd, prog) 456 457 import thread 458 if self.waiting_submission or self.nb_used == self.nb_core: 459 self.waiting_submission.append((prog, argument,cwd, stdout)) 460 # check that none submission is already finished 461 while self.nb_used < self.nb_core and self.waiting_submission: 462 arg = self.waiting_submission.pop(0) 463 self.nb_used += 1 # udpate the number of running thread 464 thread.start_new_thread(self.launch, arg) 465 elif self.nb_used < self.nb_core -1: 466 self.nb_used += 1 # upate the number of running thread 467 thread.start_new_thread(self.launch, (prog, argument, cwd, stdout)) 468 elif self.nb_used == self.nb_core -1: 469 self.nb_used += 1 # upate the number of running thread 470 thread.start_new_thread(self.launch, (prog, argument, cwd, stdout))
471 472
473 - def launch(self, exe, argument, cwd, stdout):
474 """ way to launch for multicore. If exe is a string then treat it as 475 an executable. Otherwise treat it as a function""" 476 import thread 477 def end(self, pid): 478 self.nb_used -= 1 479 self.done += 1 480 try: 481 self.pids.remove(pid) 482 except: 483 pass
484 485 fail_msg = None 486 try: 487 if isinstance(exe,str): 488 if os.path.exists(exe) and not exe.startswith('/'): 489 exe = './' + exe 490 proc = misc.Popen([exe] + argument, cwd=cwd, stdout=stdout, 491 stderr=subprocess.STDOUT) 492 pid = proc.pid 493 self.pids.append(pid) 494 proc.wait() 495 if proc.returncode not in [0, 143, -15]: 496 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \ 497 (' '.join([exe]+argument), proc.returncode) 498 #self.fail_msg = fail_msg 499 logger.warning(fail_msg) 500 try: 501 log = open(glob.glob(pjoin(cwd,'*','log.txt'))[0]).read() 502 logger.warning('Last 15 lines of logfile %s:\n%s\n' % \ 503 (pjoin(cwd,'*','log.txt'), '\n'.join(log.split('\n')[-15:-1]) + '\n')) 504 except IOError, AttributeError: 505 logger.warning('Please look for possible logfiles in %s' % cwd) 506 pass 507 self.remove(fail_msg) 508 else: 509 pid = tuple([id(o) for o in [exe] + argument]) 510 self.pids.append(pid) 511 # the function should return 0 if everything is fine 512 # the error message otherwise 513 returncode = exe(argument) 514 if returncode != 0: 515 logger.warning(returncode) 516 self.remove() 517 518 519 520 # release the lock for allowing to launch the next job 521 security = 0 522 # check that the status is locked to avoid coincidence unlock 523 while 1: 524 while not self.lock.locked(): 525 if not self.need_waiting: 526 # Main is not yet locked 527 end(self, pid) 528 return 529 elif security > 60: 530 end(self, pid) 531 return 532 security += 1 533 time.sleep(1) 534 try: 535 self.lock.release() 536 except thread.error: 537 continue 538 break 539 end(self, pid) 540 541 542 except Exception, error: 543 #logger.critical('one core fails with %s' % error) 544 self.remove() 545 raise
546 547 548 549
550 - def wait(self, me_dir, update_status):
551 """Wait that all thread finish 552 self.nb_used and self.done are update via each jobs (thread and local) 553 self.submitted is the nb of times that submitted has been call (local) 554 remaining is the nb of job that we still have to wait. (local) 555 self.pids is the list of the BASH pid of the submitted jobs. (thread) 556 557 WARNING: In principle all those value are coherent but since some are 558 modified in various thread, those data can be corrupted. (not the local 559 one). Nb_used in particular shouldn't be trusted too much. 560 This code check in different ways that all jobs have finished. 561 562 In principle, the statement related to '#security #X' are not used. 563 In practise they are times to times. 564 """ 565 566 import thread 567 568 remaining = self.submitted - self.done 569 570 while self.nb_used < self.nb_core: 571 if self.waiting_submission: 572 arg = self.waiting_submission.pop(0) 573 thread.start_new_thread(self.launch, arg) 574 self.nb_used += 1 # update the number of running thread 575 else: 576 break 577 578 try: 579 self.need_waiting = True 580 self.lock.acquire() 581 no_in_queue = 0 582 secure_mode = False # forbid final acauire if in securemode 583 while self.waiting_submission or self.nb_used: 584 if self.fail_msg: 585 msg, self.fail_msg = self.fail_msg, None 586 self.remove() 587 raise Exception, msg 588 if update_status: 589 update_status(len(self.waiting_submission), self.nb_used, self.done) 590 # security#1 that all job expected to be launched since 591 # we enter in this function are indeed launched. 592 if len(self.waiting_submission) == 0 == remaining : 593 self.done = self.submitted 594 break 595 596 # security #2: nb_used >0 but nothing remains as BASH PID 597 if len(self.waiting_submission) == 0 and len(self.pids) == 0: 598 if self.submitted == self.done: 599 break 600 logger.debug('Found too many jobs. Recovering') 601 no_in_queue += 1 602 time.sleep(min(180, 5 * no_in_queue)) 603 if no_in_queue > 3: 604 logger.debug('Still too many jobs. Continue') 605 break 606 continue 607 608 # security #3: if nb_used not reliable pass in secure mode 609 if not secure_mode and len(self.waiting_submission) != 0: 610 if self.nb_used != self.nb_core: 611 if self.nb_used != len(self.pids): 612 secure_mode = True 613 # security #4: nb_used not reliable use secure mode to finish the run 614 if secure_mode and not self.waiting_submission: 615 self.need_waiting = False 616 if self.lock.locked(): 617 self.lock.release() 618 break 619 620 # Wait for core to finish 621 self.lock.acquire() 622 remaining -=1 # update remaining job 623 #submit next one 624 if self.waiting_submission: 625 arg = self.waiting_submission.pop(0) 626 thread.start_new_thread(self.launch, arg) 627 self.nb_used += 1 # update the number of running thread 628 629 if self.fail_msg: 630 msg, self.fail_msg = self.fail_msg, None 631 self.remove() 632 raise Exception, msg 633 # security #5: checked that self.nb_used is not lower than expected 634 #This is the most current problem. 635 no_in_queue = 0 636 while self.submitted > self.done: 637 if self.fail_msg: 638 msg, self.fail_msg = self.fail_msg, None 639 self.remove() 640 raise Exception, msg 641 if no_in_queue == 0: 642 logger.debug('Some jobs have been lost. Try to recover') 643 #something bad happens 644 if not len(self.pids): 645 # The job is not running 646 logger.critical('Some jobs have been lost in the multicore treatment.') 647 logger.critical('The results might be incomplete. 
(Trying to continue anyway)') 648 break 649 elif update_status: 650 update_status(len(self.waiting_submission), len(self.pids) , 651 self.done) 652 # waiting that those jobs ends. 653 if not secure_mode: 654 self.lock.acquire() 655 else: 656 no_in_queue += 1 657 try: 658 time.sleep(min(180,5*no_in_queue)) 659 if no_in_queue > 5 * 3600.0 / 162: 660 break 661 except KeyboardInterrupt: 662 logger.warning('CTRL-C assumes that all jobs are done. Continue the code') 663 self.pids = [] # avoid security 6 664 break 665 666 # security #6. check that queue is empty. don't 667 no_in_queue = 0 668 while len(self.pids): 669 if self.fail_msg: 670 msg, self.fail_msg = self.fail_msg, None 671 self.remove() 672 raise Exception, msg 673 self.need_waiting = False 674 if self.lock.locked(): 675 self.lock.release() 676 secure_mode = True 677 if no_in_queue == 0 : 678 logger.warning('Some jobs have been lost. Try to recover.') 679 logger.warning('Hitting ctrl-c will consider that all jobs are done and continue the code.') 680 try: 681 #something very bad happens 682 if update_status: 683 update_status(len(self.waiting_submission), len(self.pids) , 684 self.done) 685 time.sleep(min(5*no_in_queue, 180)) 686 no_in_queue += 1 687 if no_in_queue > 5 * 3600.0 / 162: 688 break 689 except KeyboardInterrupt: 690 break 691 692 # print a last time the status (forcing 0 for the running) 693 if update_status: 694 self.next_update = 0 695 update_status(len(self.waiting_submission), 0, self.done) 696 697 # reset variable for next submission 698 self.need_waiting = False 699 security = 0 700 while not self.lock.locked() and security < 10: 701 # check that the status is locked to avoid coincidence unlock 702 if secure_mode: 703 security = 10 704 security +=1 705 time.sleep(1) 706 if security < 10: 707 self.lock.release() 708 self.done = 0 709 self.nb_used = 0 710 self.submitted = 0 711 self.pids = [] 712 713 except KeyboardInterrupt: 714 self.remove() 715 raise 716 if self.fail_msg: 717 msg, self.fail_msg = self.fail_msg, None 718 self.remove() 719 raise Exception, msg
720 721
722 - def remove(self, error=None):
723 """Ensure that all thread are killed""" 724 logger.info('remove job currently running') 725 self.waiting_submission = [] 726 if error: 727 self.fail_msg = error 728 for pid in list(self.pids): 729 if isinstance(pid, tuple): 730 continue 731 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \ 732 % {'pid':pid} ) 733 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} ) 734 if out == 0: 735 try: 736 self.pids.remove(pid) 737 except: 738 pass 739 #out = os.system('kill -9 %s &> /dev/null' % pid) 740 741 time.sleep(1) # waiting if some were submitting at the time of ctrl-c 742 for pid in list(self.pids): 743 if isinstance(pid, tuple): 744 continue 745 out = os.system('CPIDS=$(pgrep -P %s); kill -15 $CPIDS > /dev/null 2>&1' % pid ) 746 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} ) 747 if out == 0: 748 try: 749 self.pids.remove(pid) 750 except: 751 pass
752
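A typical use of MultiCore, sketched below with placeholder executable and directory names, is to queue several independent jobs on the local machine and then block until all of them have finished:

# Hypothetical usage sketch; './madevent', the channel names and
# 'SubProcesses' are placeholders.
cluster = MultiCore(4)                      # run at most 4 jobs concurrently
for channel in ['G1', 'G2', 'G3', 'G4', 'G5']:
    cluster.submit('./madevent', argument=[channel], cwd='SubProcesses')

def show(idle, running, done):
    logger.info('%s idle, %s running, %s done' % (idle, running, done))

cluster.wait('SubProcesses', show)          # blocks until every job has ended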
753 -class CondorCluster(Cluster):
754 """Basic class for dealing with cluster submission""" 755 756 name = 'condor' 757 job_id = 'CONDOR_ID' 758 759 760 761 @multiple_try()
762 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 763 required_output=[], nb_submit=0):
764 """Submit a job prog to a Condor cluster""" 765 766 text = """Executable = %(prog)s 767 output = %(stdout)s 768 error = %(stderr)s 769 log = %(log)s 770 %(argument)s 771 environment = CONDOR_ID=$(Cluster).$(Process) 772 Universe = vanilla 773 notification = Error 774 Initialdir = %(cwd)s 775 %(requirement)s 776 getenv=True 777 queue 1 778 """ 779 780 if self.cluster_queue not in ['None', None]: 781 requirement = 'Requirements = %s=?=True' % self.cluster_queue 782 else: 783 requirement = '' 784 785 if cwd is None: 786 cwd = os.getcwd() 787 if stdout is None: 788 stdout = '/dev/null' 789 if stderr is None: 790 stderr = '/dev/null' 791 if log is None: 792 log = '/dev/null' 793 if not os.path.exists(prog): 794 prog = os.path.join(cwd, prog) 795 if argument: 796 argument = 'Arguments = %s' % ' '.join(argument) 797 else: 798 argument = '' 799 800 801 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 802 'stderr': stderr,'log': log,'argument': argument, 803 'requirement': requirement} 804 805 open('submit_condor','w').write(text % dico) 806 a = misc.Popen(['condor_submit','submit_condor'], stdout=subprocess.PIPE) 807 output = a.stdout.read() 808 #Submitting job(s). 809 #Logging submit event(s). 810 #1 job(s) submitted to cluster 2253622. 811 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 812 try: 813 id = pat.search(output).groups()[0] 814 except: 815 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 816 % output 817 self.submitted += 1 818 self.submitted_ids.append(id) 819 return id
820 821 @store_input() 822 @multiple_try()
823 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 824 log=None, input_files=[], output_files=[], required_output=[], 825 nb_submit=0):
826 """Submit the job on the cluster NO SHARE DISK 827 input/output file should be give relative to cwd 828 """ 829 830 if not required_output and output_files: 831 required_output = output_files 832 833 if (input_files == [] == output_files): 834 return self.submit(prog, argument, cwd, stdout, stderr, log, 835 required_output=required_output, nb_submit=nb_submit) 836 837 text = """Executable = %(prog)s 838 output = %(stdout)s 839 error = %(stderr)s 840 log = %(log)s 841 %(argument)s 842 should_transfer_files = YES 843 when_to_transfer_output = ON_EXIT 844 transfer_input_files = %(input_files)s 845 %(output_files)s 846 Universe = vanilla 847 notification = Error 848 Initialdir = %(cwd)s 849 %(requirement)s 850 getenv=True 851 queue 1 852 """ 853 854 if self.cluster_queue not in ['None', None]: 855 requirement = 'Requirements = %s=?=True' % self.cluster_queue 856 else: 857 requirement = '' 858 859 if cwd is None: 860 cwd = os.getcwd() 861 if stdout is None: 862 stdout = '/dev/null' 863 if stderr is None: 864 stderr = '/dev/null' 865 if log is None: 866 log = '/dev/null' 867 if not os.path.exists(prog): 868 prog = os.path.join(cwd, prog) 869 if argument: 870 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument]) 871 else: 872 argument = '' 873 # input/output file treatment 874 if input_files: 875 input_files = ','.join(input_files) 876 else: 877 input_files = '' 878 if output_files: 879 output_files = 'transfer_output_files = %s' % ','.join(output_files) 880 else: 881 output_files = '' 882 883 884 885 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 886 'stderr': stderr,'log': log,'argument': argument, 887 'requirement': requirement, 'input_files':input_files, 888 'output_files':output_files} 889 890 open('submit_condor','w').write(text % dico) 891 a = subprocess.Popen(['condor_submit','submit_condor'], stdout=subprocess.PIPE) 892 output = a.stdout.read() 893 #Submitting job(s). 894 #Logging submit event(s). 895 #1 job(s) submitted to cluster 2253622. 896 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 897 try: 898 id = pat.search(output).groups()[0] 899 except: 900 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 901 % output 902 self.submitted += 1 903 self.submitted_ids.append(id) 904 return id
905 906 907 908 909 910 @multiple_try(nb_try=10, sleep=10)
911 - def control_one_job(self, id):
912 """ control the status of a single job with it's cluster id """ 913 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 914 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 915 stderr=subprocess.PIPE) 916 917 error = status.stderr.read() 918 if status.returncode or error: 919 raise ClusterManagmentError, 'condor_q returns error: %s' % error 920 921 return status.stdout.readline().strip()
922 923 @check_interupt() 924 @multiple_try(nb_try=10, sleep=10)
925 - def control(self, me_dir):
926 """ control the status of a single job with it's cluster id """ 927 928 if not self.submitted_ids: 929 return 0, 0, 0, 0 930 931 packet = 15000 932 idle, run, fail = 0, 0, 0 933 ongoing = [] 934 for i in range(1+(len(self.submitted_ids)-1)//packet): 935 start = i * packet 936 stop = (i+1) * packet 937 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \ 938 " -format \'%-2s\ ' \'ClusterId\' " + \ 939 " -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 940 941 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 942 stderr=subprocess.PIPE) 943 error = status.stderr.read() 944 if status.returncode or error: 945 raise ClusterManagmentError, 'condor_q returns error: %s' % error 946 947 for line in status.stdout: 948 id, status = line.strip().split() 949 ongoing.append(int(id)) 950 if status in ['I','U']: 951 idle += 1 952 elif status == 'R': 953 run += 1 954 elif status != 'C': 955 fail += 1 956 957 for id in list(self.submitted_ids): 958 if int(id) not in ongoing: 959 status = self.check_termination(id) 960 if status == 'wait': 961 run += 1 962 elif status == 'resubmit': 963 idle += 1 964 965 return idle, run, self.submitted - (idle+run+fail), fail
966 967 @multiple_try()
968 - def remove(self, *args, **opts):
969 """Clean the jobson the cluster""" 970 971 if not self.submitted_ids: 972 return 973 cmd = "condor_rm %s" % ' '.join(self.submitted_ids) 974 975 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
976
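When the Condor pool has no shared disk, submit2 above is the entry point: the listed input files are shipped with the job and the listed output files are transferred back next to cwd. A hedged usage sketch, in which every file and directory name is a placeholder and which would actually invoke condor_submit, could be:

# Hypothetical no-shared-disk submission on Condor; all paths are placeholders.
cluster = CondorCluster(cluster_queue='madgraph', cluster_nb_retry=1)
job = cluster.submit2('./ajob1', argument=['0', '1'],
                      cwd='SubProcesses/P0_gg_ttx',
                      stdout='ajob1.log',
                      input_files=['madevent', 'input_app.txt'],
                      output_files=['G1'],
                      required_output=['G1/results.dat'])
idle, run, finish, fail = cluster.control('SubProcesses/P0_gg_ttx')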
977 -class PBSCluster(Cluster):
978 """Basic class for dealing with cluster submission""" 979 980 name = 'pbs' 981 job_id = 'PBS_JOBID' 982 idle_tag = ['Q'] 983 running_tag = ['T','E','R'] 984 complete_tag = ['C'] 985 986 maximum_submited_jobs = 2500 987 988 @multiple_try()
989 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 990 required_output=[], nb_submit=0):
991 """Submit a job prog to a PBS cluster""" 992 993 994 me_dir = os.path.realpath(os.path.join(cwd,prog)).rsplit('/SubProcesses',1)[0] 995 me_dir = misc.digest(me_dir)[-14:] 996 if not me_dir[0].isalpha(): 997 me_dir = 'a' + me_dir[1:] 998 999 if len(self.submitted_ids) > self.maximum_submited_jobs: 1000 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish)) 1001 me_dir = os.path.realpath(os.path.join(cwd,prog)).rsplit('/SubProcesses',1)[0] 1002 self.wait(me_dir, fct, self.maximum_submited_jobs) 1003 1004 1005 text = "" 1006 if cwd is None: 1007 cwd = os.getcwd() 1008 else: 1009 text = " cd %s;" % cwd 1010 if stdout is None: 1011 stdout = '/dev/null' 1012 if stderr is None: 1013 stderr = '/dev/null' 1014 elif stderr == -2: # -2 is subprocess.STDOUT 1015 stderr = stdout 1016 if log is None: 1017 log = '/dev/null' 1018 1019 if not os.path.isabs(prog): 1020 text += "./%s" % prog 1021 else: 1022 text+= prog 1023 1024 if argument: 1025 text += ' ' + ' '.join(argument) 1026 1027 command = ['qsub','-o', stdout, 1028 '-N', me_dir, 1029 '-e', stderr, 1030 '-V'] 1031 1032 if self.cluster_queue and self.cluster_queue != 'None': 1033 command.extend(['-q', self.cluster_queue]) 1034 1035 a = misc.Popen(command, stdout=subprocess.PIPE, 1036 stderr=subprocess.STDOUT, 1037 stdin=subprocess.PIPE, cwd=cwd) 1038 1039 output = a.communicate(text)[0] 1040 id = output.split('.')[0] 1041 if not id.isdigit() or a.returncode !=0: 1042 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1043 % output 1044 1045 self.submitted += 1 1046 self.submitted_ids.append(id) 1047 return id
1048 1049 @multiple_try()
1050 - def control_one_job(self, id):
1051 """ control the status of a single job with it's cluster id """ 1052 cmd = 'qstat '+str(id) 1053 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1054 stderr=subprocess.STDOUT) 1055 1056 for line in status.stdout: 1057 line = line.strip() 1058 if 'cannot connect to server' in line or 'cannot read reply' in line: 1059 raise ClusterManagmentError, 'server disconnected' 1060 if 'Unknown' in line: 1061 return 'F' 1062 elif line.startswith(str(id)): 1063 jobstatus = line.split()[4] 1064 else: 1065 jobstatus="" 1066 1067 if status.returncode != 0 and status.returncode is not None: 1068 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1069 if jobstatus in self.idle_tag: 1070 return 'I' 1071 elif jobstatus in self.running_tag: 1072 return 'R' 1073 return 'F'
1074 1075 1076 @multiple_try()
1077 - def control(self, me_dir):
1078 """ control the status of a single job with it's cluster id """ 1079 cmd = "qstat" 1080 status = misc.Popen([cmd], stdout=subprocess.PIPE) 1081 1082 if me_dir.endswith('/'): 1083 me_dir = me_dir[:-1] 1084 me_dir = misc.digest(me_dir)[-14:] 1085 if not me_dir[0].isalpha(): 1086 me_dir = 'a' + me_dir[1:] 1087 ongoing = [] 1088 1089 idle, run, fail = 0, 0, 0 1090 for line in status.stdout: 1091 if 'cannot connect to server' in line or 'cannot read reply' in line: 1092 raise ClusterManagmentError, 'server disconnected' 1093 if me_dir in line: 1094 ongoing.append(line.split()[0].split('.')[0]) 1095 status2 = line.split()[4] 1096 if status2 in self.idle_tag: 1097 idle += 1 1098 elif status2 in self.running_tag: 1099 run += 1 1100 elif status2 in self.complete_tag: 1101 if not self.check_termination(line.split()[0].split('.')[0]): 1102 idle += 1 1103 else: 1104 fail += 1 1105 1106 if status.returncode != 0 and status.returncode is not None: 1107 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1108 1109 for id in list(self.submitted_ids): 1110 if id not in ongoing: 1111 status2 = self.check_termination(id) 1112 if status2 == 'wait': 1113 run += 1 1114 elif status2 == 'resubmit': 1115 idle += 1 1116 1117 return idle, run, self.submitted - (idle+run+fail), fail
1118 1119 @multiple_try()
1120 - def remove(self, *args, **opts):
1121 """Clean the jobs on the cluster""" 1122 1123 if not self.submitted_ids: 1124 return 1125 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1126 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1127
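PBSCluster above (and the SGE, LSF and SLURM back-ends below) tags each job with a short name derived from the process directory: the path up to /SubProcesses is hashed with misc.digest, truncated (to 14, 10 or 8 characters depending on the back-end), and forced to start with a letter so the scheduler accepts it. Assuming misc.digest behaves like an md5 hex digest (an assumption made only for this sketch), the scheme can be reproduced as:

import hashlib

def batch_job_name(cwd, prog, length=14):
    """Sketch of the job-name derivation used by the batch back-ends;
    assumes misc.digest is equivalent to an md5 hex digest."""
    me_dir = os.path.realpath(os.path.join(cwd, prog)).rsplit('/SubProcesses', 1)[0]
    name = hashlib.md5(me_dir).hexdigest()[-length:]
    if not name[0].isalpha():
        name = 'a' + name[1:]
    return name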
1128 1129 -class SGECluster(Cluster):
1130 """Basic class for dealing with cluster submission""" 1131 # Class written by Arian Abrahantes. 1132 1133 name = 'sge' 1134 job_id = 'JOB_ID' 1135 idle_tag = ['qw', 'hqw','hRqw','w'] 1136 running_tag = ['r','t','Rr','Rt'] 1137
1138 - def def_get_path(self,location):
1139 """replace string for path issues""" 1140 location = os.path.realpath(location) 1141 homePath = os.getenv("HOME") 1142 if homePath: 1143 location = location.replace(homePath,'$HOME') 1144 return location
1145 1146 @multiple_try()
1147 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1148 required_output=[], nb_submit=0):
1149 """Submit a job prog to an SGE cluster""" 1150 1151 me_dir = os.path.realpath(os.path.join(cwd,prog)).rsplit('/SubProcesses',1)[0] 1152 me_dir = misc.digest(me_dir)[-10:] 1153 if not me_dir[0].isalpha(): 1154 me_dir = 'a' + me_dir[1:] 1155 1156 if cwd is None: 1157 #cwd = os.getcwd() 1158 cwd = self.def_get_path(os.getcwd()) 1159 cwd1 = self.def_get_path(cwd) 1160 text = " cd %s;" % cwd1 1161 if stdout is None: 1162 stdout = '/dev/null' 1163 else: 1164 stdout = self.def_get_path(stdout) 1165 if stderr is None: 1166 stderr = '/dev/null' 1167 elif stderr == -2: # -2 is subprocess.STDOUT 1168 stderr = stdout 1169 else: 1170 stderr = self.def_get_path(stderr) 1171 1172 if log is None: 1173 log = '/dev/null' 1174 else: 1175 log = self.def_get_path(log) 1176 1177 text += prog 1178 if argument: 1179 text += ' ' + ' '.join(argument) 1180 1181 #if anything slips through argument 1182 #print "!=== inteded change ",text.replace('/srv/nfs','') 1183 #text = text.replace('/srv/nfs','') 1184 homePath = os.getenv("HOME") 1185 if homePath: 1186 text = text.replace(homePath,'$HOME') 1187 1188 logger.debug("!=== input %s" % text) 1189 logger.debug("!=== output %s" % stdout) 1190 logger.debug("!=== error %s" % stderr) 1191 logger.debug("!=== logs %s" % log) 1192 1193 command = ['qsub','-o', stdout, 1194 '-N', me_dir, 1195 '-e', stderr, 1196 '-V'] 1197 1198 if self.cluster_queue and self.cluster_queue != 'None': 1199 command.extend(['-q', self.cluster_queue]) 1200 1201 a = misc.Popen(command, stdout=subprocess.PIPE, 1202 stderr=subprocess.STDOUT, 1203 stdin=subprocess.PIPE, cwd=cwd) 1204 1205 output = a.communicate(text)[0] 1206 id = output.split(' ')[2] 1207 if not id.isdigit(): 1208 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1209 % output 1210 self.submitted += 1 1211 self.submitted_ids.append(id) 1212 logger.debug(output) 1213 1214 return id
1215 1216 @multiple_try()
1217 - def control_one_job(self, id):
1218 """ control the status of a single job with it's cluster id """ 1219 #cmd = 'qstat '+str(id) 1220 cmd = 'qstat ' 1221 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1222 for line in status.stdout: 1223 #print "!==",line 1224 #line = line.strip() 1225 #if 'Unknown' in line: 1226 # return 'F' 1227 #elif line.startswith(str(id)): 1228 # status = line.split()[4] 1229 if str(id) in line: 1230 status = line.split()[4] 1231 #print "!=status", status 1232 if status in self.idle_tag: 1233 return 'I' 1234 elif status in self.running_tag: 1235 return 'R' 1236 return 'F'
1237 1238 @multiple_try()
1239 - def control(self, me_dir):
1240 """ control the status of a single job with it's cluster id """ 1241 cmd = "qstat " 1242 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1243 1244 if me_dir.endswith('/'): 1245 me_dir = me_dir[:-1] 1246 me_dir = misc.digest(me_dir)[-10:] 1247 if not me_dir[0].isalpha(): 1248 me_dir = 'a' + me_dir[1:] 1249 1250 idle, run, fail = 0, 0, 0 1251 for line in status.stdout: 1252 if me_dir in line: 1253 status = line.split()[4] 1254 if status in self.idle_tag: 1255 idle += 1 1256 elif status in self.running_tag: 1257 run += 1 1258 else: 1259 logger.debug(line) 1260 fail += 1 1261 1262 return idle, run, self.submitted - (idle+run+fail), fail
1263 1264 1265 1266 @multiple_try()
1267 - def remove(self, *args, **opts):
1268 """Clean the jobs on the cluster""" 1269 1270 if not self.submitted_ids: 1271 return 1272 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1273 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1274
1275 1276 -class LSFCluster(Cluster):
1277 """Basic class for dealing with cluster submission""" 1278 1279 name = 'lsf' 1280 job_id = 'LSB_JOBID' 1281 1282 @multiple_try()
1283 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1284 required_output=[], nb_submit=0):
1285 """Submit the job prog to an LSF cluster""" 1286 1287 me_dir = os.path.realpath(os.path.join(cwd,prog)).rsplit('/SubProcesses',1)[0] 1288 me_dir = misc.digest(me_dir)[-14:] 1289 if not me_dir[0].isalpha(): 1290 me_dir = 'a' + me_dir[1:] 1291 1292 text = "" 1293 command = ['bsub', '-J', me_dir] 1294 if cwd is None: 1295 cwd = os.getcwd() 1296 else: 1297 text = " cd %s;" % cwd 1298 if stdout and isinstance(stdout, str): 1299 command.extend(['-o', stdout]) 1300 if stderr and isinstance(stdout, str): 1301 command.extend(['-e', stderr]) 1302 elif stderr == -2: # -2 is subprocess.STDOUT 1303 pass 1304 if log is None: 1305 log = '/dev/null' 1306 1307 text += prog 1308 if argument: 1309 text += ' ' + ' '.join(argument) 1310 1311 if self.cluster_queue and self.cluster_queue != 'None': 1312 command.extend(['-q', self.cluster_queue]) 1313 1314 a = misc.Popen(command, stdout=subprocess.PIPE, 1315 stderr=subprocess.STDOUT, 1316 stdin=subprocess.PIPE, cwd=cwd) 1317 1318 output = a.communicate(text)[0] 1319 #Job <nnnn> is submitted to default queue <normal>. 1320 try: 1321 id = output.split('>',1)[0].split('<')[1] 1322 except: 1323 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1324 % output 1325 if not id.isdigit(): 1326 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1327 % output 1328 self.submitted += 1 1329 self.submitted_ids.append(id) 1330 return id
1331 1332 1333 @multiple_try()
1334 - def control_one_job(self, id):
1335 """ control the status of a single job with it's cluster id """ 1336 1337 cmd = 'bjobs '+str(id) 1338 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1339 1340 for line in status.stdout: 1341 line = line.strip().upper() 1342 if 'JOBID' in line: 1343 continue 1344 elif str(id) not in line: 1345 continue 1346 status = line.split()[2] 1347 if status == 'RUN': 1348 return 'R' 1349 elif status == 'PEND': 1350 return 'I' 1351 elif status == 'DONE': 1352 return 'F' 1353 else: 1354 return 'H' 1355 return 'F'
1356 1357 @multiple_try()
1358 - def control(self, me_dir):
1359 """ control the status of a single job with it's cluster id """ 1360 1361 if not self.submitted_ids: 1362 return 0, 0, 0, 0 1363 1364 cmd = "bjobs " + ' '.join(self.submitted_ids) 1365 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1366 1367 idle, run, fail = 0, 0, 0 1368 for line in status.stdout: 1369 line = line.strip() 1370 if 'JOBID' in line: 1371 continue 1372 splitline = line.split() 1373 id = splitline[0] 1374 if id not in self.submitted_ids: 1375 continue 1376 status = splitline[2] 1377 if status == 'RUN': 1378 run += 1 1379 elif status == 'PEND': 1380 idle += 1 1381 elif status == 'DONE': 1382 status = self.check_termination(id) 1383 if status == 'wait': 1384 run += 1 1385 elif status == 'resubmit': 1386 idle += 1 1387 else: 1388 fail += 1 1389 1390 return idle, run, self.submitted - (idle+run+fail), fail
1391 1392 @multiple_try()
1393 - def remove(self, *args,**opts):
1394 """Clean the jobs on the cluster""" 1395 1396 if not self.submitted_ids: 1397 return 1398 cmd = "bkill %s" % ' '.join(self.submitted_ids) 1399 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1400
1401 -class GECluster(Cluster):
1402 """Class for dealing with cluster submission on a GE cluster""" 1403 1404 name = 'ge' 1405 job_id = 'JOB_ID' 1406 idle_tag = ['qw'] 1407 running_tag = ['r'] 1408 1409 @multiple_try()
1410 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1411 required_output=[], nb_submit=0):
1412 """Submit a job prog to a GE cluster""" 1413 1414 text = "" 1415 if cwd is None: 1416 cwd = os.getcwd() 1417 else: 1418 text = " cd %s; bash " % cwd 1419 if stdout is None: 1420 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1]) 1421 if stderr is None: 1422 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1]) 1423 elif stderr == -2: # -2 is subprocess.STDOUT 1424 stderr = stdout 1425 if log is None: 1426 log = '/dev/null' 1427 1428 text += prog 1429 if argument: 1430 text += ' ' + ' '.join(argument) 1431 text += '\n' 1432 tmp_submit = os.path.join(cwd, 'tmp_submit') 1433 open(tmp_submit,'w').write(text) 1434 1435 a = misc.Popen(['qsub','-o', stdout, 1436 '-e', stderr, 1437 tmp_submit], 1438 stdout=subprocess.PIPE, 1439 stderr=subprocess.STDOUT, 1440 stdin=subprocess.PIPE, cwd=cwd) 1441 1442 output = a.communicate()[0] 1443 #Your job 874511 ("test.sh") has been submitted 1444 pat = re.compile("Your job (\d*) \(",re.MULTILINE) 1445 try: 1446 id = pat.search(output).groups()[0] 1447 except: 1448 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1449 % output 1450 self.submitted += 1 1451 self.submitted_ids.append(id) 1452 return id
1453 1454 @multiple_try()
1455 - def control_one_job(self, id):
1456 """ control the status of a single job with it's cluster id """ 1457 cmd = 'qstat | grep '+str(id) 1458 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1459 if not status: 1460 return 'F' 1461 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1462 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s") 1463 stat = '' 1464 for line in status.stdout.read().split('\n'): 1465 if not line: 1466 continue 1467 line = line.strip() 1468 try: 1469 groups = pat.search(line).groups() 1470 except: 1471 raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line 1472 if groups[0] != id: continue 1473 stat = groups[1] 1474 if not stat: 1475 return 'F' 1476 if stat in self.idle_tag: 1477 return 'I' 1478 if stat in self.running_tag: 1479 return 'R'
1480 1481 @multiple_try()
1482 - def control(self, me_dir=None):
1483 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 1484 if not self.submitted_ids: 1485 return 0, 0, 0, 0 1486 idle, run, fail = 0, 0, 0 1487 ongoing = [] 1488 for statusflag in ['p', 'r', 'sh']: 1489 cmd = 'qstat -s %s' % statusflag 1490 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1491 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1492 pat = re.compile("^(\d+)") 1493 for line in status.stdout.read().split('\n'): 1494 line = line.strip() 1495 try: 1496 id = pat.search(line).groups()[0] 1497 except Exception: 1498 pass 1499 else: 1500 if id not in self.submitted_ids: 1501 continue 1502 ongoing.append(id) 1503 if statusflag == 'p': 1504 idle += 1 1505 if statusflag == 'r': 1506 run += 1 1507 if statusflag == 'sh': 1508 fail += 1 1509 for id in list(self.submitted_ids): 1510 if id not in ongoing: 1511 self.check_termination(id) 1512 #self.submitted_ids = ongoing 1513 1514 return idle, run, self.submitted - idle - run - fail, fail
1515 1516 @multiple_try()
1517 - def remove(self, *args, **opts):
1518 """Clean the jobs on the cluster""" 1519 1520 if not self.submitted_ids: 1521 return 1522 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1523 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1524
1525 -def asyncrone_launch(exe, cwd=None, stdout=None, argument = [], **opt):
1526 """start a computation and not wait for it to finish. 1527 this fonction returns a lock which is locked as long as the job is 1528 running.""" 1529 1530 mc = MultiCore(1) 1531 mc.submit(exe, argument, cwd, stdout, **opt) 1532 mc.need_waiting = True 1533 mc.lock.acquire() 1534 return mc.lock
1535
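The lock returned by asyncrone_launch lets the caller start a job in the background and synchronise only when the result is actually needed, for example (executable and directory names are placeholders):

# Hypothetical usage: fire-and-forget launch, then poll the returned lock.
lock = asyncrone_launch('./combine_events', cwd='Events/run_01')
# ... do other work while the job runs ...
while lock.locked():
    time.sleep(5)       # the lock is released when the job finishes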
1536 1537 -class SLURMCluster(Cluster):
1538 """Basic class for dealing with cluster submission""" 1539 1540 name = 'slurm' 1541 job_id = 'SLURM_JOBID' 1542 idle_tag = ['Q','PD','S','CF'] 1543 running_tag = ['R', 'CG'] 1544 complete_tag = ['C'] 1545 1546 @multiple_try()
1547 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1548 required_output=[], nb_submit=0):
1549 """Submit a job prog to a SLURM cluster""" 1550 1551 me_dir = os.path.realpath(os.path.join(cwd,prog)).rsplit('/SubProcesses',1)[0] 1552 me_dir = misc.digest(me_dir)[-8:] 1553 1554 if not me_dir[0].isalpha(): 1555 me_dir = 'a' + me_dir[1:] 1556 1557 if cwd is None: 1558 cwd = os.getcwd() 1559 if stdout is None: 1560 stdout = '/dev/null' 1561 if stderr is None: 1562 stderr = '/dev/null' 1563 elif stderr == -2: # -2 is subprocess.STDOUT 1564 stderr = stdout 1565 if log is None: 1566 log = '/dev/null' 1567 1568 command = ['sbatch', '-o', stdout, 1569 '-J', me_dir, 1570 '-e', stderr, prog] + argument 1571 1572 if self.cluster_queue and self.cluster_queue != 'None': 1573 command.insert(1, '-p') 1574 command.insert(2, self.cluster_queue) 1575 1576 a = misc.Popen(command, stdout=subprocess.PIPE, 1577 stderr=subprocess.STDOUT, 1578 stdin=subprocess.PIPE, cwd=cwd) 1579 1580 output = a.communicate() 1581 output_arr = output[0].split(' ') 1582 id = output_arr[3].rstrip() 1583 1584 if not id.isdigit(): 1585 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1586 1587 self.submitted += 1 1588 self.submitted_ids.append(id) 1589 return id
1590 1591 @multiple_try()
1592 - def control_one_job(self, id):
1593 """ control the status of a single job with it's cluster id """ 1594 cmd = 'squeue j'+str(id) 1595 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1596 stderr=open(os.devnull,'w')) 1597 1598 for line in status.stdout: 1599 line = line.strip() 1600 if 'Invalid' in line: 1601 return 'F' 1602 elif line.startswith(str(id)): 1603 status = line.split()[4] 1604 if status in self.idle_tag: 1605 return 'I' 1606 elif status in self.running_tag: 1607 return 'R' 1608 return 'F'
1609 1610 @multiple_try()
1611 - def control(self, me_dir):
1612 """ control the status of a single job with it's cluster id """ 1613 cmd = "squeue" 1614 status = misc.Popen([cmd], stdout=subprocess.PIPE) 1615 1616 if me_dir.endswith('/'): 1617 me_dir = me_dir[:-1] 1618 me_dir = misc.digest(me_dir)[-8:] 1619 if not me_dir[0].isalpha(): 1620 me_dir = 'a' + me_dir[1:] 1621 1622 idle, run, fail = 0, 0, 0 1623 ongoing=[] 1624 for line in status.stdout: 1625 if me_dir in line: 1626 id, _, _,_ , status,_ = line.split(None,5) 1627 ongoing.append(id) 1628 if status in self.idle_tag: 1629 idle += 1 1630 elif status in self.running_tag: 1631 run += 1 1632 elif status in self.complete_tag: 1633 status = self.check_termination(id) 1634 if status == 'wait': 1635 run += 1 1636 elif status == 'resubmit': 1637 idle += 1 1638 else: 1639 fail += 1 1640 1641 #control other finished job 1642 for id in list(self.submitted_ids): 1643 if id not in ongoing: 1644 status = self.check_termination(id) 1645 if status == 'wait': 1646 run += 1 1647 elif status == 'resubmit': 1648 idle += 1 1649 1650 1651 return idle, run, self.submitted - (idle+run+fail), fail
1652 1653 @multiple_try()
1654 - def remove(self, *args, **opts):
1655 """Clean the jobs on the cluster""" 1656 1657 if not self.submitted_ids: 1658 return 1659 cmd = "scancel %s" % ' '.join(self.submitted_ids) 1660 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1661
1662 -class HTCaaSCluster(Cluster):
1663 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """ 1664 1665 name= 'htcaas' 1666 job_id = 'HTCAAS_JOBID' 1667 1668 @store_input() 1669 @multiple_try()
1670 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1671 log=None, input_files=[], output_files=[], required_output=[], 1672 nb_submit=0):
1673 """Submit the HTCaaS job on the cluster with NO SHARE DISK 1674 input/output file should be give relative to cwd 1675 """ 1676 # To make workspace name(temp) 1677 if 'ajob' in prog: 1678 prog_num = prog.rsplit("ajob",1)[1] 1679 else: 1680 prog_num = '0' 1681 1682 cur_usr = os.getenv('USER') 1683 1684 if cwd is None: 1685 cwd = os.getcwd() 1686 1687 cwd_cp = cwd.rsplit("/",2) 1688 #print 'This is HTCaaS Mode' 1689 1690 if not stdout is None: 1691 print "stdout: %s" % stdout 1692 1693 if not os.path.exists(prog): 1694 prog = os.path.join(cwd, prog) 1695 1696 if not required_output and output_files: 1697 required_output = output_files 1698 1699 1700 if not 'combine' and not 'pythia' in prog : 1701 cwd_arg = cwd+"/arguments" 1702 temp = ' '.join([str(a) for a in argument]) 1703 arg_cmd="echo '"+temp+"' > " + cwd_arg 1704 #print arg_cmd 1705 #aa = misc.Popen([arg_cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 1706 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)] 1707 if argument : 1708 command.extend(['-a ', '='.join([str(a) for a in argument])]) 1709 print command 1710 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1711 id = a.stdout.read().strip() 1712 1713 else: 1714 cwd_arg = cwd+"/arguments" 1715 temp = ' '.join([str(a) for a in argument]) 1716 #arg_cmd="echo '"+temp+"' > " + cwd_arg 1717 #print arg_cmd 1718 #aa = misc.Popen([arg_cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 1719 #print os.path.basename(prog) 1720 temp_file_name = "sub." + os.path.basename(prog) 1721 text = """#!/bin/bash 1722 MYPWD=%(cwd)s 1723 cd $MYPWD 1724 input_files=(%(input_files)s ) 1725 for i in ${input_files[@]} 1726 do 1727 chmod -f +x $i 1728 done 1729 /bin/bash %(prog)s %(arguments)s > %(stdout)s 1730 """ 1731 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog, 1732 'arguments': ' '.join([str(a) for a in argument]), 1733 'program': ' ' if '.py' in prog else 'bash'} 1734 1735 # writing a new script for the submission 1736 new_prog = pjoin(cwd, temp_file_name) 1737 open(new_prog, 'w').write(text % dico) 1738 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 1739 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name] 1740 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1741 id = a.stdout.read().strip() 1742 1743 nb_try=0 1744 nb_limit=5 1745 if not id.isdigit() : 1746 print "[ID is not digit]:" + id 1747 1748 while not id.isdigit() : 1749 nb_try+=1 1750 print "[fail_retry]:"+ nb_try 1751 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1752 id = a.stdout.read().strip() 1753 if nb_try > nb_limit : 1754 raise ClusterManagementError, 'fail to submit to the HTCaaS cluster: \n %s' % id 1755 break 1756 1757 self.submitted += 1 1758 self.submitted_ids.append(id) 1759 1760 return id
1761 1762 @multiple_try(nb_try=10, sleep=10)
1763 - def control_one_job(self, id):
1764 """ control the status of a single job with it's cluster id """ 1765 1766 if id == 0 : 1767 status_out ='C' 1768 else : 1769 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status " 1770 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE, 1771 stderr=subprocess.PIPE) 1772 error = status.stderr.read() 1773 if status.returncode or error: 1774 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error 1775 status_out= status.stdout.read().strip() 1776 status_out= status_out.split(":",1)[1] 1777 if status_out == 'waiting': 1778 status_out='I' 1779 elif status_out == 'preparing' or status_out == 'running': 1780 status_out = 'R' 1781 elif status_out != 'done': 1782 status_out = 'F' 1783 elif status_out == 'done': 1784 status_out = 'C' 1785 1786 return status_out
1787 1788 @multiple_try(nb_try=15, sleep=1)
1789 - def control(self, me_dir):
1790 """ control the status of a single job with it's cluster id """ 1791 #print "HTCaaS2 Control" 1792 if not self.submitted_ids: 1793 return 0, 0, 0, 0 1794 1795 ongoing = [] 1796 idle, run, fail = 0, 0, 0 1797 1798 if id == 0 : 1799 return 0 , 0, 0, 0 1800 else : 1801 for i in range(len(self.submitted_ids)): 1802 ongoing.append(int(self.submitted_ids[i])) 1803 cmd = "htcaas-job-status -m " + self.submitted_ids[i] + " -s | grep Status " 1804 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 1805 status_out= status.stdout.read().strip() 1806 status_out= status_out.split(":",1)[1] 1807 if status_out == 'waiting': 1808 idle += 1 1809 elif status_out == 'preparing': 1810 run += 1 1811 elif status_out == 'running': 1812 run += 1 1813 elif status_out != 'done': 1814 fail += 1 1815 1816 if status_out != 'done': 1817 print "["+ self.submitted_ids[i] + "] " + status_out 1818 ''' 1819 for i in range(len(self.submitted_ids)): 1820 if int(self.submitted_ids[i]) not in ongoing: 1821 status = self.check_termination(int(self.submitted_ids[i])) 1822 if status = 'waiting': 1823 idle += 1 1824 elif status == 'resubmit': 1825 idle += 1 1826 elif status == 'failed': 1827 fail += 1 1828 ''' 1829 1830 return idle, run, self.submitted - (idle+run+fail), fail
1831 1832 @multiple_try()
1833 - def remove(self, *args, **opts):
1834 """Clean the jobson the cluster""" 1835 1836 if not self.submitted_ids: 1837 return 1838 for i in range(len(self.submitted_ids)): 1839 cmd = "htcaas-job-cancel -m %s" % ' '.join(self.submitted_ids[i]) 1840 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1841
1842  
1843 -class HTCaaS2Cluster(Cluster):
1844 """Class for dealing with cluster submission on a HTCaaS cluster""" 1845 1846 name= 'htcaas2' 1847 job_id = 'HTCAAS2_JOBID' 1848 1849 @store_input() 1850 @multiple_try()
1851 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
1852              log=None, input_files=[], output_files=[], required_output=[],
1853              nb_submit=0):
1854 """Submit the job on the cluster NO SHARE DISK 1855 input/output file should be give relative to cwd 1856 """ 1857 # To make workspace name(temp) 1858 if 'ajob' in prog: 1859 prog_num = prog.rsplit("ajob",1)[1] 1860 elif 'run_combine' in prog: 1861 prog_num = '0' 1862 else: 1863 prog_num = prog 1864 1865 cur_usr = os.getenv('USER') 1866 1867 import uuid 1868 dir = str(uuid.uuid4().hex) 1869 #dir = str(int(time())) 1870 prog_dir = '_run%s'% prog_num 1871 prog_dir = dir+prog_dir 1872 1873 if cwd is None: 1874 cwd = os.getcwd() 1875 1876 cwd_cp = cwd.rsplit("/",2) 1877 1878 if stdout is None: 1879 stdout='/dev/null' 1880 1881 if not os.path.exists(prog): 1882 prog = os.path.join(cwd, prog) 1883 1884 if not required_output and output_files: 1885 required_output = output_files 1886 1887 if '/' in argument : 1888 temp_file_name = "sub." + os.path.basename(prog) 1889 else : 1890 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument) 1891 1892 1893 if 'combine' in prog or 'pythia' in prog : 1894 text = """#!/bin/bash 1895 MYPWD=%(cwd)s 1896 cd $MYPWD 1897 script=%(script)s 1898 input_files=(%(input_files)s ) 1899 if [ $# -ge 1 ]; then 1900 arg1=$1 1901 else 1902 arg1='' 1903 fi 1904 args=' %(arguments)s' 1905 for i in ${input_files[@]}; do 1906 if [[ "$i" == *$script* ]]; then 1907 script=$i 1908 fi 1909 chmod -f +x $i 1910 done 1911 /bin/bash ${script} ${args} > %(stdout)s 1912 """ 1913 1914 elif 'shower' in prog : 1915 text = """#!/bin/bash 1916 MYPWD=%(cwd)s 1917 cd $MYPWD 1918 args=' %(arguments)s' 1919 input_files=( %(input_files)s ) 1920 for i in ${input_files[@]} 1921 do 1922 chmod -f +x $i 1923 done 1924 /bin/bash %(script)s ${args} > $MYPWD/done 1925 """ 1926 1927 else : 1928 text = """#!/bin/bash 1929 MYPWD=%(cwd)s 1930 #mkdir -p $MYTMP 1931 cd $MYPWD 1932 input_files=( %(input_files)s ) 1933 for i in ${input_files[@]} 1934 do 1935 if [[ $i != */*/* ]]; then 1936 i=$PWD"/"$i 1937 fi 1938 echo $i 1939 if [ -d $i ]; then 1940 htcaas-file-put -l $i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -i %(cur_usr)s 1941 else 1942 htcaas-file-put -f $i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -i %(cur_usr)s 1943 fi 1944 done 1945 """ 1946 1947 dico = {'cur_usr' : cur_usr, 'script': os.path.basename(prog), 1948 'cwd': cwd, 'job_id': self.job_id, 'prog_dir': prog_dir, 1949 'input_files': ' '.join(input_files + [prog]), 1950 'output_files': ' '.join(output_files), 'stdout': stdout, 1951 'arguments': ' '.join([str(a) for a in argument]), 1952 'program': ' ' if '.py' in prog else 'bash'} 1953 1954 # writing a new script for the submission 1955 new_prog = pjoin(cwd, temp_file_name) 1956 open(new_prog, 'w').write(text % dico) 1957 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 1958 1959 # print temp_file_name 1960 cmd1='/bin/bash '+ cwd+'/'+temp_file_name 1961 status1 = misc.Popen([cmd1], shell=True, stdout=subprocess.PIPE, 1962 stderr=subprocess.PIPE) 1963 #print '%s' % status1.stdout.read() 1964 1965 1966 if not 'combine' in prog and not 'shower' in prog and not 'pythia' in prog: 1967 1968 cmd3 = """htcaas-mgjob-submit -d /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -e %(script)s %(arguments)s""" 1969 dico3 = {'cur_usr' : cur_usr, 'script': os.path.basename(prog), 1970 'arguments': ' ' if not argument else "-a " + '='.join([str(a) for a in argument]) , 1971 'prog_dir': prog_dir } 1972 status3 = misc.Popen([cmd3 % dico3], shell=True, stdout=subprocess.PIPE, 1973 stderr=subprocess.PIPE) 1974 id = status3.stdout.read().strip() 1975 ## exception 1976 nb_try=0 1977 
1978              while not id.isdigit() :
1979                  nb_try += 1
1980                  a = misc.Popen( [cmd3 % dico3], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1981                  id = a.stdout.read().strip()
1982                  if nb_try > nb_limit :
1983                      raise ClusterManagmentError, 'Failed to submit to the HTCaaS cluster: \n %s' % id
1984                      break
1985  
1986              temp_file_name2 = "sub." + id
1987              text2 = """#!/bin/bash
1988  MYPWD=%(cwd)s
1989  output_files=( %(output_files)s )
1990  result=done
1991  if [ ! -e ${MYPWD}/done.%(job_id)s ]; then
1992      for i in ${output_files[@]}
1993      do
1994          htcaas-file-get -l ${MYPWD}/$i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/$i -i %(cur_usr)s
1995          chmod -Rf 777 ${MYPWD}/$i
1996      done
1997      for i in ${output_files[@]}; do
1998          if [[ -e ${MYPWD}/$i ]]; then
1999              result=done
2000          else
2001              result=running
2002              echo $result
2003              exit 0
2004          fi
2005      done
2006      echo $result
2007      touch ${MYPWD}/done.%(job_id)s
2008  else
2009      for i in ${output_files[@]}; do
2010          if [ -e ${MYPWD}/$i ]; then
2011              result=done
2012          else
2013              rm -f ${MYPWD}/done.%(job_id)s
2014              result=running
2015              echo $result
2016              exit 0
2017          fi
2018      done
2019      echo $result
2020  
2021  fi
2022  
2023  """
2024              dico2 = {'cur_usr' : cur_usr, 'script': os.path.basename(prog),
2025                      'cwd': cwd, 'prog_dir': prog_dir,
2026                      'output_files': ' '.join(output_files), 'job_id': id,
2027                      'program': ' ' if '.py' in prog else 'bash'}
2028  
2029              homePath = os.getenv("HOME")
2030              outPath = homePath + "/MG5"
2031  
2032              new_prog2 = pjoin(outPath, temp_file_name2)
2033              open(new_prog2, 'w').write(text2 % dico2)
2034              misc.Popen(['chmod','+x',new_prog2],cwd=cwd)
2035  
2036  
2037              self.submitted += 1
2038              self.submitted_ids.append(id)
2039  
2040          elif 'combine' in prog or 'shower' in prog or 'pythia' in prog:
2041              if '/dev/null' in stdout :
2042                  stdout = ''
2043  
2044              temp_file_shower = "sub.out"
2045              text_shower = """#!/bin/bash
2046  MYPWD=%(cwd)s
2047  result=done
2048  output_files=(%(output_files)s)
2049  for i in ${output_files[@]}; do
2050      if [ -e $MYPWD/$i -o -e $i ]; then
2051          result=done
2052      else
2053          result=running
2054          echo $result
2055          exit 0
2056      fi
2057  done
2058  echo $result
2059  """
2060              dico_shower = { 'cwd': cwd, 'output_files': ' '.join([stdout]+output_files),
2061                      'program': ' ' if '.py' in prog else 'bash'}
2062              homePath = os.getenv("HOME")
2063              outPath = homePath + "/MG5"
2064              new_prog_shower = pjoin(outPath, temp_file_shower)
2065              open(new_prog_shower, 'w').write(text_shower % dico_shower)
2066              misc.Popen(['chmod','+x',new_prog_shower],cwd=cwd)
2067  
2068              id = '-1'
2069              self.submitted += 1
2070              self.submitted_ids.append(id)
2071  
2072          else :
2073              id = '-2'
2074              self.submitted += 1
2075              self.submitted_ids.append(id)
2076  
2077          return id
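    # Sketch of the no-shared-disk flow implemented above (each command below
    # appears in one of the generated scripts):
    #   1. sub.<script> (written in cwd) stages the inputs, e.g.
    #          htcaas-file-put -f <file> -r /pwork01/<user>/MG5_workspace/<uuid>_run<N>/ -i <user>
    #      while combine/shower/pythia jobs simply run their script locally;
    #   2. htcaas-mgjob-submit -d /pwork01/<user>/MG5_workspace/<uuid>_run<N>/ -e <script>
    #      submits the staged job and prints the id stored in submitted_ids;
    #   3. ~/MG5/sub.<id> later fetches the outputs with htcaas-file-get and
    #      prints 'done' or 'running', which control() polls afterwards.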
2078  
2079      @multiple_try(nb_try=10, sleep=10)
2080 - def control_one_job(self, id):
2081 """ control the status of a single job with it's cluster id """ 2082 2083 homePath = os.getenv("HOME") 2084 outPath = homePath +"/MG5" 2085 2086 2087 if id == '0' or id=='-2' : 2088 status_out ='done' 2089 elif id == '-1' : 2090 cmd='/bin/bash ' +outPath+'/sub.out' 2091 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 2092 status_out=status.stdout.read().strip() 2093 print "["+id+"]" + status_out 2094 if status_out == 'waiting': 2095 status_out='wait' 2096 elif status_out == 'preparing' or status_out == 'running': 2097 status_out = 'R' 2098 elif status_out != 'done': 2099 status_out = 'F' 2100 elif status_out == 'done': 2101 status_out = 'C' 2102 2103 print "["+id+"]" + status_out 2104 else : 2105 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status " 2106 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 2107 stderr=subprocess.PIPE) 2108 error = status.stderr.read() 2109 if status.returncode or error: 2110 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error 2111 status_out= status.stdout.read().strip() 2112 status_out= status_out.split(":",1)[1] 2113 print "["+id+"]" + status_out 2114 if status_out == 'waiting': 2115 status_out='wait' 2116 elif status_out == 'preparing' or status_out == 'running': 2117 status_out = 'R' 2118 elif status_out == 'failed' : 2119 args = self.retry_args[id] 2120 id_temp = self.submit2(**args) 2121 del self.retry_args[id] 2122 self.submitted_ids.remove(id) 2123 status_out = 'I' 2124 elif status_out != 'done': 2125 status_out = 'F' 2126 elif status_out == 'done': 2127 status_out = 'C' 2128 2129 return status_out
2130  
2131  
2132      @check_interupt()
2133      @multiple_try(nb_try=15, sleep=10)
2134 - def control(self, me_dir):
2135 """ control the status of a single job with it's cluster id """ 2136 2137 if not self.submitted_ids: 2138 return 0, 0, 0, 0 2139 2140 ongoing = [] 2141 idle, run, fail = 0, 0, 0 2142 2143 homePath = os.getenv("HOME") 2144 outPath = homePath +"/MG5" 2145 2146 for i in range(len(self.submitted_ids)): 2147 ongoing.append(self.submitted_ids[i]) 2148 if self.submitted_ids[i] == '-2' : 2149 return 0,0,0,0 2150 if self.submitted_ids[i] == '0' : 2151 # ongoing.append('0') 2152 status_out='done' 2153 elif self.submitted_ids[i] == '-1' : 2154 cmd='/bin/bash ' +outPath+'/sub.out' 2155 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 2156 status_out=status.stdout.read().strip() 2157 if status_out == 'waiting': 2158 idle += 1 2159 elif status_out == 'preparing': 2160 run += 1 2161 elif status_out == 'running': 2162 run += 1 2163 elif status_out != 'done': 2164 fail += 1 2165 else : 2166 args = self.retry_args[str(self.submitted_ids[i])] 2167 if 'required_output'in args and not args['required_output']: 2168 args['required_output'] = args['output_files'] 2169 self.retry_args[str(self.submitted_ids[i])] = args 2170 2171 cmd = "htcaas-job-status -m " + self.submitted_ids[i] + " -s | grep Status " 2172 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 2173 status_out= status.stdout.read().strip() 2174 status_out= status_out.split(":",1)[1] 2175 if status_out == 'waiting': 2176 idle += 1 2177 elif status_out == 'preparing': 2178 run += 1 2179 elif status_out == 'running': 2180 run += 1 2181 elif status_out == 'failed' or status_out == 'canceled': 2182 id = self.submit2(**args) 2183 #self.submitted_ids[i]=id 2184 del self.retry_args[self.submitted_ids[i]] 2185 self.submitted_ids.remove(self.submitted_ids[i]) 2186 self.submitted-=1 2187 idle += 1 2188 elif status_out != 'done': 2189 fail += 1 2190 if status_out == 'done': 2191 cmd2='/bin/bash '+ outPath+'/sub.'+self.submitted_ids[i] 2192 status2 = misc.Popen([cmd2], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 2193 aa= status2.stdout.read().strip() 2194 #result= self.check_termination(str(self.submitted_ids[i])) 2195 #print result 2196 #if not result : 2197 #if not self.check_termination(str(self.submitted_ids[i])): 2198 # print "not_self" + self.submitted_ids[i] 2199 # idle += 1 2200 #else : 2201 for path in args['required_output']: 2202 if args['cwd']: 2203 path = pjoin(args['cwd'], path) 2204 # check that file exists and is not empty. 2205 temp1=os.path.exists(path) 2206 temp2=os.stat(path).st_size 2207 if not (os.path.exists(path) and os.stat(path).st_size != 0) : 2208 status2 = misc.Popen([cmd2], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 2209 aa= status2.stdout.read().strip() 2210 if aa == 'done': 2211 self.submitted_ids[i] = '0' 2212 elif aa == 'running': 2213 run += 1 2214 else : 2215 self.submitted_ids[i]='0' 2216 2217 2218 for i in range(len(self.submitted_ids)): 2219 if str(self.submitted_ids[i]) not in ongoing: 2220 status2= self.check_termination(str(self.submitted_ids[i])) 2221 if status2 == 'wait': 2222 run += 1 2223 elif status2 == 'resubmit': 2224 idle += 1 2225 2226 return idle, run, self.submitted - (idle+run+fail), fail
2227  
2228      @multiple_try()
2229 - def remove(self, *args, **opts):
2230 """Clean the jobson the cluster""" 2231 2232 if not self.submitted_ids: 2233 return 2234 for i in range(len(self.submitted_ids)): 2235 cmd = "htcaas-job-cancel -m %s" % ' '.join(self.submitted_ids[i]) 2236 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
2237  
2238  
2239  from_name = {'condor':CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
2240               'lsf': LSFCluster, 'ge':GECluster, 'slurm': SLURMCluster,
2241               'htcaas':HTCaaSCluster, 'htcaas2':HTCaaS2Cluster}
2242  
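# Usage sketch (kept as a comment, illustrative only; the script and file names
# are assumptions, the option keys are those read by the Cluster constructor):
#
#     opts = {'cluster_nb_retry': 1, 'cluster_retry_wait': 300}
#     cluster = from_name['htcaas2'](**opts)
#     job_id = cluster.submit2('./ajob1', argument=[1],
#                              input_files=['ajob1'],
#                              output_files=['results.dat'])
#     idle, running, finished, failed = cluster.control(me_dir=None)
#     cluster.remove()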