
Source Code for Module madgraph.various.cluster

   1  ################################################################################ 
   2  # Copyright (c) 2009 The MadGraph5_aMC@NLO Development team and Contributors              
   3  # 
   4  # This file is a part of the MadGraph5_aMC@NLO project, an application which            
   5  # automatically generates Feynman diagrams and matrix elements for arbitrary     
   6  # high-energy processes in the Standard Model and beyond.                        
   7  # 
   8  # It is subject to the MadGraph5_aMC@NLO license which should accompany this              
   9  # distribution.                                                                  
  10  #                                                                                
  11  # For more information, visit madgraph.phys.ucl.ac.be and amcatnlo.web.cern.ch             
  12  #                                                                                
  13  ################################################################################ 
  14  import subprocess 
  15  import logging 
  16  import os 
  17  import time 
  18  import re 
  19  import glob 
  20  import inspect 
  21  import sys 
  22   
  23  logger = logging.getLogger('madgraph.cluster')  
  24   
  25  try: 
  26      from madgraph import MadGraph5Error 
  27      import madgraph.various.misc as misc 
  28  except Exception, error: 
  29      if __debug__: 
  30          print  str(error) 
  31      from internal import MadGraph5Error 
  32      import internal.misc as misc 
  33   
  34  pjoin = os.path.join 
  35   
  36  class ClusterManagmentError(MadGraph5Error):
  37      pass
  38   
  39  class NotImplemented(MadGraph5Error):
  40      pass
  41   
  42   
  43  multiple_try = misc.multiple_try
  44  pjoin = os.path.join
  45   
  46   
  47  def check_interupt(error=KeyboardInterrupt):
  48   
  49      def deco_interupt(f):
  50          def deco_f_interupt(self, *args, **opt):
  51              try:
  52                  return f(self, *args, **opt)
  53              except error:
  54                  try:
  55                      self.remove(*args, **opt)
  56                  except Exception:
  57                      pass
  58                  raise error
  59          return deco_f_interupt
  60      return deco_interupt
  61   
  62  def store_input(arg=''):
  63   
  64      def deco_store(f):
  65          def deco_f_store(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
  66                           input_files=[], output_files=[], required_output=[], nb_submit=0):
  67              frame = inspect.currentframe()
  68              args, _, _, values = inspect.getargvalues(frame)
  69              args = dict([(i, values[i]) for i in args if i != 'self'])
  70              id = f(self, **args)
  71              if self.nb_retry > 0:
  72                  self.retry_args[id] = args
  73              return id
  74          return deco_f_store
  75      return deco_store
  76   
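# Illustration only (not part of the module): store_input is meant to decorate
# submit-like methods (see Cluster.submit2 below). It records the exact call
# arguments in self.retry_args so that check_termination() can resubmit a
# failed job with the very same options, e.g.
#
#     @store_input()
#     def submit2(self, prog, argument=[], ...):
#         ...
#     # after the call, self.retry_args[job_id] holds the arguments that were used.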
  77  def need_transfer(options):
  78      """Check whether the input files need to be transferred (compressed and
  79      copied to the worker node) for the given running options."""
  80   
  81      if options['run_mode'] != 1 and options['cluster_temp_path'] is None:
  82          return False
  83      else:
  84          return True
85
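# Illustration only: need_transfer() just inspects the run-option dictionary,
# so it can be exercised with a hand-built dict; the values below are
# hypothetical.
#
#     >>> need_transfer({'run_mode': 2, 'cluster_temp_path': None})
#     False        # multicore run on a shared disk: nothing to transfer
#     >>> need_transfer({'run_mode': 1, 'cluster_temp_path': '/scratch'})
#     True         # cluster run with a local scratch path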
  86  class Cluster(object):
  87      """Basic class for all cluster-type submission"""
  88      name = 'mother class'
  89      identifier_length = 14
  90   
  91      def __init__(self, *args, **opts):
  92          """Init the cluster"""
  93   
  94          self.submitted = 0
  95          self.submitted_ids = []
  96          self.finish = 0
  97          self.submitted_dirs = []  #HTCaaS
  98          self.submitted_exes = []  #HTCaaS
  99          self.submitted_args = []  #HTCaaS
 100   
 101          if 'cluster_queue' in opts:
 102              self.cluster_queue = opts['cluster_queue']
 103          else:
 104              self.cluster_queue = 'madgraph'
 105          if 'cluster_temp_path' in opts:
 106              self.temp_dir = opts['cluster_temp_path']
 107          else:
 108              self.temp_dir = None
 109          self.options = {'cluster_status_update': (600, 30)}
 110          for key, value in opts.items():
 111              self.options[key] = value
 112          self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0
 113          self.cluster_retry_wait = float(opts['cluster_retry_wait']) if 'cluster_retry_wait' in opts else 300
 114          self.options = dict(opts)
 115          self.retry_args = {}
 116          # controlling jobs in controlled-type submission
 117          self.packet = {}
 118          self.id_to_packet = {}
119
 120      def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
 121                 log=None, required_output=[], nb_submit=0):
 122          """How to make one submission. Return status id on the cluster."""
 123          raise NotImplemented, 'No implementation of how to submit a job to cluster \'%s\'' % self.name
124 125 126 @store_input()
127 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 128 log=None, input_files=[], output_files=[], required_output=[], 129 nb_submit=0):
130 """How to make one submission. Return status id on the cluster. 131 NO SHARE DISK""" 132 133 if cwd is None: 134 cwd = os.getcwd() 135 if not os.path.exists(prog): 136 prog = os.path.join(cwd, prog) 137 138 if not required_output and output_files: 139 required_output = output_files 140 141 if not hasattr(self, 'temp_dir') or not self.temp_dir or \ 142 (input_files == [] == output_files): 143 return self.submit(prog, argument, cwd, stdout, stderr, log, 144 required_output=required_output, nb_submit=nb_submit) 145 146 if not input_files and not output_files: 147 # not input/output so not using submit2 148 return self.submit(prog, argument, cwd, stdout, stderr, log, 149 required_output=required_output, nb_submit=nb_submit) 150 151 if cwd is None: 152 cwd = os.getcwd() 153 if not os.path.exists(prog): 154 prog = os.path.join(cwd, prog) 155 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument) 156 157 text = """#!/bin/bash 158 MYTMP=%(tmpdir)s/run$%(job_id)s 159 MYPWD=%(cwd)s 160 mkdir -p $MYTMP 161 cd $MYPWD 162 input_files=( %(input_files)s ) 163 for i in ${input_files[@]} 164 do 165 cp -R -L $i $MYTMP 166 done 167 cd $MYTMP 168 echo '%(arguments)s' > arguments 169 chmod +x ./%(script)s 170 %(program)s ./%(script)s %(arguments)s 171 exit=$? 172 output_files=( %(output_files)s ) 173 for i in ${output_files[@]} 174 do 175 cp -r $MYTMP/$i $MYPWD 176 done 177 # if [ "$exit" -eq "0" ] 178 # then 179 rm -rf $MYTMP 180 # fi 181 """ 182 183 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog), 184 'cwd': cwd, 'job_id': self.job_id, 185 'input_files': ' '.join(input_files + [prog]), 186 'output_files': ' '.join(output_files), 187 'arguments': ' '.join([str(a) for a in argument]), 188 'program': ' ' if '.py' in prog else 'bash'} 189 190 # writing a new script for the submission 191 new_prog = pjoin(cwd, temp_file_name) 192 open(new_prog, 'w').write(text % dico) 193 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 194 195 return self.submit(new_prog, argument, cwd, stdout, stderr, log, 196 required_output=required_output, nb_submit=nb_submit)
197 198
199 - def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 200 log=None, input_files=[], output_files=[], required_output=[], 201 nb_submit=0, packet_member=None):
202 """This function wrap the cluster submition with cluster independant 203 method should not be overwritten (but for DAG type submission)""" 204 205 id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files, 206 output_files, required_output, nb_submit) 207 208 209 if not packet_member: 210 return id 211 else: 212 if isinstance(packet_member, Packet): 213 self.id_to_packet[id] = packet_member 214 packet_member.put(id) 215 if packet_member.tag not in self.packet: 216 self.packet[packet_member.tag] = packet_member 217 else: 218 if packet_member in self.packet: 219 packet = self.packet[packet_member] 220 packet.put(id) 221 self.id_to_packet[id] = packet 222 return id
223
 224      def control(self, me_dir=None):
 225          """Check the status of the jobs associated to directory me_dir. Return (idle, run, finish, fail)."""
 226          if not self.submitted_ids:
 227              raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
 228          idle, run, fail = 0, 0, 0
 229          for pid in self.submitted_ids[:]:
 230              status = self.control_one_job(pid)
 231              if status == 'I':
 232                  idle += 1
 233              elif status == 'R':
 234                  run += 1
 235              elif status == 'F':
 236                  self.finish += 1
 237                  self.submitted_ids.remove(pid)
 238              else:
 239                  fail += 1
 240   
 241          return idle, run, self.finish, fail
242
 243      def control_one_job(self, pid):
 244          """Control the status of a single job with its cluster id."""
 245          raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
246
247 - def get_jobs_identifier(self, path, second_path=None):
248 """get a unique run_name for all the jobs helps to identify the runs 249 in the controller for some cluster.""" 250 251 if second_path: 252 path = os.path.realpath(pjoin(path, second_path)) 253 elif not os.path.exists(path): 254 return path # job already done 255 256 if 'SubProcesses' in path: 257 target = path.rsplit('/SubProcesses',1)[0] 258 elif 'MCatNLO' in path: 259 target = path.rsplit('/MCatNLO',1)[0] 260 elif second_path: 261 target=path 262 logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problem.") 263 else: 264 target = path 265 266 if target.endswith('/'): 267 target = target[:-1] 268 269 target = misc.digest(target)[-self.identifier_length:] 270 if not target[0].isalpha(): 271 target = 'a' + target[1:] 272 273 return target
274 275 276 @check_interupt()
277 - def wait(self, me_dir, fct, minimal_job=0, update_first=None):
278 """Wait that all job are finish. 279 if minimal_job set, then return if idle + run is lower than that number""" 280 281 282 mode = 1 # 0 is long waiting/ 1 is short waiting 283 nb_iter = 0 284 nb_short = 0 285 change_at = 5 # number of iteration from which we wait longer between update. 286 287 if update_first: 288 idle, run, finish, fail = self.control(me_dir) 289 update_first(idle, run, finish) 290 291 #usefull shortcut for readibility 292 longtime, shorttime = self.options['cluster_status_update'] 293 294 nb_job = 0 295 296 if self.options['cluster_type'] == 'htcaas2': 297 me_dir = self.metasubmit(self) 298 299 while 1: 300 old_mode = mode 301 nb_iter += 1 302 idle, run, finish, fail = self.control(me_dir) 303 if nb_job: 304 if idle + run + finish + fail != nb_job: 305 nb_job = idle + run + finish + fail 306 nb_iter = 1 # since some packet finish prevent to pass in long waiting mode 307 else: 308 nb_job = idle + run + finish + fail 309 if fail: 310 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team') 311 if idle + run == 0: 312 #time.sleep(20) #security to ensure that the file are really written on the disk 313 logger.info('All jobs finished') 314 fct(idle, run, finish) 315 break 316 if idle + run < minimal_job: 317 return 318 fct(idle, run, finish) 319 #Determine how much we have to wait (mode=0->long time, mode=1->short time) 320 if nb_iter < change_at: 321 mode = 1 322 elif idle < run: 323 if old_mode == 0: 324 if nb_short: 325 mode = 0 #we already be back from short to long so stay in long 326 #check if we need to go back to short mode 327 elif idle: 328 if nb_iter > change_at + int(longtime)//shorttime: 329 mode = 0 #stay in long waiting mode 330 else: 331 mode = 1 # pass in short waiting mode 332 nb_short =0 333 else: 334 mode = 1 # pass in short waiting mode 335 nb_short = 0 336 elif old_mode == 1: 337 nb_short +=1 338 if nb_short > 3* max(change_at, int(longtime)//shorttime): 339 mode = 0 #go back in slow waiting 340 else: 341 mode = 0 342 343 #if pass from fast(mode=1) to slow(mode=0) make a print statement: 344 if old_mode > mode: 345 logger.info('''Start to wait %ss between checking status. 346 Note that you can change this time in the configuration file. 347 Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0]) 348 349 #now Waiting! 350 if mode == 0: 351 try: 352 time.sleep(self.options['cluster_status_update'][0]) 353 except KeyboardInterrupt: 354 logger.info('start to update the status') 355 nb_iter = min(0, change_at -2) 356 nb_short = 0 357 else: 358 time.sleep(self.options['cluster_status_update'][1]) 359 360 361 self.submitted = 0 362 self.submitted_ids = []
363
364 - def check_termination(self, job_id):
365 """Check the termination of the jobs with job_id and relaunch it if needed.""" 366 367 368 if job_id not in self.retry_args: 369 if job_id in self.id_to_packet: 370 nb_in_packet = self.id_to_packet[job_id].remove_one() 371 if nb_in_packet == 0: 372 # packet done run the associate function 373 packet = self.id_to_packet[job_id] 374 # fully ensure that the packet is finished (thread safe) 375 packet.queue.join() 376 #running the function 377 packet.fct(*packet.args) 378 del self.id_to_packet[job_id] 379 return 'resubmit' 380 else: 381 return True 382 383 args = self.retry_args[job_id] 384 if 'time_check' in args: 385 time_check = args['time_check'] 386 else: 387 time_check = 0 388 389 for path in args['required_output']: 390 if args['cwd']: 391 path = pjoin(args['cwd'], path) 392 # check that file exists and is not empty. 393 if not (os.path.exists(path) and os.stat(path).st_size != 0) : 394 break 395 else: 396 # all requested output are present 397 if time_check > 0: 398 logger.info('Job %s Finally found the missing output.' % (job_id)) 399 del self.retry_args[job_id] 400 self.submitted_ids.remove(job_id) 401 # check if the job_id is in a packet 402 if job_id in self.id_to_packet: 403 nb_in_packet = self.id_to_packet[job_id].remove_one() 404 if nb_in_packet == 0: 405 # packet done run the associate function 406 packet = self.id_to_packet[job_id] 407 # fully ensure that the packet is finished (thread safe) 408 packet.queue.join() 409 #running the function 410 packet.fct(*packet.args) 411 del self.id_to_packet[job_id] 412 return 'resubmit' 413 414 return 'done' 415 416 if time_check == 0: 417 logger.debug('''Job %s: missing output:%s''' % (job_id,path)) 418 args['time_check'] = time.time() 419 return 'wait' 420 elif self.cluster_retry_wait > time.time() - time_check: 421 return 'wait' 422 423 #jobs failed to be completed even after waiting time!! 424 if self.nb_retry < 0: 425 logger.critical('''Fail to run correctly job %s. 426 with option: %s 427 file missing: %s''' % (job_id, args, path)) 428 raw_input('press enter to continue.') 429 elif self.nb_retry == 0: 430 logger.critical('''Fail to run correctly job %s. 431 with option: %s 432 file missing: %s. 433 Stopping all runs.''' % (job_id, args, path)) 434 self.remove() 435 elif args['nb_submit'] >= self.nb_retry: 436 logger.critical('''Fail to run correctly job %s. 437 with option: %s 438 file missing: %s 439 Fails %s times 440 No resubmition. ''' % (job_id, args, path, args['nb_submit'])) 441 self.remove() 442 else: 443 args['nb_submit'] += 1 444 logger.warning('resubmit job (for the %s times)' % args['nb_submit']) 445 del self.retry_args[job_id] 446 self.submitted_ids.remove(job_id) 447 if 'time_check' in args: 448 del args['time_check'] 449 if job_id in self.id_to_packet: 450 self.id_to_packet[job_id].remove_one() 451 args['packet_member'] = self.id_to_packet[job_id] 452 del self.id_to_packet[job_id] 453 self.cluster_submit(**args) 454 else: 455 self.submit2(**args) 456 return 'resubmit' 457 return 'done'
458 459 @check_interupt()
460 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 461 stderr=None, log=None, required_output=[], nb_submit=0, 462 input_files=[], output_files=[]):
463 """launch one job on the cluster and wait for it""" 464 465 special_output = False # tag for concatenate the error with the output. 466 if stderr == -2 and stdout: 467 #We are suppose to send the output to stdout 468 special_output = True 469 stderr = stdout + '.err' 470 471 id = self.submit2(prog, argument, cwd, stdout, stderr, log, 472 required_output=required_output, input_files=input_files, 473 output_files=output_files) 474 475 if self.options['cluster_type']=='htcaas2': 476 if self.submitted == self.submitted_ids[-1]: 477 id = self.metasubmit(self) 478 479 frame = inspect.currentframe() 480 args, _, _, values = inspect.getargvalues(frame) 481 args = dict([(i, values[i]) for i in args if i != 'self']) 482 self.retry_args[id] = args 483 484 nb_wait=0 485 while 1: 486 nb_wait+=1 487 status = self.control_one_job(id) 488 if not status in ['R','I']: 489 status = self.check_termination(id) 490 if status in ['wait']: 491 time.sleep(30) 492 continue 493 elif status in ['resubmit']: 494 id = self.submitted_ids[0] 495 time.sleep(30) 496 continue 497 #really stop! 498 time.sleep(30) #security to ensure that the file are really written on the disk 499 break 500 time.sleep(self.options['cluster_status_update'][1]) 501 502 if required_output: 503 status = self.check_termination(id) 504 if status == 'wait': 505 run += 1 506 elif status == 'resubmit': 507 idle += 1 508 509 510 if special_output: 511 # combine the stdout and the stderr 512 #wait up to 50 s to see if those files exists 513 for i in range(5): 514 if os.path.exists(stdout): 515 if not os.path.exists(stderr): 516 time.sleep(5) 517 if os.path.exists(stderr): 518 err_text = open(stderr).read() 519 if not err_text: 520 return 521 logger.warning(err_text) 522 text = open(stdout).read() 523 open(stdout,'w').write(text + err_text) 524 else: 525 return 526 time.sleep(10)
527
 528      def remove(self, *args, **opts):
 529          """ """
 530          logger.warning("""This cluster does not support job removal;
 531      the jobs are still running on the cluster.""")
 532   
 533      @store_input()
 534      def metasubmit(self, me_dir):
 535          logger.warning("""This cluster does not support metajob submission.""")
 536          return 0
537
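# Illustration only (not part of the module): a new batch-system backend plugs
# into this framework by subclassing Cluster and overriding submit(),
# control_one_job() and remove(), exactly as the concrete classes below do
# (CondorCluster, PBSCluster, SLURMCluster, ...). The commands 'mysub',
# 'mystat', 'mydel' and the class name are hypothetical.
#
#     class MyBatchCluster(Cluster):
#         name = 'mybatch'
#         job_id = 'MYBATCH_JOBID'
#
#         @multiple_try()
#         def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
#                    log=None, required_output=[], nb_submit=0):
#             cmd = ['mysub', prog] + [str(a) for a in argument]
#             output = misc.Popen(cmd, stdout=subprocess.PIPE, cwd=cwd).communicate()[0]
#             id = output.strip()
#             self.submitted += 1
#             self.submitted_ids.append(id)
#             return id
#
#         @multiple_try()
#         def control_one_job(self, id):
#             output = misc.Popen(['mystat', str(id)], stdout=subprocess.PIPE).communicate()[0]
#             return 'I' if 'queued' in output else 'R' if 'running' in output else 'F'
#
#         @multiple_try()
#         def remove(self, *args, **opts):
#             if self.submitted_ids:
#                 misc.Popen(['mydel'] + self.submitted_ids).wait()
#                 self.submitted_ids = []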
 538  class Packet(object):
 539      """ An object for handling a packet of jobs; it is designed to be thread safe.
 540      """
 541   
 542      def __init__(self, name, fct, args, opts={}):
 543          import Queue
 544          import threading
 545          self.queue = Queue.Queue()
 546          self.tag = name
 547          self.fct = fct
 548          self.args = args
 549          self.opts = opts
 550          self.done = threading.Event()
 551   
 552      def put(self, *args, **opts):
 553          self.queue.put(*args, **opts)
 554   
 555      append = put
 556   
 557      def remove_one(self):
 558          self.queue.get(True)
 559          self.queue.task_done()
 560          return self.queue.qsize()
561
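# Illustration only: how a Packet ties a group of jobs to a callback, via the
# packet_member argument of Cluster.cluster_submit above. The callback
# combine_results and the job/directory names are hypothetical.
#
#     packet = Packet('P1', combine_results, ('P1',))
#     for job in ['ajob1', 'ajob2']:
#         mycluster.cluster_submit(job, cwd='SubProcesses/P1', packet_member=packet)
#     # once check_termination() sees the last job of the packet finish,
#     # combine_results('P1') is called automatically.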
 562  class MultiCore(Cluster):
 563      """Class for dealing with job submission on a multi-core machine."""
 564   
 565      job_id = "$"
 566   
567 - def __init__(self, *args, **opt):
568 """Init the cluster """ 569 570 571 super(MultiCore, self).__init__(self, *args, **opt) 572 573 import Queue 574 import threading 575 import thread 576 self.queue = Queue.Queue() # list of job to do 577 self.done = Queue.Queue() # list of job finisned 578 self.submitted = Queue.Queue() # one entry by job submitted 579 self.stoprequest = threading.Event() #flag to ensure everything to close 580 self.demons = [] 581 self.nb_done =0 582 if 'nb_core' in opt: 583 self.nb_core = opt['nb_core'] 584 elif isinstance(args[0],int): 585 self.nb_core = args[0] 586 else: 587 self.nb_core = 1 588 self.update_fct = None 589 590 self.lock = threading.Event() # allow nice lock of the main thread 591 self.pids = Queue.Queue() # allow to clean jobs submit via subprocess 592 self.done_pid = [] # list of job finisned 593 self.done_pid_queue = Queue.Queue() 594 self.fail_msg = None 595 596 # starting the worker node 597 for _ in range(self.nb_core): 598 self.start_demon()
599 600
601 - def start_demon(self):
602 import threading 603 t = threading.Thread(target=self.worker) 604 t.daemon = True 605 t.start() 606 self.demons.append(t)
607 608
609 - def worker(self):
610 import Queue 611 import thread 612 while not self.stoprequest.isSet(): 613 try: 614 args = self.queue.get() 615 tag, exe, arg, opt = args 616 try: 617 # check for executable case 618 if isinstance(exe,str): 619 if os.path.exists(exe) and not exe.startswith('/'): 620 exe = './' + exe 621 if opt['stderr'] == None: 622 opt['stderr'] = subprocess.STDOUT 623 proc = misc.Popen([exe] + arg, **opt) 624 pid = proc.pid 625 self.pids.put(pid) 626 proc.wait() 627 if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet(): 628 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \ 629 (' '.join([exe]+arg), proc.returncode) 630 logger.warning(fail_msg) 631 self.stoprequest.set() 632 self.remove(fail_msg) 633 # handle the case when this is a python function. Note that 634 # this use Thread so they are NO built-in parralelization this is 635 # going to work on a single core! (but this is fine for IO intensive 636 # function. for CPU intensive fct this will slow down the computation 637 else: 638 pid = tag 639 self.pids.put(pid) 640 # the function should return 0 if everything is fine 641 # the error message otherwise 642 returncode = exe(*arg, **opt) 643 if returncode != 0: 644 logger.warning("fct %s does not return 0. Stopping the code in a clean way. The error was:\n%s", exe, returncode) 645 self.stoprequest.set() 646 self.remove("fct %s does not return 0:\n %s" % (exe, returncode)) 647 except Exception,error: 648 self.fail_msg = sys.exc_info() 649 logger.warning(str(error)) 650 self.stoprequest.set() 651 self.remove(error) 652 653 if __debug__: 654 raise self.fail_msg[0], self.fail_msg[1],self.fail_msg[2] 655 656 self.queue.task_done() 657 self.done.put(tag) 658 self.done_pid_queue.put(pid) 659 #release the mother to print the status on the screen 660 try: 661 self.lock.set() 662 except thread.error: 663 continue 664 except Queue.Empty: 665 continue
666 667 668 669
670 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 671 log=None, required_output=[], nb_submit=0):
672 """submit a job on multicore machine""" 673 674 tag = (prog, tuple(argument), cwd, nb_submit) 675 if isinstance(prog, str): 676 677 678 opt = {'cwd': cwd, 679 'stdout':stdout, 680 'stderr': stderr} 681 self.queue.put((tag, prog, argument, opt)) 682 self.submitted.put(1) 683 return tag 684 else: 685 # python function 686 self.queue.put((tag, prog, argument, {})) 687 self.submitted.put(1) 688 return tag
689
 690      def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
 691                          stderr=None, log=None, **opts):
 692          """launch one job and wait for it"""
 693          if isinstance(stdout, str):
 694              stdout = open(stdout, 'w')
 695          if isinstance(stderr, str):
 696              stderr = open(stderr, 'w')
 697          return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)
698
699 - def remove(self, error=None):
700 """Ensure that all thread are killed""" 701 702 # ensure the worker to stop 703 self.stoprequest.set() 704 if error and not self.fail_msg: 705 self.fail_msg = error 706 707 # cleaning the queue done_pid_queue and move them to done_pid 708 while not self.done_pid_queue.empty(): 709 pid = self.done_pid_queue.get() 710 self.done_pid.append(pid) 711 # self.done_pid_queue.task_done() 712 713 while not self.pids.empty(): 714 pid = self.pids.get() 715 self.pids.task_done() 716 if isinstance(pid, tuple): 717 continue 718 if pid in self.done_pid: 719 continue 720 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \ 721 % {'pid':pid} ) 722 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} )
723 724
725 - def wait(self, me_dir, update_status, update_first=None):
726 """Waiting that all the jobs are done. This function also control that 727 the submission by packet are handle correctly (i.e. submit the function)""" 728 729 import Queue 730 import threading 731 732 try: # to catch KeyBoardInterupt to see which kind of error to display 733 last_status = (0, 0, 0) 734 sleep_time = 1 735 use_lock = True 736 first = True 737 while True: 738 force_one_more_loop = False # some security 739 740 # Loop over the job tagged as done to check if some packet of jobs 741 # are finished in case, put the associate function in the queue 742 while self.done.qsize(): 743 try: 744 tag = self.done.get(True, 1) 745 except Queue.Empty: 746 pass 747 else: 748 if self.id_to_packet and tuple(tag) in self.id_to_packet: 749 packet = self.id_to_packet[tuple(tag)] 750 remaining = packet.remove_one() 751 if remaining == 0: 752 # fully ensure that the packet is finished (thread safe) 753 packet.queue.join() 754 self.submit(packet.fct, packet.args) 755 force_one_more_loop = True 756 self.nb_done += 1 757 self.done.task_done() 758 759 # Get from the various queue the Idle/Done/Running information 760 # Those variable should be thread safe but approximate. 761 Idle = self.queue.qsize() 762 Done = self.nb_done + self.done.qsize() 763 Running = max(0, self.submitted.qsize() - Idle - Done) 764 765 if Idle + Running <= 0 and not force_one_more_loop: 766 update_status(Idle, Running, Done) 767 # Going the quit since everything is done 768 # Fully Ensure that everything is indeed done. 769 self.queue.join() 770 break 771 772 if (Idle, Running, Done) != last_status: 773 if first and update_first: 774 update_first(Idle, Running, Done) 775 first = False 776 else: 777 update_status(Idle, Running, Done) 778 last_status = (Idle, Running, Done) 779 780 # cleaning the queue done_pid_queue and move them to done_pid 781 while not self.done_pid_queue.empty(): 782 pid = self.done_pid_queue.get() 783 self.done_pid.append(pid) 784 self.done_pid_queue.task_done() 785 786 787 # Define how to wait for the next iteration 788 if use_lock: 789 # simply wait that a worker release the lock 790 use_lock = self.lock.wait(300) 791 self.lock.clear() 792 if not use_lock and Idle > 0: 793 use_lock = True 794 else: 795 # to be sure that we will never fully lock at the end pass to 796 # a simple time.sleep() 797 time.sleep(sleep_time) 798 sleep_time = min(sleep_time + 2, 180) 799 if update_first: 800 update_first(Idle, Running, Done) 801 802 if self.stoprequest.isSet(): 803 if isinstance(self.fail_msg, Exception): 804 raise self.fail_msg 805 elif isinstance(self.fail_msg, str): 806 raise Exception, self.fail_msg 807 else: 808 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2] 809 # reset variable for next submission 810 try: 811 self.lock.clear() 812 except Exception: 813 pass 814 self.done = Queue.Queue() 815 self.done_pid = [] 816 self.done_pid_queue = Queue.Queue() 817 self.nb_done = 0 818 self.submitted = Queue.Queue() 819 self.pids = Queue.Queue() 820 self.stoprequest.clear() 821 822 except KeyboardInterrupt: 823 # if one of the node fails -> return that error 824 if isinstance(self.fail_msg, Exception): 825 raise self.fail_msg 826 elif isinstance(self.fail_msg, str): 827 raise Exception, self.fail_msg 828 elif self.fail_msg: 829 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2] 830 # else return orignal error 831 raise
832
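# Illustration only: typical use of MultiCore, submitting a few shell commands
# and waiting for all of them to complete. The script name and the status
# callback are hypothetical.
#
#     cluster = MultiCore(nb_core=4)
#     for i in range(8):
#         cluster.submit('run_job.sh', argument=[str(i)], cwd='.')
#     cluster.wait('.', lambda idle, run, done:
#                           logger.info('%s idle, %s running, %s done' % (idle, run, done)))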
833 -class CondorCluster(Cluster):
834 """Basic class for dealing with cluster submission""" 835 836 name = 'condor' 837 job_id = 'CONDOR_ID' 838 839 840 841 @multiple_try()
842 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 843 required_output=[], nb_submit=0):
844 """Submit a job prog to a Condor cluster""" 845 846 text = """Executable = %(prog)s 847 output = %(stdout)s 848 error = %(stderr)s 849 log = %(log)s 850 %(argument)s 851 environment = CONDOR_ID=$(Cluster).$(Process) 852 Universe = vanilla 853 notification = Error 854 Initialdir = %(cwd)s 855 %(requirement)s 856 getenv=True 857 queue 1 858 """ 859 860 if self.cluster_queue not in ['None', None]: 861 requirement = 'Requirements = %s=?=True' % self.cluster_queue 862 else: 863 requirement = '' 864 865 if cwd is None: 866 cwd = os.getcwd() 867 if stdout is None: 868 stdout = '/dev/null' 869 if stderr is None: 870 stderr = '/dev/null' 871 if log is None: 872 log = '/dev/null' 873 if not os.path.exists(prog): 874 prog = os.path.join(cwd, prog) 875 if argument: 876 argument = 'Arguments = %s' % ' '.join(argument) 877 else: 878 argument = '' 879 880 881 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 882 'stderr': stderr,'log': log,'argument': argument, 883 'requirement': requirement} 884 885 #open('submit_condor','w').write(text % dico) 886 a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE, 887 stdin=subprocess.PIPE) 888 output, _ = a.communicate(text % dico) 889 #output = a.stdout.read() 890 #Submitting job(s). 891 #Logging submit event(s). 892 #1 job(s) submitted to cluster 2253622. 893 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 894 try: 895 id = pat.search(output).groups()[0] 896 except: 897 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 898 % output 899 self.submitted += 1 900 self.submitted_ids.append(id) 901 return id
902 903 @store_input() 904 @multiple_try()
905 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 906 log=None, input_files=[], output_files=[], required_output=[], 907 nb_submit=0):
908 """Submit the job on the cluster NO SHARE DISK 909 input/output file should be give relative to cwd 910 """ 911 912 if not required_output and output_files: 913 required_output = output_files 914 915 if (input_files == [] == output_files): 916 return self.submit(prog, argument, cwd, stdout, stderr, log, 917 required_output=required_output, nb_submit=nb_submit) 918 919 text = """Executable = %(prog)s 920 output = %(stdout)s 921 error = %(stderr)s 922 log = %(log)s 923 %(argument)s 924 should_transfer_files = YES 925 when_to_transfer_output = ON_EXIT 926 transfer_input_files = %(input_files)s 927 %(output_files)s 928 Universe = vanilla 929 notification = Error 930 Initialdir = %(cwd)s 931 %(requirement)s 932 getenv=True 933 queue 1 934 """ 935 936 if self.cluster_queue not in ['None', None]: 937 requirement = 'Requirements = %s=?=True' % self.cluster_queue 938 else: 939 requirement = '' 940 941 if cwd is None: 942 cwd = os.getcwd() 943 if stdout is None: 944 stdout = '/dev/null' 945 if stderr is None: 946 stderr = '/dev/null' 947 if log is None: 948 log = '/dev/null' 949 if not os.path.exists(prog): 950 prog = os.path.join(cwd, prog) 951 if argument: 952 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument]) 953 else: 954 argument = '' 955 # input/output file treatment 956 if input_files: 957 input_files = ','.join(input_files) 958 else: 959 input_files = '' 960 if output_files: 961 output_files = 'transfer_output_files = %s' % ','.join(output_files) 962 else: 963 output_files = '' 964 965 966 967 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 968 'stderr': stderr,'log': log,'argument': argument, 969 'requirement': requirement, 'input_files':input_files, 970 'output_files':output_files} 971 972 #open('submit_condor','w').write(text % dico) 973 a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE, 974 stdin=subprocess.PIPE) 975 output, _ = a.communicate(text % dico) 976 #output = a.stdout.read() 977 #Submitting job(s). 978 #Logging submit event(s). 979 #1 job(s) submitted to cluster 2253622. 980 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 981 try: 982 id = pat.search(output).groups()[0] 983 except: 984 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 985 % output 986 self.submitted += 1 987 self.submitted_ids.append(id) 988 return id
989 990 991 992 993 994 @multiple_try(nb_try=10, sleep=10)
995 - def control_one_job(self, id):
996 """ control the status of a single job with it's cluster id """ 997 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 998 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 999 stderr=subprocess.PIPE) 1000 1001 error = status.stderr.read() 1002 if status.returncode or error: 1003 raise ClusterManagmentError, 'condor_q returns error: %s' % error 1004 1005 return status.stdout.readline().strip()
1006 1007 @check_interupt() 1008 @multiple_try(nb_try=10, sleep=10)
1009 - def control(self, me_dir):
1010 """ control the status of a single job with it's cluster id """ 1011 1012 if not self.submitted_ids: 1013 return 0, 0, 0, 0 1014 1015 packet = 15000 1016 idle, run, fail = 0, 0, 0 1017 ongoing = [] 1018 for i in range(1+(len(self.submitted_ids)-1)//packet): 1019 start = i * packet 1020 stop = (i+1) * packet 1021 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \ 1022 " -format \'%-2s\ ' \'ClusterId\' " + \ 1023 " -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 1024 1025 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1026 stderr=subprocess.PIPE) 1027 error = status.stderr.read() 1028 if status.returncode or error: 1029 raise ClusterManagmentError, 'condor_q returns error: %s' % error 1030 1031 for line in status.stdout: 1032 id, status = line.strip().split() 1033 ongoing.append(int(id)) 1034 if status in ['I','U']: 1035 idle += 1 1036 elif status == 'R': 1037 run += 1 1038 elif status != 'C': 1039 fail += 1 1040 1041 for id in list(self.submitted_ids): 1042 if int(id) not in ongoing: 1043 status = self.check_termination(id) 1044 if status == 'wait': 1045 run += 1 1046 elif status == 'resubmit': 1047 idle += 1 1048 1049 return idle, run, self.submitted - (idle+run+fail), fail
1050 1051 @multiple_try()
1052      def remove(self, *args, **opts):
1053          """Clean the jobs on the cluster"""
1054   
1055          if not self.submitted_ids:
1056              return
1057          cmd = "condor_rm %s" % ' '.join(self.submitted_ids)
1058   
1059          status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1060          self.submitted_ids = []
1061
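# Illustration only: driving the Condor backend directly. The option values and
# paths are hypothetical; inside MadGraph they come from the run configuration.
#
#     cl = CondorCluster(cluster_type='condor', cluster_queue=None,
#                        cluster_nb_retry=1, cluster_retry_wait=300,
#                        cluster_status_update=(600, 30))
#     job_id = cl.submit('./ajob1', cwd='SubProcesses/P1',
#                        stdout='ajob1.out', required_output=['results.dat'])
#     cl.wait('SubProcesses/P1',
#             lambda idle, run, done: logger.info('%s %s %s' % (idle, run, done)))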
1062 -class PBSCluster(Cluster):
1063 """Basic class for dealing with cluster submission""" 1064 1065 name = 'pbs' 1066 job_id = 'PBS_JOBID' 1067 idle_tag = ['Q'] 1068 running_tag = ['T','E','R'] 1069 complete_tag = ['C'] 1070 1071 maximum_submited_jobs = 2500 1072 1073 @multiple_try()
1074 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1075 required_output=[], nb_submit=0):
1076 """Submit a job prog to a PBS cluster""" 1077 1078 me_dir = self.get_jobs_identifier(cwd, prog) 1079 1080 if len(self.submitted_ids) > self.maximum_submited_jobs: 1081 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish)) 1082 self.wait(me_dir, fct, self.maximum_submited_jobs) 1083 1084 1085 text = "" 1086 if cwd is None: 1087 cwd = os.getcwd() 1088 else: 1089 text = " cd %s;" % cwd 1090 if stdout is None: 1091 stdout = '/dev/null' 1092 if stderr is None: 1093 stderr = '/dev/null' 1094 elif stderr == -2: # -2 is subprocess.STDOUT 1095 stderr = stdout 1096 if log is None: 1097 log = '/dev/null' 1098 1099 if not os.path.isabs(prog): 1100 text += "./%s" % prog 1101 else: 1102 text+= prog 1103 1104 if argument: 1105 text += ' ' + ' '.join(argument) 1106 1107 command = ['qsub','-o', stdout, 1108 '-N', me_dir, 1109 '-e', stderr, 1110 '-V'] 1111 1112 if self.cluster_queue and self.cluster_queue != 'None': 1113 command.extend(['-q', self.cluster_queue]) 1114 1115 a = misc.Popen(command, stdout=subprocess.PIPE, 1116 stderr=subprocess.STDOUT, 1117 stdin=subprocess.PIPE, cwd=cwd) 1118 1119 output = a.communicate(text)[0] 1120 id = output.split('.')[0] 1121 if not id.isdigit() or a.returncode !=0: 1122 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1123 % output 1124 1125 self.submitted += 1 1126 self.submitted_ids.append(id) 1127 return id
1128 1129 @multiple_try()
1130 - def control_one_job(self, id):
1131 """ control the status of a single job with it's cluster id """ 1132 cmd = 'qstat '+str(id) 1133 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1134 stderr=subprocess.STDOUT) 1135 1136 for line in status.stdout: 1137 line = line.strip() 1138 if 'cannot connect to server' in line or 'cannot read reply' in line: 1139 raise ClusterManagmentError, 'server disconnected' 1140 if 'Unknown' in line: 1141 return 'F' 1142 elif line.startswith(str(id)): 1143 jobstatus = line.split()[4] 1144 else: 1145 jobstatus="" 1146 1147 if status.returncode != 0 and status.returncode is not None: 1148 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1149 if jobstatus in self.idle_tag: 1150 return 'I' 1151 elif jobstatus in self.running_tag: 1152 return 'R' 1153 return 'F'
1154 1155 1156 @multiple_try()
1157 - def control(self, me_dir):
1158 """ control the status of a single job with it's cluster id """ 1159 cmd = "qstat" 1160 status = misc.Popen([cmd], stdout=subprocess.PIPE) 1161 1162 me_dir = self.get_jobs_identifier(me_dir) 1163 1164 ongoing = [] 1165 1166 idle, run, fail = 0, 0, 0 1167 for line in status.stdout: 1168 if 'cannot connect to server' in line or 'cannot read reply' in line: 1169 raise ClusterManagmentError, 'server disconnected' 1170 if me_dir in line: 1171 ongoing.append(line.split()[0].split('.')[0]) 1172 status2 = line.split()[4] 1173 if status2 in self.idle_tag: 1174 idle += 1 1175 elif status2 in self.running_tag: 1176 run += 1 1177 elif status2 in self.complete_tag: 1178 if not self.check_termination(line.split()[0].split('.')[0]): 1179 idle += 1 1180 else: 1181 fail += 1 1182 1183 if status.returncode != 0 and status.returncode is not None: 1184 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1185 1186 for id in list(self.submitted_ids): 1187 if id not in ongoing: 1188 status2 = self.check_termination(id) 1189 if status2 == 'wait': 1190 run += 1 1191 elif status2 == 'resubmit': 1192 idle += 1 1193 1194 return idle, run, self.submitted - (idle+run+fail), fail
1195 1196 @multiple_try()
1197 - def remove(self, *args, **opts):
1198 """Clean the jobs on the cluster""" 1199 1200 if not self.submitted_ids: 1201 return 1202 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1203 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1204 self.submitted_ids = []
1205
1206 1207 -class SGECluster(Cluster):
1208 """Basic class for dealing with cluster submission""" 1209 # Class written by Arian Abrahantes. 1210 1211 name = 'sge' 1212 job_id = 'JOB_ID' 1213 idle_tag = ['qw', 'hqw','hRqw','w'] 1214 running_tag = ['r','t','Rr','Rt'] 1215 identifier_length = 10 1216
1217 - def def_get_path(self,location):
1218 """replace string for path issues""" 1219 location = os.path.realpath(location) 1220 homePath = os.getenv("HOME") 1221 if homePath: 1222 location = location.replace(homePath,'$HOME') 1223 return location
1224 1225 @multiple_try()
1226 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1227 required_output=[], nb_submit=0):
1228 """Submit a job prog to an SGE cluster""" 1229 1230 me_dir = self.get_jobs_identifier(cwd, prog) 1231 1232 1233 if cwd is None: 1234 #cwd = os.getcwd() 1235 cwd = self.def_get_path(os.getcwd()) 1236 cwd1 = self.def_get_path(cwd) 1237 text = " cd %s;" % cwd1 1238 if stdout is None: 1239 stdout = '/dev/null' 1240 else: 1241 stdout = self.def_get_path(stdout) 1242 if stderr is None: 1243 stderr = '/dev/null' 1244 elif stderr == -2: # -2 is subprocess.STDOUT 1245 stderr = stdout 1246 else: 1247 stderr = self.def_get_path(stderr) 1248 1249 if log is None: 1250 log = '/dev/null' 1251 else: 1252 log = self.def_get_path(log) 1253 1254 text += prog 1255 if argument: 1256 text += ' ' + ' '.join(argument) 1257 1258 #if anything slips through argument 1259 #print "!=== inteded change ",text.replace('/srv/nfs','') 1260 #text = text.replace('/srv/nfs','') 1261 homePath = os.getenv("HOME") 1262 if homePath: 1263 text = text.replace(homePath,'$HOME') 1264 1265 logger.debug("!=== input %s" % text) 1266 logger.debug("!=== output %s" % stdout) 1267 logger.debug("!=== error %s" % stderr) 1268 logger.debug("!=== logs %s" % log) 1269 1270 command = ['qsub','-o', stdout, 1271 '-N', me_dir, 1272 '-e', stderr, 1273 '-V'] 1274 1275 if self.cluster_queue and self.cluster_queue != 'None': 1276 command.extend(['-q', self.cluster_queue]) 1277 1278 a = misc.Popen(command, stdout=subprocess.PIPE, 1279 stderr=subprocess.STDOUT, 1280 stdin=subprocess.PIPE, cwd=cwd) 1281 1282 output = a.communicate(text)[0] 1283 id = output.split(' ')[2] 1284 if not id.isdigit(): 1285 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1286 % output 1287 self.submitted += 1 1288 self.submitted_ids.append(id) 1289 logger.debug(output) 1290 1291 return id
1292 1293 @multiple_try()
1294 - def control_one_job(self, id):
1295 """ control the status of a single job with it's cluster id """ 1296 #cmd = 'qstat '+str(id) 1297 cmd = 'qstat ' 1298 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1299 for line in status.stdout: 1300 #print "!==",line 1301 #line = line.strip() 1302 #if 'Unknown' in line: 1303 # return 'F' 1304 #elif line.startswith(str(id)): 1305 # status = line.split()[4] 1306 if str(id) in line: 1307 status = line.split()[4] 1308 #print "!=status", status 1309 if status in self.idle_tag: 1310 return 'I' 1311 elif status in self.running_tag: 1312 return 'R' 1313 return 'F'
1314 1315 @multiple_try()
1316 - def control(self, me_dir):
1317 """ control the status of a single job with it's cluster id """ 1318 cmd = "qstat " 1319 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1320 1321 me_dir = self.get_jobs_identifier(me_dir) 1322 1323 finished = list(self.submitted_ids) 1324 1325 idle, run, fail = 0, 0, 0 1326 for line in status.stdout: 1327 if me_dir in line: 1328 id,_,_,_,status = line.split()[:5] 1329 if status in self.idle_tag: 1330 idle += 1 1331 finished.remove(id) 1332 elif status in self.running_tag: 1333 run += 1 1334 finished.remove(id) 1335 else: 1336 logger.debug(line) 1337 fail += 1 1338 finished.remove(id) 1339 1340 for id in finished: 1341 self.check_termination(id) 1342 1343 return idle, run, self.submitted - (idle+run+fail), fail
1344 1345 1346 1347 @multiple_try()
1348 - def remove(self, *args, **opts):
1349 """Clean the jobs on the cluster""" 1350 1351 if not self.submitted_ids: 1352 return 1353 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1354 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1355 self.submitted_ids = []
1356
1357 1358 -class LSFCluster(Cluster):
1359 """Basic class for dealing with cluster submission""" 1360 1361 name = 'lsf' 1362 job_id = 'LSB_JOBID' 1363 1364 @multiple_try()
1365 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1366 required_output=[], nb_submit=0):
1367 """Submit the job prog to an LSF cluster""" 1368 1369 1370 me_dir = self.get_jobs_identifier(cwd, prog) 1371 1372 text = "" 1373 command = ['bsub', '-C0', '-J', me_dir] 1374 if cwd is None: 1375 cwd = os.getcwd() 1376 else: 1377 text = " cd %s;" % cwd 1378 if stdout and isinstance(stdout, str): 1379 command.extend(['-o', stdout]) 1380 if stderr and isinstance(stdout, str): 1381 command.extend(['-e', stderr]) 1382 elif stderr == -2: # -2 is subprocess.STDOUT 1383 pass 1384 if log is None: 1385 log = '/dev/null' 1386 1387 text += prog 1388 if argument: 1389 text += ' ' + ' '.join(argument) 1390 1391 if self.cluster_queue and self.cluster_queue != 'None': 1392 command.extend(['-q', self.cluster_queue]) 1393 1394 a = misc.Popen(command, stdout=subprocess.PIPE, 1395 stderr=subprocess.STDOUT, 1396 stdin=subprocess.PIPE, cwd=cwd) 1397 1398 output = a.communicate(text)[0] 1399 #Job <nnnn> is submitted to default queue <normal>. 1400 try: 1401 id = output.split('>',1)[0].split('<')[1] 1402 except: 1403 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1404 % output 1405 if not id.isdigit(): 1406 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1407 % output 1408 self.submitted += 1 1409 self.submitted_ids.append(id) 1410 return id
1411 1412 1413 @multiple_try()
1414 - def control_one_job(self, id):
1415 """ control the status of a single job with it's cluster id """ 1416 1417 cmd = 'bjobs '+str(id) 1418 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1419 1420 for line in status.stdout: 1421 line = line.strip().upper() 1422 if 'JOBID' in line: 1423 continue 1424 elif str(id) not in line: 1425 continue 1426 status = line.split()[2] 1427 if status == 'RUN': 1428 return 'R' 1429 elif status == 'PEND': 1430 return 'I' 1431 elif status == 'DONE': 1432 return 'F' 1433 else: 1434 return 'H' 1435 return 'F'
1436 1437 @multiple_try()
1438 - def control(self, me_dir):
1439 """ control the status of a single job with it's cluster id """ 1440 1441 if not self.submitted_ids: 1442 return 0, 0, 0, 0 1443 1444 cmd = "bjobs " + ' '.join(self.submitted_ids) 1445 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1446 1447 jobstatus = {} 1448 for line in status.stdout: 1449 line = line.strip() 1450 if 'JOBID' in line: 1451 continue 1452 splitline = line.split() 1453 id = splitline[0] 1454 if id not in self.submitted_ids: 1455 continue 1456 jobstatus[id] = splitline[2] 1457 1458 idle, run, fail = 0, 0, 0 1459 for id in self.submitted_ids[:]: 1460 if id in jobstatus: 1461 status = jobstatus[id] 1462 else: 1463 status = 'MISSING' 1464 if status == 'RUN': 1465 run += 1 1466 elif status == 'PEND': 1467 idle += 1 1468 else: 1469 status = self.check_termination(id) 1470 if status == 'wait': 1471 run += 1 1472 elif status == 'resubmit': 1473 idle += 1 1474 1475 return idle, run, self.submitted - (idle+run+fail), fail
1476 1477 @multiple_try()
1478 - def remove(self, *args,**opts):
1479 """Clean the jobs on the cluster""" 1480 1481 if not self.submitted_ids: 1482 return 1483 cmd = "bkill %s" % ' '.join(self.submitted_ids) 1484 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1485 self.submitted_ids = []
1486
1487 -class GECluster(Cluster):
1488 """Class for dealing with cluster submission on a GE cluster""" 1489 1490 name = 'ge' 1491 job_id = 'JOB_ID' 1492 idle_tag = ['qw'] 1493 running_tag = ['r'] 1494 1495 @multiple_try()
1496 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1497 required_output=[], nb_submit=0):
1498 """Submit a job prog to a GE cluster""" 1499 1500 text = "" 1501 if cwd is None: 1502 cwd = os.getcwd() 1503 else: 1504 text = " cd %s; bash " % cwd 1505 if stdout is None: 1506 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1]) 1507 if stderr is None: 1508 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1]) 1509 elif stderr == -2: # -2 is subprocess.STDOUT 1510 stderr = stdout 1511 if log is None: 1512 log = '/dev/null' 1513 1514 text += prog 1515 if argument: 1516 text += ' ' + ' '.join(argument) 1517 text += '\n' 1518 tmp_submit = os.path.join(cwd, 'tmp_submit') 1519 open(tmp_submit,'w').write(text) 1520 1521 a = misc.Popen(['qsub','-o', stdout, 1522 '-e', stderr, 1523 tmp_submit], 1524 stdout=subprocess.PIPE, 1525 stderr=subprocess.STDOUT, 1526 stdin=subprocess.PIPE, cwd=cwd) 1527 1528 output = a.communicate()[0] 1529 #Your job 874511 ("test.sh") has been submitted 1530 pat = re.compile("Your job (\d*) \(",re.MULTILINE) 1531 try: 1532 id = pat.search(output).groups()[0] 1533 except: 1534 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1535 % output 1536 self.submitted += 1 1537 self.submitted_ids.append(id) 1538 return id
1539 1540 @multiple_try()
1541 - def control_one_job(self, id):
1542 """ control the status of a single job with it's cluster id """ 1543 cmd = 'qstat | grep '+str(id) 1544 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1545 if not status: 1546 return 'F' 1547 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1548 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s") 1549 stat = '' 1550 for line in status.stdout.read().split('\n'): 1551 if not line: 1552 continue 1553 line = line.strip() 1554 try: 1555 groups = pat.search(line).groups() 1556 except: 1557 raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line 1558 if groups[0] != id: continue 1559 stat = groups[1] 1560 if not stat: 1561 return 'F' 1562 if stat in self.idle_tag: 1563 return 'I' 1564 if stat in self.running_tag: 1565 return 'R'
1566 1567 @multiple_try()
1568 - def control(self, me_dir=None):
1569 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 1570 if not self.submitted_ids: 1571 return 0, 0, 0, 0 1572 idle, run, fail = 0, 0, 0 1573 ongoing = [] 1574 for statusflag in ['p', 'r', 'sh']: 1575 cmd = 'qstat -s %s' % statusflag 1576 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1577 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1578 pat = re.compile("^(\d+)") 1579 for line in status.stdout.read().split('\n'): 1580 line = line.strip() 1581 try: 1582 id = pat.search(line).groups()[0] 1583 except Exception: 1584 pass 1585 else: 1586 if id not in self.submitted_ids: 1587 continue 1588 ongoing.append(id) 1589 if statusflag == 'p': 1590 idle += 1 1591 if statusflag == 'r': 1592 run += 1 1593 if statusflag == 'sh': 1594 fail += 1 1595 for id in list(self.submitted_ids): 1596 if id not in ongoing: 1597 self.check_termination(id) 1598 #self.submitted_ids = ongoing 1599 1600 return idle, run, self.submitted - idle - run - fail, fail
1601 1602 @multiple_try()
1603 - def remove(self, *args, **opts):
1604 """Clean the jobs on the cluster""" 1605 1606 if not self.submitted_ids: 1607 return 1608 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1609 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1610 self.submitted_ids = []
1611
1612  def asyncrone_launch(exe, cwd=None, stdout=None, argument=[], **opt):
1613      """Start a computation without waiting for it to finish.
1614      This function returns a lock which is locked as long as the job is
1615      running."""
1616   
1617      mc = MultiCore(1)
1618      mc.submit(exe, argument, cwd, stdout, **opt)
1619      mc.need_waiting = True
1620      return mc.lock
1621
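# Illustration only: asyncrone_launch() starts the job on a one-core MultiCore
# instance and returns its lock (a threading.Event) that the worker sets once
# the job has finished. The script name is hypothetical.
#
#     lock = asyncrone_launch('make_plots.sh', cwd='Events/run_01')
#     # ... do other work ...
#     lock.wait()    # block until the background job is done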
1622 1623 -class SLURMCluster(Cluster):
1624 """Basic class for dealing with cluster submission""" 1625 1626 name = 'slurm' 1627 job_id = 'SLURM_JOBID' 1628 idle_tag = ['Q','PD','S','CF'] 1629 running_tag = ['R', 'CG'] 1630 complete_tag = ['C'] 1631 identifier_length = 8 1632 1633 @multiple_try()
1634 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1635 required_output=[], nb_submit=0):
1636 """Submit a job prog to a SLURM cluster""" 1637 1638 me_dir = self.get_jobs_identifier(cwd, prog) 1639 1640 1641 if cwd is None: 1642 cwd = os.getcwd() 1643 if stdout is None: 1644 stdout = '/dev/null' 1645 if stderr is None: 1646 stderr = '/dev/null' 1647 elif stderr == -2: # -2 is subprocess.STDOUT 1648 stderr = stdout 1649 if log is None: 1650 log = '/dev/null' 1651 1652 command = ['sbatch', '-o', stdout, 1653 '-J', me_dir, 1654 '-e', stderr, prog] + argument 1655 1656 if self.cluster_queue and self.cluster_queue != 'None': 1657 command.insert(1, '-p') 1658 command.insert(2, self.cluster_queue) 1659 1660 a = misc.Popen(command, stdout=subprocess.PIPE, 1661 stderr=subprocess.STDOUT, 1662 stdin=subprocess.PIPE, cwd=cwd) 1663 1664 output = a.communicate() 1665 output_arr = output[0].split(' ') 1666 id = output_arr[3].rstrip() 1667 1668 if not id.isdigit(): 1669 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1670 % (output[0] + '\n' + output[1]) 1671 1672 self.submitted += 1 1673 self.submitted_ids.append(id) 1674 return id
1675 1676 @multiple_try()
1677 - def control_one_job(self, id):
1678 """ control the status of a single job with it's cluster id """ 1679 cmd = 'squeue j'+str(id) 1680 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1681 stderr=open(os.devnull,'w')) 1682 1683 for line in status.stdout: 1684 line = line.strip() 1685 if 'Invalid' in line: 1686 return 'F' 1687 elif line.startswith(str(id)): 1688 status = line.split()[4] 1689 if status in self.idle_tag: 1690 return 'I' 1691 elif status in self.running_tag: 1692 return 'R' 1693 return 'F'
1694 1695 @multiple_try()
1696 - def control(self, me_dir):
1697 """ control the status of a single job with it's cluster id """ 1698 cmd = "squeue" 1699 pstatus = misc.Popen([cmd], stdout=subprocess.PIPE) 1700 1701 me_dir = self.get_jobs_identifier(me_dir) 1702 1703 idle, run, fail = 0, 0, 0 1704 ongoing=[] 1705 for line in pstatus.stdout: 1706 if me_dir in line: 1707 id, _, _,_ , status,_ = line.split(None,5) 1708 ongoing.append(id) 1709 if status in self.idle_tag: 1710 idle += 1 1711 elif status in self.running_tag: 1712 run += 1 1713 elif status in self.complete_tag: 1714 status = self.check_termination(id) 1715 if status == 'wait': 1716 run += 1 1717 elif status == 'resubmit': 1718 idle += 1 1719 else: 1720 fail += 1 1721 1722 #control other finished job 1723 for id in list(self.submitted_ids): 1724 if id not in ongoing: 1725 status = self.check_termination(id) 1726 if status == 'wait': 1727 run += 1 1728 elif status == 'resubmit': 1729 idle += 1 1730 1731 1732 return idle, run, self.submitted - (idle+run+fail), fail
1733 1734 @multiple_try()
1735 - def remove(self, *args, **opts):
1736 """Clean the jobs on the cluster""" 1737 1738 if not self.submitted_ids: 1739 return 1740 cmd = "scancel %s" % ' '.join(self.submitted_ids) 1741 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1742 self.submitted_ids = []
1743
1744 -class HTCaaSCluster(Cluster):
1745 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """ 1746 1747 name= 'htcaas' 1748 job_id = 'HTCAAS_JOBID' 1749 idle_tag = ['waiting'] 1750 running_tag = ['preparing','running'] 1751 complete_tag = ['done'] 1752 1753 @store_input() 1754 @multiple_try()
1755 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1756 log=None, input_files=[], output_files=[], required_output=[], 1757 nb_submit=0):
1758 """Submit the HTCaaS job on the cluster with NO SHARE DISK 1759 input/output file should be given as relative to CWd 1760 """ 1761 # To make workspace name(temp) 1762 cur_usr = os.getenv('USER') 1763 1764 if cwd is None: 1765 cwd = os.getcwd() 1766 1767 cwd_cp = cwd.rsplit("/",2) 1768 1769 if not stdout is None: 1770 print "stdout: %s" % stdout 1771 1772 if not os.path.exists(prog): 1773 prog = os.path.join(cwd, prog) 1774 1775 if not required_output and output_files: 1776 required_output = output_files 1777 1778 logger.debug(prog) 1779 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog : 1780 cwd_arg = cwd+"/arguments" 1781 temp = ' '.join([str(a) for a in argument]) 1782 arg_cmd="echo '"+temp+"' > " + cwd_arg 1783 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)] 1784 if argument : 1785 command.extend(['-a ', '='.join([str(a) for a in argument])]) 1786 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1787 id = a.stdout.read().strip() 1788 1789 else: 1790 cwd_arg = cwd+"/arguments" 1791 temp = ' '.join([str(a) for a in argument]) 1792 temp_file_name = "sub." + os.path.basename(prog) 1793 text = """#!/bin/bash 1794 MYPWD=%(cwd)s 1795 cd $MYPWD 1796 input_files=(%(input_files)s ) 1797 for i in ${input_files[@]} 1798 do 1799 chmod -f +x $i 1800 done 1801 /bin/bash %(prog)s %(arguments)s > %(stdout)s 1802 """ 1803 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog, 1804 'arguments': ' '.join([str(a) for a in argument]), 1805 'program': ' ' if '.py' in prog else 'bash'} 1806 1807 # writing a new script for the submission 1808 new_prog = pjoin(cwd, temp_file_name) 1809 open(new_prog, 'w').write(text % dico) 1810 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 1811 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name] 1812 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1813 id = a.stdout.read().strip() 1814 logger.debug(id) 1815 1816 nb_try=0 1817 nb_limit=5 1818 if not id.isdigit() : 1819 print "[ID is not digit]:" + id 1820 1821 while not id.isdigit() : 1822 nb_try+=1 1823 print "[fail_retry]:"+ nb_try 1824 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1825 id = a.stdout.read().strip() 1826 if nb_try > nb_limit : 1827 raise ClusterManagementError, 'fail to submit to the HTCaaS cluster: \n %s' % id 1828 break 1829 1830 self.submitted += 1 1831 self.submitted_ids.append(id) 1832 1833 return id
1834 1835 @multiple_try(nb_try=10, sleep=5)
1836 - def control_one_job(self, id):
1837 """ control the status of a single job with it's cluster id """ 1838 1839 if id == 0 : 1840 status_out ='C' 1841 else : 1842 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status " 1843 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE, 1844 stderr=subprocess.PIPE) 1845 error = status.stderr.read() 1846 if status.returncode or error: 1847 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error 1848 status_out= status.stdout.read().strip() 1849 status_out= status_out.split(":",1)[1] 1850 if status_out == 'waiting': 1851 status_out='I' 1852 elif status_out == 'preparing' or status_out == 'running': 1853 status_out = 'R' 1854 elif status_out != 'done': 1855 status_out = 'F' 1856 elif status_out == 'done': 1857 status_out = 'C' 1858 1859 return status_out
1860 1861 @multiple_try()
1862 - def control(self, me_dir):
1863 """ control the status of a single job with it's cluster id """ 1864 if not self.submitted_ids: 1865 logger.debug("self.submitted_ids not exists") 1866 return 0, 0, 0, 0 1867 1868 ongoing = [] 1869 idle, run, fail = 0, 0, 0 1870 1871 start = self.submitted_ids[0] 1872 end = self.submitted_ids[-1] 1873 1874 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end)#+" -ac" 1875 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1876 1877 for line in status.stdout: 1878 #ongoing.append(line.split()[0].strip()) 1879 status2 = line.split()[-1] 1880 if status2 is not 'null' or line.split()[0].strip() is not '0': 1881 ongoing.append(line.split()[0].strip()) 1882 logger.debug("["+line.split()[0].strip()+"]"+status2) 1883 if status2 is 'null' or line.split()[0].strip() is '0': 1884 idle += 1 1885 elif status2 in self.idle_tag: 1886 idle += 1 1887 elif status2 in self.running_tag: 1888 run += 1 1889 elif status2 in self.complete_tag: 1890 if not self.check_termination(line.split()[0]): 1891 idle +=1 1892 else: 1893 fail += 1 1894 1895 return idle, run, self.submitted - (idle+run+fail), fail
1896 1897 @multiple_try()
1898 - def remove(self, *args, **opts):
1899 """Clean the jobson the cluster""" 1900 1901 if not self.submitted_ids: 1902 return 1903 for i in range(len(self.submitted_ids)): 1904 cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i] 1905 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1906
1907 -class HTCaaS2Cluster(Cluster):
1908 """Class for dealing with cluster submission on a HTCaaS cluster without GPFS """ 1909 1910 name= 'htcaas2' 1911 job_id = 'HTCAAS2_JOBID' 1912 idle_tag = ['waiting'] 1913 running_tag = ['preparing','running'] 1914 complete_tag = ['done'] 1915 1916 @store_input() 1917 @multiple_try()
1918 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1919 log=None, input_files=[], output_files=[], required_output=[], 1920 nb_submit=0):
1921 1922 """Submit the HTCaaS job on the cluster with NO SHARE DISK 1923 input/output file should be given as relative to CWD 1924 """ 1925 if cwd is None: 1926 cwd = os.getcwd() 1927 1928 if not os.path.exists(prog): 1929 prog = os.path.join(cwd, prog) 1930 1931 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog : 1932 if cwd or prog : 1933 self.submitted_dirs.append(cwd) 1934 self.submitted_exes.append(prog) 1935 else: 1936 logger.debug("cwd and prog not exist->"+cwd+" / "+ os.path.basename(prog)) 1937 1938 if argument : 1939 self.submitted_args.append('='.join([str(a) for a in argument])) 1940 1941 if cwd or prog : 1942 self.submitted += 1 1943 id = self.submitted 1944 self.submitted_ids.append(id) 1945 else: 1946 logger.debug("cwd and prog are not exist! ") 1947 id = 0 1948 1949 else: 1950 temp_file_name = "sub."+ os.path.basename(prog) 1951 text = """#!/bin/bash 1952 MYPWD=%(cwd)s 1953 cd $MYPWD 1954 input_files=(%(input_files)s ) 1955 for i in ${input_files[@]} 1956 do 1957 chmod -f +x $i 1958 done 1959 /bin/bash %(prog)s %(arguments)s > %(stdout)s 1960 """ 1961 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog, 1962 'arguments': ' '.join([str(a) for a in argument]), 1963 'program': ' ' if '.py' in prog else 'bash'} 1964 # writing a new script for the submission 1965 new_prog = pjoin(cwd, temp_file_name) 1966 open(new_prog, 'w').write(text % dico) 1967 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 1968 command = ['htcaas-mgjob-submit','-d',cwd,'-e',new_prog] 1969 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1970 id = a.stdout.read().strip() 1971 logger.debug("[mode2]-["+str(id)+"]") 1972 if cwd and prog : 1973 self.submitted += 1 1974 self.submitted_ids.append(id) 1975 else: 1976 logger.debug("cwd and prog are not exist! ") 1977 id = 0 1978 1979 return id
1980 1981 @multiple_try()
1982 - def metasubmit(self, me_dir=None):
1983          if self.submitted > 1100 and self.submitted == len(self.submitted_ids):
1984              tmp_leng= len(self.submitted_ids)/2
1985              tmp_dirs1= self.submitted_dirs[0:tmp_leng]
1986              tmp_dirs2= self.submitted_dirs[tmp_leng:]
1987              tmp_exes1= self.submitted_exes[0:tmp_leng]
1988              tmp_exes2= self.submitted_exes[tmp_leng:]
1989              command1 = ['htcaas-mgjob-submit','-d',":".join([str(a) for a in tmp_dirs1 if a and a != ' ']),
1990                          '-e', ":".join([str(a) for a in tmp_exes1 if a and a != ' '])]
1991              command2 = ['htcaas-mgjob-submit','-d',":".join([str(a) for a in tmp_dirs2 if a and a != ' ']),
1992                          '-e', ":".join([str(a) for a in tmp_exes2 if a and a != ' '])]
1993              if len(self.submitted_args) > 0 :
1994                  tmp_args1= self.submitted_args[0:tmp_leng]
1995                  tmp_args2= self.submitted_args[tmp_leng:]
1996                  command1.extend(['-a', ':'.join([str(a) for a in tmp_args1])])
1997                  command2.extend(['-a', ':'.join([str(a) for a in tmp_args2])])
1998              result1 = misc.Popen(command1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
1999              result2 = misc.Popen(command2, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
2000              me_dir = str(result1.stdout.read().strip())+ "//" + str(result2.stdout.read().strip())
2001  
2002          elif self.submitted > 0 and self.submitted == self.submitted_ids[-1]:
2003              command = ['htcaas-mgjob-submit','-d',":".join([str(a) for a in self.submitted_dirs if a and a != ' ']),
2004                         '-e', ":".join([str(a) for a in self.submitted_exes if a and a != ' '])]
2005              if len(self.submitted_args) > 0 :
2006                  command.extend(['-a', ':'.join([str(a) for a in self.submitted_args])])
2007              if self.submitted_dirs[0] or self.submitted_exes[0] :
2008                  result = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
2009                  me_dir = result.stdout.read().strip()
2010                  self.submitted_ids[0]=me_dir
2011              else:
2012                  me_dir = self.submitted_ids[-1]
2013          elif self.submitted > 0 and self.submitted != self.submitted_ids[-1]:
2014              me_dir = self.submitted_ids[0]
2015          else:
2016              me_dir = -1
2017  
2018          logger.debug("[" + str(me_dir) + "]")
2019  
2020          self.submitted_dirs = []
2021          self.submitted_exes = []
2022          self.submitted_args = []
2023  
2024          return me_dir
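This deferred, batched submission is why HTCaaS2Cluster.submit2 only buffers directories, executables and arguments: metasubmit later merges them into one (or, above 1100 jobs, two) colon-separated meta-job submissions. For illustration, with three buffered single jobs the merged command would look roughly like the sketch below (directories, executables and arguments are hypothetical):

# Sketch of the merged command built by metasubmit (illustrative values only).
command = ['htcaas-mgjob-submit',
           '-d', '/tmp/PROC_example/P1:/tmp/PROC_example/P2:/tmp/PROC_example/P3',
           '-e', 'ajob1:ajob2:ajob3',
           '-a', '0=1:0=2:0=3']
# One colon-separated field per buffered job; the command prints the meta-job
# id, which then replaces self.submitted_ids[0].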
2025 2026 2027 @multiple_try(nb_try=10, sleep=5)
2028 - def control_one_job(self, id):
2029 """ control the status of a single job with it's cluster id """ 2030 #logger.debug("CONTROL ONE JOB MODE") 2031 if self.submitted == self.submitted_ids[-1] : 2032 id = self.metasubmit(self) 2033 tempid = self.submitted_ids[-1] 2034 self.submitted_ids.remove(self.submitted_ids[-1]) 2035 self.submitted_ids.append(id) 2036 logger.debug(str(id)+" // "+str(self.submitted_ids[-1])) 2037 2038 if id == 0 : 2039 status_out ='C' 2040 else: 2041 cmd = 'htcaas-job-status -m '+ str(id) + " -s | grep Status " 2042 status = misc.Popen([cmd],shell=True,stdout=subprocess.PIPE, 2043 stderr=subprocess.PIPE) 2044 error = status.stderr.read() 2045 if status.returncode or error: 2046 raise ClusterManagmentError, 'htcaas-job-status returns error: %s' % error 2047 status_out= status.stdout.read().strip() 2048 status_out= status_out.split(":",1)[1] 2049 logger.debug("[["+str(id)+"]]"+status_out) 2050 if status_out == 'waiting': 2051 status_out='I' 2052 elif status_out == 'preparing' or status_out == 'running': 2053 status_out = 'R' 2054 elif status_out != 'done': 2055 status_out = 'F' 2056 elif status_out == 'done': 2057 status_out = 'C' 2058 self.submitted -= 1 2059 2060 return status_out
2061 2062 @multiple_try()
2063 - def control(self, me_dir):
2064 """ control the status of a single job with it's cluster id """ 2065 if not self.submitted_ids: 2066 logger.debug("self.submitted_ids not exists") 2067 return 0, 0, 0, 0 2068 2069 if "//" in me_dir : 2070 if int(me_dir.split("//")[0]) < int(me_dir.split("//")[1]) : 2071 start = me_dir.split("//")[0] 2072 end = me_dir.split("//")[1] 2073 else : 2074 start = me_dir.split("//")[1] 2075 end = me_dir.split("//")[0] 2076 elif "/" in me_dir : # update 2077 start = 0 2078 end = 0 2079 elif me_dir.isdigit(): 2080 start = me_dir 2081 end = me_dir 2082 elif not me_dir.isdigit(): 2083 me_dir = self.submitted_ids[0] 2084 logger.debug("Meta_ID is not digit(control), self.submitted_ids[0]: "+str(me_dir) ) 2085 2086 ongoing = [] 2087 idle, run, fail, done = 0, 0, 0, 0 2088 2089 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end) +" -ac" 2090 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 2091 2092 for line in status.stdout: 2093 status2 = line.split()[-1] 2094 if status2 is not 'null' or line.split()[0].strip() is not '0': 2095 ongoing.append(str(line.split()[0].strip())+"-"+str(line.split()[1].strip())) 2096 logger.debug("["+line.split()[0].strip()+"-"+line.split()[1].strip()+"]"+status2) 2097 2098 if status2 is 'null' or line.split()[0].strip() is '0': 2099 idle += 1 2100 elif status2 in self.idle_tag: 2101 idle += 1 2102 elif status2 in self.running_tag: 2103 run += 1 2104 elif status2 in self.complete_tag: 2105 done += 1 2106 self.submitted -= 1 2107 if not self.check_termination(line.split()[1]): 2108 idle +=1 2109 else: 2110 fail += 1 2111 2112 return idle, run, self.submitted - (idle+run+fail), fail
2113 2114 @multiple_try()
2115 - def remove(self, *args, **opts):
2116 """Clean the jobson the cluster""" 2117 2118 if not self.submitted_ids: 2119 return 2120 id = self.submitted_ids[0] 2121 if id is not 0 : 2122 cmd = "htcaas-job-cancel -m %s" % str(id) 2123 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
2124 2125 from_name = {'condor':CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster, 2126 'lsf': LSFCluster, 'ge':GECluster, 'slurm': SLURMCluster, 2127 'htcaas':HTCaaSCluster, 'htcaas2':HTCaaS2Cluster} 2128
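A hedged example of how the from_name dispatch map is typically used; the option keys follow the cluster_* names accepted by the base Cluster constructor, and the values shown are illustrative only:

# Sketch: instantiate a cluster handler by its configuration name.
cluster_name = 'htcaas2'                      # e.g. taken from the run options
opts = {'cluster_queue': 'madgraph',          # illustrative values
        'cluster_nb_retry': 1,
        'cluster_retry_wait': 300}
mycluster = from_name[cluster_name](**opts)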