Package madgraph :: Package various :: Module cluster
[hide private]
[frames] | no frames]

Source Code for Module madgraph.various.cluster

   1  ################################################################################ 
   2  # Copyright (c) 2009 The MadGraph5_aMC@NLO Development team and Contributors              
   3  # 
   4  # This file is a part of the MadGraph5_aMC@NLO project, an application which            
   5  # automatically generates Feynman diagrams and matrix elements for arbitrary     
   6  # high-energy processes in the Standard Model and beyond.                        
   7  # 
   8  # It is subject to the MadGraph5_aMC@NLO license which should accompany this              
   9  # distribution.                                                                  
  10  #                                                                                
  11  # For more information, visit madgraph.phys.ucl.ac.be and amcatnlo.web.cern.ch             
  12  #                                                                                
  13  ################################################################################ 
  14  import subprocess 
  15  import logging 
  16  import os 
  17  import time 
  18  import re 
  19  import glob 
  20  import inspect 
  21  import sys 
  22   
  23  logger = logging.getLogger('madgraph.cluster')  
  24   
  25  try: 
  26      from madgraph import MadGraph5Error 
  27      import madgraph.various.misc as misc 
  28  except Exception, error: 
  29      if __debug__: 
  30          print  str(error) 
  31      from internal import MadGraph5Error 
  32      import internal.misc as misc 
  33   
  34  pjoin = os.path.join 
35 36 -class ClusterManagmentError(MadGraph5Error):
37 pass
38
39 -class NotImplemented(MadGraph5Error):
40 pass
41 42 43 multiple_try = misc.multiple_try 44 pjoin = os.path.join
45 46 47 -def check_interupt(error=KeyboardInterrupt):
48 49 def deco_interupt(f): 50 def deco_f_interupt(self, *args, **opt): 51 try: 52 return f(self, *args, **opt) 53 except error: 54 try: 55 self.remove(*args, **opt) 56 except Exception: 57 pass 58 raise error
59 return deco_f_interupt 60 return deco_interupt 61
62 -def store_input(arg=''):
63 64 def deco_store(f): 65 def deco_f_store(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 66 input_files=[], output_files=[], required_output=[], nb_submit=0): 67 frame = inspect.currentframe() 68 args, _, _, values = inspect.getargvalues(frame) 69 args = dict([(i, values[i]) for i in args if i != 'self']) 70 id = f(self, **args) 71 if self.nb_retry > 0: 72 self.retry_args[id] = args 73 return id
74 return deco_f_store 75 return deco_store 76
77 -def need_transfer(options):
78 """ This function checks whether compression of input files are necessary 79 given the running options given. """ 80 81 if options['run_mode'] != 1 and options['cluster_temp_path'] is None: 82 return False 83 else: 84 return True
85
86 -class Cluster(object):
87 """Basic Class for all cluster type submission""" 88 name = 'mother class' 89 identifier_length = 14 90
91 - def __init__(self,*args, **opts):
92 """Init the cluster""" 93 94 self.submitted = 0 95 self.submitted_ids = [] 96 self.finish = 0 97 self.submitted_dirs = [] #HTCaaS 98 self.submitted_exes = [] #HTCaaS 99 self.submitted_args = [] #HTCaaS 100 101 if 'cluster_queue' in opts: 102 self.cluster_queue = opts['cluster_queue'] 103 else: 104 self.cluster_queue = 'madgraph' 105 if 'cluster_temp_path' in opts: 106 self.temp_dir = opts['cluster_temp_path'] 107 else: 108 self.temp_dir = None 109 self.options = {'cluster_status_update': (600, 30)} 110 for key,value in opts.items(): 111 self.options[key] = value 112 self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0 113 self.cluster_retry_wait = float(opts['cluster_retry_wait']) if 'cluster_retry_wait' in opts else 300 114 self.options = dict(opts) 115 self.retry_args = {} 116 # controlling jobs in controlled type submision 117 self.packet = {} 118 self.id_to_packet = {}
119
120 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 121 log=None, required_output=[], nb_submit=0):
122 """How to make one submission. Return status id on the cluster.""" 123 raise NotImplemented, 'No implementation of how to submit a job to cluster \'%s\'' % self.name
124 125 126 @store_input()
127 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 128 log=None, input_files=[], output_files=[], required_output=[], 129 nb_submit=0):
130 """How to make one submission. Return status id on the cluster. 131 NO SHARE DISK""" 132 133 if cwd is None: 134 cwd = os.getcwd() 135 if not os.path.exists(prog): 136 prog = os.path.join(cwd, prog) 137 138 if not required_output and output_files: 139 required_output = output_files 140 141 if not hasattr(self, 'temp_dir') or not self.temp_dir or \ 142 (input_files == [] == output_files): 143 return self.submit(prog, argument, cwd, stdout, stderr, log, 144 required_output=required_output, nb_submit=nb_submit) 145 146 if not input_files and not output_files: 147 # not input/output so not using submit2 148 return self.submit(prog, argument, cwd, stdout, stderr, log, 149 required_output=required_output, nb_submit=nb_submit) 150 151 if cwd is None: 152 cwd = os.getcwd() 153 if not os.path.exists(prog): 154 prog = os.path.join(cwd, prog) 155 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument) 156 157 text = """#!/bin/bash 158 MYTMP=%(tmpdir)s/run$%(job_id)s 159 MYPWD=%(cwd)s 160 mkdir -p $MYTMP 161 cd $MYPWD 162 input_files=( %(input_files)s ) 163 for i in ${input_files[@]} 164 do 165 cp -R -L $i $MYTMP 166 done 167 cd $MYTMP 168 echo '%(arguments)s' > arguments 169 chmod +x ./%(script)s 170 %(program)s ./%(script)s %(arguments)s 171 exit=$? 172 output_files=( %(output_files)s ) 173 for i in ${output_files[@]} 174 do 175 cp -r $MYTMP/$i $MYPWD 176 done 177 # if [ "$exit" -eq "0" ] 178 # then 179 rm -rf $MYTMP 180 # fi 181 """ 182 183 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog), 184 'cwd': cwd, 'job_id': self.job_id, 185 'input_files': ' '.join(input_files + [prog]), 186 'output_files': ' '.join(output_files), 187 'arguments': ' '.join([str(a) for a in argument]), 188 'program': ' ' if '.py' in prog else 'bash'} 189 190 # writing a new script for the submission 191 new_prog = pjoin(cwd, temp_file_name) 192 open(new_prog, 'w').write(text % dico) 193 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 194 195 return self.submit(new_prog, argument, cwd, stdout, stderr, log, 196 required_output=required_output, nb_submit=nb_submit)
197 198
199 - def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 200 log=None, input_files=[], output_files=[], required_output=[], 201 nb_submit=0, packet_member=None):
202 """This function wrap the cluster submition with cluster independant 203 method should not be overwritten (but for DAG type submission)""" 204 205 id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files, 206 output_files, required_output, nb_submit) 207 208 209 if not packet_member: 210 return id 211 else: 212 if isinstance(packet_member, Packet): 213 self.id_to_packet[id] = packet_member 214 packet_member.put(id) 215 if packet_member.tag not in self.packet: 216 self.packet[packet_member.tag] = packet_member 217 else: 218 if packet_member in self.packet: 219 packet = self.packet[packet_member] 220 packet.put(id) 221 self.id_to_packet[id] = packet 222 return id
223
224 - def control(self, me_dir=None):
225 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 226 if not self.submitted_ids: 227 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name 228 idle, run, fail = 0, 0, 0 229 for pid in self.submitted_ids[:]: 230 status = self.control_one_job(id) 231 if status == 'I': 232 idle += 1 233 elif status == 'R': 234 run += 1 235 elif status == 'F': 236 self.finish +=1 237 self.submitted_ids.remove(pid) 238 else: 239 fail += 1 240 241 return idle, run, self.finish, fail
242
243 - def control_one_job(self, pid):
244 """ control the status of a single job with it's cluster id """ 245 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
246
247 - def get_jobs_identifier(self, path, second_path=None):
248 """get a unique run_name for all the jobs helps to identify the runs 249 in the controller for some cluster.""" 250 251 if second_path: 252 path = os.path.realpath(pjoin(path, second_path)) 253 elif not os.path.exists(path): 254 return path # job already done 255 256 if 'SubProcesses' in path: 257 target = path.rsplit('/SubProcesses',1)[0] 258 elif 'MCatNLO' in path: 259 target = path.rsplit('/MCatNLO',1)[0] 260 elif second_path: 261 target=path 262 logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problem.") 263 elif 'PY8_parallelization' in path: 264 target = path.rsplit('/PY8_parallelization',1)[0] 265 else: 266 target = path 267 268 if target.endswith('/'): 269 target = target[:-1] 270 271 target = misc.digest(target)[-self.identifier_length:] 272 if not target[0].isalpha(): 273 target = 'a' + target[1:] 274 275 return target
276 277 278 @check_interupt()
279 - def wait(self, me_dir, fct, minimal_job=0, update_first=None):
280 """Wait that all job are finish. 281 if minimal_job set, then return if idle + run is lower than that number""" 282 283 284 mode = 1 # 0 is long waiting/ 1 is short waiting 285 nb_iter = 0 286 nb_short = 0 287 change_at = 5 # number of iteration from which we wait longer between update. 288 289 if update_first: 290 idle, run, finish, fail = self.control(me_dir) 291 update_first(idle, run, finish) 292 293 #usefull shortcut for readibility 294 longtime, shorttime = self.options['cluster_status_update'] 295 296 nb_job = 0 297 298 if self.options['cluster_type'] == 'htcaas2': 299 me_dir = self.metasubmit(self) 300 301 while 1: 302 old_mode = mode 303 nb_iter += 1 304 idle, run, finish, fail = self.control(me_dir) 305 if nb_job: 306 if idle + run + finish + fail != nb_job: 307 nb_job = idle + run + finish + fail 308 nb_iter = 1 # since some packet finish prevent to pass in long waiting mode 309 else: 310 nb_job = idle + run + finish + fail 311 if fail: 312 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team') 313 if idle + run == 0: 314 #time.sleep(20) #security to ensure that the file are really written on the disk 315 logger.info('All jobs finished') 316 fct(idle, run, finish) 317 break 318 if idle + run < minimal_job: 319 return 320 fct(idle, run, finish) 321 #Determine how much we have to wait (mode=0->long time, mode=1->short time) 322 if nb_iter < change_at: 323 mode = 1 324 elif idle < run: 325 if old_mode == 0: 326 if nb_short: 327 mode = 0 #we already be back from short to long so stay in long 328 #check if we need to go back to short mode 329 elif idle: 330 if nb_iter > change_at + int(longtime)//shorttime: 331 mode = 0 #stay in long waiting mode 332 else: 333 mode = 1 # pass in short waiting mode 334 nb_short =0 335 else: 336 mode = 1 # pass in short waiting mode 337 nb_short = 0 338 elif old_mode == 1: 339 nb_short +=1 340 if nb_short > 3* max(change_at, int(longtime)//shorttime): 341 mode = 0 #go back in slow waiting 342 else: 343 mode = 0 344 345 #if pass from fast(mode=1) to slow(mode=0) make a print statement: 346 if old_mode > mode: 347 logger.info('''Start to wait %ss between checking status. 348 Note that you can change this time in the configuration file. 349 Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0]) 350 351 #now Waiting! 352 if mode == 0: 353 try: 354 time.sleep(self.options['cluster_status_update'][0]) 355 except KeyboardInterrupt: 356 logger.info('start to update the status') 357 nb_iter = min(0, change_at -2) 358 nb_short = 0 359 else: 360 time.sleep(self.options['cluster_status_update'][1]) 361 362 363 self.submitted = 0 364 self.submitted_ids = []
365
366 - def check_termination(self, job_id):
367 """Check the termination of the jobs with job_id and relaunch it if needed.""" 368 369 370 if job_id not in self.retry_args: 371 if job_id in self.id_to_packet: 372 nb_in_packet = self.id_to_packet[job_id].remove_one() 373 if nb_in_packet == 0: 374 # packet done run the associate function 375 packet = self.id_to_packet[job_id] 376 # fully ensure that the packet is finished (thread safe) 377 packet.queue.join() 378 #running the function 379 packet.fct(*packet.args) 380 del self.id_to_packet[job_id] 381 return 'resubmit' 382 else: 383 return True 384 385 args = self.retry_args[job_id] 386 if 'time_check' in args: 387 time_check = args['time_check'] 388 else: 389 time_check = 0 390 391 for path in args['required_output']: 392 if args['cwd']: 393 path = pjoin(args['cwd'], path) 394 # check that file exists and is not empty. 395 if not (os.path.exists(path) and os.stat(path).st_size != 0) : 396 break 397 else: 398 # all requested output are present 399 if time_check > 0: 400 logger.info('Job %s Finally found the missing output.' % (job_id)) 401 del self.retry_args[job_id] 402 self.submitted_ids.remove(job_id) 403 # check if the job_id is in a packet 404 if job_id in self.id_to_packet: 405 nb_in_packet = self.id_to_packet[job_id].remove_one() 406 if nb_in_packet == 0: 407 # packet done run the associate function 408 packet = self.id_to_packet[job_id] 409 # fully ensure that the packet is finished (thread safe) 410 packet.queue.join() 411 #running the function 412 packet.fct(*packet.args) 413 del self.id_to_packet[job_id] 414 return 'resubmit' 415 416 return 'done' 417 418 if time_check == 0: 419 logger.debug('''Job %s: missing output:%s''' % (job_id,path)) 420 args['time_check'] = time.time() 421 return 'wait' 422 elif self.cluster_retry_wait > time.time() - time_check: 423 return 'wait' 424 425 #jobs failed to be completed even after waiting time!! 426 if self.nb_retry < 0: 427 logger.critical('''Fail to run correctly job %s. 428 with option: %s 429 file missing: %s''' % (job_id, args, path)) 430 raw_input('press enter to continue.') 431 elif self.nb_retry == 0: 432 logger.critical('''Fail to run correctly job %s. 433 with option: %s 434 file missing: %s. 435 Stopping all runs.''' % (job_id, args, path)) 436 self.remove() 437 elif args['nb_submit'] >= self.nb_retry: 438 logger.critical('''Fail to run correctly job %s. 439 with option: %s 440 file missing: %s 441 Fails %s times 442 No resubmition. ''' % (job_id, args, path, args['nb_submit'])) 443 self.remove() 444 else: 445 args['nb_submit'] += 1 446 logger.warning('resubmit job (for the %s times)' % args['nb_submit']) 447 del self.retry_args[job_id] 448 self.submitted_ids.remove(job_id) 449 if 'time_check' in args: 450 del args['time_check'] 451 if job_id in self.id_to_packet: 452 self.id_to_packet[job_id].remove_one() 453 args['packet_member'] = self.id_to_packet[job_id] 454 del self.id_to_packet[job_id] 455 self.cluster_submit(**args) 456 else: 457 self.submit2(**args) 458 return 'resubmit' 459 return 'done'
460 461 @check_interupt()
462 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 463 stderr=None, log=None, required_output=[], nb_submit=0, 464 input_files=[], output_files=[]):
465 """launch one job on the cluster and wait for it""" 466 467 special_output = False # tag for concatenate the error with the output. 468 if stderr == -2 and stdout: 469 #We are suppose to send the output to stdout 470 special_output = True 471 stderr = stdout + '.err' 472 473 id = self.submit2(prog, argument, cwd, stdout, stderr, log, 474 required_output=required_output, input_files=input_files, 475 output_files=output_files) 476 477 if self.options['cluster_type']=='htcaas2': 478 if self.submitted == self.submitted_ids[-1]: 479 id = self.metasubmit(self) 480 481 frame = inspect.currentframe() 482 args, _, _, values = inspect.getargvalues(frame) 483 args = dict([(i, values[i]) for i in args if i != 'self']) 484 self.retry_args[id] = args 485 486 nb_wait=0 487 while 1: 488 nb_wait+=1 489 status = self.control_one_job(id) 490 if not status in ['R','I']: 491 status = self.check_termination(id) 492 if status in ['wait']: 493 time.sleep(30) 494 continue 495 elif status in ['resubmit']: 496 id = self.submitted_ids[0] 497 time.sleep(30) 498 continue 499 #really stop! 500 time.sleep(30) #security to ensure that the file are really written on the disk 501 break 502 time.sleep(self.options['cluster_status_update'][1]) 503 504 if required_output: 505 status = self.check_termination(id) 506 if status == 'wait': 507 run += 1 508 elif status == 'resubmit': 509 idle += 1 510 511 512 if special_output: 513 # combine the stdout and the stderr 514 #wait up to 50 s to see if those files exists 515 for i in range(5): 516 if os.path.exists(stdout): 517 if not os.path.exists(stderr): 518 time.sleep(5) 519 if os.path.exists(stderr): 520 err_text = open(stderr).read() 521 if not err_text: 522 return 523 logger.warning(err_text) 524 text = open(stdout).read() 525 open(stdout,'w').write(text + err_text) 526 else: 527 return 528 time.sleep(10)
529
530 - def remove(self, *args, **opts):
531 """ """ 532 logger.warning("""This cluster didn't support job removal, 533 the jobs are still running on the cluster.""")
534 535 @store_input()
536 - def metasubmit(self, me_dir):
537 logger.warning("""This cluster didn't support metajob submit.""") 538 return 0
539
540 -class Packet(object):
541 """ an object for handling packet of job, it is designed to be thread safe 542 """ 543
544 - def __init__(self, name, fct, args, opts={}):
545 import Queue 546 import threading 547 self.queue = Queue.Queue() 548 self.tag = name 549 self.fct = fct 550 self.args = args 551 self.opts = opts 552 self.done = threading.Event()
553
554 - def put(self, *args, **opts):
555 self.queue.put(*args, **opts)
556 557 append = put 558
559 - def remove_one(self):
560 self.queue.get(True) 561 self.queue.task_done() 562 return self.queue.qsize()
563
564 -class MultiCore(Cluster):
565 """class for dealing with the submission in multiple node""" 566 567 job_id = "$" 568
569 - def __init__(self, *args, **opt):
570 """Init the cluster """ 571 572 573 super(MultiCore, self).__init__(self, *args, **opt) 574 575 import Queue 576 import threading 577 import thread 578 self.queue = Queue.Queue() # list of job to do 579 self.done = Queue.Queue() # list of job finisned 580 self.submitted = Queue.Queue() # one entry by job submitted 581 self.stoprequest = threading.Event() #flag to ensure everything to close 582 self.demons = [] 583 self.nb_done =0 584 if 'nb_core' in opt: 585 self.nb_core = opt['nb_core'] 586 elif isinstance(args[0],int): 587 self.nb_core = args[0] 588 else: 589 self.nb_core = 1 590 self.update_fct = None 591 592 self.lock = threading.Event() # allow nice lock of the main thread 593 self.pids = Queue.Queue() # allow to clean jobs submit via subprocess 594 self.done_pid = [] # list of job finisned 595 self.done_pid_queue = Queue.Queue() 596 self.fail_msg = None 597 598 # starting the worker node 599 for _ in range(self.nb_core): 600 self.start_demon()
601 602
603 - def start_demon(self):
604 import threading 605 t = threading.Thread(target=self.worker) 606 t.daemon = True 607 t.start() 608 self.demons.append(t)
609 610
611 - def worker(self):
612 import Queue 613 import thread 614 while not self.stoprequest.isSet(): 615 try: 616 args = self.queue.get() 617 tag, exe, arg, opt = args 618 try: 619 # check for executable case 620 if isinstance(exe,str): 621 if os.path.exists(exe) and not exe.startswith('/'): 622 exe = './' + exe 623 if isinstance(opt['stdout'],str): 624 opt['stdout'] = open(opt['stdout'],'w') 625 if opt['stderr'] == None: 626 opt['stderr'] = subprocess.STDOUT 627 proc = misc.Popen([exe] + arg, **opt) 628 pid = proc.pid 629 self.pids.put(pid) 630 proc.wait() 631 if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet(): 632 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \ 633 (' '.join([exe]+arg), proc.returncode) 634 logger.warning(fail_msg) 635 self.stoprequest.set() 636 self.remove(fail_msg) 637 # handle the case when this is a python function. Note that 638 # this use Thread so they are NO built-in parralelization this is 639 # going to work on a single core! (but this is fine for IO intensive 640 # function. for CPU intensive fct this will slow down the computation 641 else: 642 pid = tag 643 self.pids.put(pid) 644 # the function should return 0 if everything is fine 645 # the error message otherwise 646 returncode = exe(*arg, **opt) 647 if returncode != 0: 648 logger.warning("fct %s does not return 0. Stopping the code in a clean way. The error was:\n%s", exe, returncode) 649 self.stoprequest.set() 650 self.remove("fct %s does not return 0:\n %s" % (exe, returncode)) 651 except Exception,error: 652 self.fail_msg = sys.exc_info() 653 logger.warning(str(error)) 654 self.stoprequest.set() 655 self.remove(error) 656 657 if __debug__: 658 raise self.fail_msg[0], self.fail_msg[1],self.fail_msg[2] 659 660 self.queue.task_done() 661 self.done.put(tag) 662 self.done_pid_queue.put(pid) 663 #release the mother to print the status on the screen 664 try: 665 self.lock.set() 666 except thread.error: 667 continue 668 except Queue.Empty: 669 continue
670 671 672 673
674 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 675 log=None, required_output=[], nb_submit=0):
676 """submit a job on multicore machine""" 677 678 tag = (prog, tuple(argument), cwd, nb_submit) 679 if isinstance(prog, str): 680 681 opt = {'cwd': cwd, 682 'stdout':stdout, 683 'stderr': stderr} 684 self.queue.put((tag, prog, argument, opt)) 685 self.submitted.put(1) 686 return tag 687 else: 688 # python function 689 self.queue.put((tag, prog, argument, {})) 690 self.submitted.put(1) 691 return tag
692
693 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 694 stderr=None, log=None, **opts):
695 """launch one job and wait for it""" 696 if isinstance(stdout, str): 697 stdout = open(stdout, 'w') 698 if isinstance(stderr, str): 699 stdout = open(stderr, 'w') 700 return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)
701
702 - def remove(self, error=None):
703 """Ensure that all thread are killed""" 704 705 # ensure the worker to stop 706 self.stoprequest.set() 707 if error and not self.fail_msg: 708 self.fail_msg = error 709 710 # cleaning the queue done_pid_queue and move them to done_pid 711 while not self.done_pid_queue.empty(): 712 pid = self.done_pid_queue.get() 713 self.done_pid.append(pid) 714 # self.done_pid_queue.task_done() 715 716 while not self.pids.empty(): 717 pid = self.pids.get() 718 self.pids.task_done() 719 if isinstance(pid, tuple): 720 continue 721 if pid in self.done_pid: 722 continue 723 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \ 724 % {'pid':pid} ) 725 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} )
726 727
728 - def wait(self, me_dir, update_status, update_first=None):
729 """Waiting that all the jobs are done. This function also control that 730 the submission by packet are handle correctly (i.e. submit the function)""" 731 732 import Queue 733 import threading 734 735 try: # to catch KeyBoardInterupt to see which kind of error to display 736 last_status = (0, 0, 0) 737 sleep_time = 1 738 use_lock = True 739 first = True 740 while True: 741 force_one_more_loop = False # some security 742 743 # Loop over the job tagged as done to check if some packet of jobs 744 # are finished in case, put the associate function in the queue 745 while self.done.qsize(): 746 try: 747 tag = self.done.get(True, 1) 748 except Queue.Empty: 749 pass 750 else: 751 if self.id_to_packet and tuple(tag) in self.id_to_packet: 752 packet = self.id_to_packet[tuple(tag)] 753 remaining = packet.remove_one() 754 if remaining == 0: 755 # fully ensure that the packet is finished (thread safe) 756 packet.queue.join() 757 self.submit(packet.fct, packet.args) 758 force_one_more_loop = True 759 self.nb_done += 1 760 self.done.task_done() 761 762 # Get from the various queue the Idle/Done/Running information 763 # Those variable should be thread safe but approximate. 764 Idle = self.queue.qsize() 765 Done = self.nb_done + self.done.qsize() 766 Running = max(0, self.submitted.qsize() - Idle - Done) 767 768 if Idle + Running <= 0 and not force_one_more_loop: 769 update_status(Idle, Running, Done) 770 # Going the quit since everything is done 771 # Fully Ensure that everything is indeed done. 772 self.queue.join() 773 break 774 775 if (Idle, Running, Done) != last_status: 776 if first and update_first: 777 update_first(Idle, Running, Done) 778 first = False 779 else: 780 update_status(Idle, Running, Done) 781 last_status = (Idle, Running, Done) 782 783 # cleaning the queue done_pid_queue and move them to done_pid 784 while not self.done_pid_queue.empty(): 785 pid = self.done_pid_queue.get() 786 self.done_pid.append(pid) 787 self.done_pid_queue.task_done() 788 789 790 # Define how to wait for the next iteration 791 if use_lock: 792 # simply wait that a worker release the lock 793 use_lock = self.lock.wait(300) 794 self.lock.clear() 795 if not use_lock and Idle > 0: 796 use_lock = True 797 else: 798 # to be sure that we will never fully lock at the end pass to 799 # a simple time.sleep() 800 time.sleep(sleep_time) 801 sleep_time = min(sleep_time + 2, 180) 802 if update_first: 803 update_first(Idle, Running, Done) 804 805 if self.stoprequest.isSet(): 806 if isinstance(self.fail_msg, Exception): 807 raise self.fail_msg 808 elif isinstance(self.fail_msg, str): 809 raise Exception, self.fail_msg 810 else: 811 misc.sprint(self.fail_msg) 812 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2] 813 # reset variable for next submission 814 try: 815 self.lock.clear() 816 except Exception: 817 pass 818 self.done = Queue.Queue() 819 self.done_pid = [] 820 self.done_pid_queue = Queue.Queue() 821 self.nb_done = 0 822 self.submitted = Queue.Queue() 823 self.pids = Queue.Queue() 824 self.stoprequest.clear() 825 826 except KeyboardInterrupt: 827 # if one of the node fails -> return that error 828 if isinstance(self.fail_msg, Exception): 829 raise self.fail_msg 830 elif isinstance(self.fail_msg, str): 831 raise Exception, self.fail_msg 832 elif self.fail_msg: 833 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2] 834 # else return orignal error 835 raise
836
837 -class CondorCluster(Cluster):
838 """Basic class for dealing with cluster submission""" 839 840 name = 'condor' 841 job_id = 'CONDOR_ID' 842 843 844 845 @multiple_try()
846 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 847 required_output=[], nb_submit=0):
848 """Submit a job prog to a Condor cluster""" 849 850 text = """Executable = %(prog)s 851 output = %(stdout)s 852 error = %(stderr)s 853 log = %(log)s 854 %(argument)s 855 environment = CONDOR_ID=$(Cluster).$(Process) 856 Universe = vanilla 857 notification = Error 858 Initialdir = %(cwd)s 859 %(requirement)s 860 getenv=True 861 queue 1 862 """ 863 864 if self.cluster_queue not in ['None', None]: 865 requirement = 'Requirements = %s=?=True' % self.cluster_queue 866 else: 867 requirement = '' 868 869 if cwd is None: 870 cwd = os.getcwd() 871 if stdout is None: 872 stdout = '/dev/null' 873 if stderr is None: 874 stderr = '/dev/null' 875 if log is None: 876 log = '/dev/null' 877 if not os.path.exists(prog): 878 prog = os.path.join(cwd, prog) 879 if argument: 880 argument = 'Arguments = %s' % ' '.join(argument) 881 else: 882 argument = '' 883 884 885 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 886 'stderr': stderr,'log': log,'argument': argument, 887 'requirement': requirement} 888 889 #open('submit_condor','w').write(text % dico) 890 a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE, 891 stdin=subprocess.PIPE) 892 output, _ = a.communicate(text % dico) 893 #output = a.stdout.read() 894 #Submitting job(s). 895 #Logging submit event(s). 896 #1 job(s) submitted to cluster 2253622. 897 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 898 try: 899 id = pat.search(output).groups()[0] 900 except: 901 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 902 % output 903 self.submitted += 1 904 self.submitted_ids.append(id) 905 return id
906 907 @store_input() 908 @multiple_try()
909 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 910 log=None, input_files=[], output_files=[], required_output=[], 911 nb_submit=0):
912 """Submit the job on the cluster NO SHARE DISK 913 input/output file should be give relative to cwd 914 """ 915 916 if not required_output and output_files: 917 required_output = output_files 918 919 if (input_files == [] == output_files): 920 return self.submit(prog, argument, cwd, stdout, stderr, log, 921 required_output=required_output, nb_submit=nb_submit) 922 923 text = """Executable = %(prog)s 924 output = %(stdout)s 925 error = %(stderr)s 926 log = %(log)s 927 %(argument)s 928 should_transfer_files = YES 929 when_to_transfer_output = ON_EXIT 930 transfer_input_files = %(input_files)s 931 %(output_files)s 932 Universe = vanilla 933 notification = Error 934 Initialdir = %(cwd)s 935 %(requirement)s 936 getenv=True 937 queue 1 938 """ 939 940 if self.cluster_queue not in ['None', None]: 941 requirement = 'Requirements = %s=?=True' % self.cluster_queue 942 else: 943 requirement = '' 944 945 if cwd is None: 946 cwd = os.getcwd() 947 if stdout is None: 948 stdout = '/dev/null' 949 if stderr is None: 950 stderr = '/dev/null' 951 if log is None: 952 log = '/dev/null' 953 if not os.path.exists(prog): 954 prog = os.path.join(cwd, prog) 955 if argument: 956 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument]) 957 else: 958 argument = '' 959 # input/output file treatment 960 if input_files: 961 input_files = ','.join(input_files) 962 else: 963 input_files = '' 964 if output_files: 965 output_files = 'transfer_output_files = %s' % ','.join(output_files) 966 else: 967 output_files = '' 968 969 970 971 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 972 'stderr': stderr,'log': log,'argument': argument, 973 'requirement': requirement, 'input_files':input_files, 974 'output_files':output_files} 975 976 #open('submit_condor','w').write(text % dico) 977 a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE, 978 stdin=subprocess.PIPE) 979 output, _ = a.communicate(text % dico) 980 #output = a.stdout.read() 981 #Submitting job(s). 982 #Logging submit event(s). 983 #1 job(s) submitted to cluster 2253622. 984 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 985 try: 986 id = pat.search(output).groups()[0] 987 except: 988 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 989 % output 990 self.submitted += 1 991 self.submitted_ids.append(id) 992 return id
993 994 995 996 997 998 @multiple_try(nb_try=10, sleep=10)
999 - def control_one_job(self, id):
1000 """ control the status of a single job with it's cluster id """ 1001 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 1002 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1003 stderr=subprocess.PIPE) 1004 1005 error = status.stderr.read() 1006 if status.returncode or error: 1007 raise ClusterManagmentError, 'condor_q returns error: %s' % error 1008 1009 return status.stdout.readline().strip()
1010 1011 @check_interupt() 1012 @multiple_try(nb_try=10, sleep=10)
1013 - def control(self, me_dir):
1014 """ control the status of a single job with it's cluster id """ 1015 1016 if not self.submitted_ids: 1017 return 0, 0, 0, 0 1018 1019 packet = 15000 1020 idle, run, fail = 0, 0, 0 1021 ongoing = [] 1022 for i in range(1+(len(self.submitted_ids)-1)//packet): 1023 start = i * packet 1024 stop = (i+1) * packet 1025 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \ 1026 " -format \'%-2s\ ' \'ClusterId\' " + \ 1027 " -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 1028 1029 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1030 stderr=subprocess.PIPE) 1031 error = status.stderr.read() 1032 if status.returncode or error: 1033 raise ClusterManagmentError, 'condor_q returns error: %s' % error 1034 1035 for line in status.stdout: 1036 id, status = line.strip().split() 1037 ongoing.append(int(id)) 1038 if status in ['I','U']: 1039 idle += 1 1040 elif status == 'R': 1041 run += 1 1042 elif status != 'C': 1043 fail += 1 1044 1045 for id in list(self.submitted_ids): 1046 if int(id) not in ongoing: 1047 status = self.check_termination(id) 1048 if status == 'wait': 1049 run += 1 1050 elif status == 'resubmit': 1051 idle += 1 1052 1053 return idle, run, self.submitted - (idle+run+fail), fail
1054 1055 @multiple_try()
1056 - def remove(self, *args, **opts):
1057 """Clean the jobson the cluster""" 1058 1059 if not self.submitted_ids: 1060 return 1061 cmd = "condor_rm %s" % ' '.join(self.submitted_ids) 1062 1063 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1064 self.submitted_ids = []
1065
1066 -class PBSCluster(Cluster):
1067 """Basic class for dealing with cluster submission""" 1068 1069 name = 'pbs' 1070 job_id = 'PBS_JOBID' 1071 idle_tag = ['Q'] 1072 running_tag = ['T','E','R'] 1073 complete_tag = ['C'] 1074 1075 maximum_submited_jobs = 2500 1076 1077 @multiple_try()
1078 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1079 required_output=[], nb_submit=0):
1080 """Submit a job prog to a PBS cluster""" 1081 1082 me_dir = self.get_jobs_identifier(cwd, prog) 1083 1084 if len(self.submitted_ids) > self.maximum_submited_jobs: 1085 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish)) 1086 self.wait(me_dir, fct, self.maximum_submited_jobs) 1087 1088 1089 text = "" 1090 if cwd is None: 1091 cwd = os.getcwd() 1092 else: 1093 text = " cd %s;" % cwd 1094 if stdout is None: 1095 stdout = '/dev/null' 1096 if stderr is None: 1097 stderr = '/dev/null' 1098 elif stderr == -2: # -2 is subprocess.STDOUT 1099 stderr = stdout 1100 if log is None: 1101 log = '/dev/null' 1102 1103 if not os.path.isabs(prog): 1104 text += "./%s" % prog 1105 else: 1106 text+= prog 1107 1108 if argument: 1109 text += ' ' + ' '.join(argument) 1110 1111 command = ['qsub','-o', stdout, 1112 '-N', me_dir, 1113 '-e', stderr, 1114 '-V'] 1115 1116 if self.cluster_queue and self.cluster_queue != 'None': 1117 command.extend(['-q', self.cluster_queue]) 1118 1119 a = misc.Popen(command, stdout=subprocess.PIPE, 1120 stderr=subprocess.STDOUT, 1121 stdin=subprocess.PIPE, cwd=cwd) 1122 1123 output = a.communicate(text)[0] 1124 id = output.split('.')[0] 1125 if not id.isdigit() or a.returncode !=0: 1126 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1127 % output 1128 1129 self.submitted += 1 1130 self.submitted_ids.append(id) 1131 return id
1132 1133 @multiple_try()
1134 - def control_one_job(self, id):
1135 """ control the status of a single job with it's cluster id """ 1136 cmd = 'qstat '+str(id) 1137 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1138 stderr=subprocess.STDOUT) 1139 1140 for line in status.stdout: 1141 line = line.strip() 1142 if 'cannot connect to server' in line or 'cannot read reply' in line: 1143 raise ClusterManagmentError, 'server disconnected' 1144 if 'Unknown' in line: 1145 return 'F' 1146 elif line.startswith(str(id)): 1147 jobstatus = line.split()[4] 1148 else: 1149 jobstatus="" 1150 1151 if status.returncode != 0 and status.returncode is not None: 1152 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1153 if jobstatus in self.idle_tag: 1154 return 'I' 1155 elif jobstatus in self.running_tag: 1156 return 'R' 1157 return 'F'
1158 1159 1160 @multiple_try()
1161 - def control(self, me_dir):
1162 """ control the status of a single job with it's cluster id """ 1163 cmd = "qstat" 1164 status = misc.Popen([cmd], stdout=subprocess.PIPE) 1165 1166 me_dir = self.get_jobs_identifier(me_dir) 1167 1168 ongoing = [] 1169 1170 idle, run, fail = 0, 0, 0 1171 for line in status.stdout: 1172 if 'cannot connect to server' in line or 'cannot read reply' in line: 1173 raise ClusterManagmentError, 'server disconnected' 1174 if me_dir in line: 1175 ongoing.append(line.split()[0].split('.')[0]) 1176 status2 = line.split()[4] 1177 if status2 in self.idle_tag: 1178 idle += 1 1179 elif status2 in self.running_tag: 1180 run += 1 1181 elif status2 in self.complete_tag: 1182 if not self.check_termination(line.split()[0].split('.')[0]): 1183 idle += 1 1184 else: 1185 fail += 1 1186 1187 if status.returncode != 0 and status.returncode is not None: 1188 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1189 1190 for id in list(self.submitted_ids): 1191 if id not in ongoing: 1192 status2 = self.check_termination(id) 1193 if status2 == 'wait': 1194 run += 1 1195 elif status2 == 'resubmit': 1196 idle += 1 1197 1198 return idle, run, self.submitted - (idle+run+fail), fail
1199 1200 @multiple_try()
1201 - def remove(self, *args, **opts):
1202 """Clean the jobs on the cluster""" 1203 1204 if not self.submitted_ids: 1205 return 1206 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1207 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1208 self.submitted_ids = []
1209
1210 1211 -class SGECluster(Cluster):
1212 """Basic class for dealing with cluster submission""" 1213 # Class written by Arian Abrahantes. 1214 1215 name = 'sge' 1216 job_id = 'JOB_ID' 1217 idle_tag = ['qw', 'hqw','hRqw','w'] 1218 running_tag = ['r','t','Rr','Rt'] 1219 identifier_length = 10 1220
1221 - def def_get_path(self,location):
1222 """replace string for path issues""" 1223 location = os.path.realpath(location) 1224 homePath = os.getenv("HOME") 1225 if homePath: 1226 location = location.replace(homePath,'$HOME') 1227 return location
1228 1229 @multiple_try()
1230 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1231 required_output=[], nb_submit=0):
1232 """Submit a job prog to an SGE cluster""" 1233 1234 me_dir = self.get_jobs_identifier(cwd, prog) 1235 1236 1237 if cwd is None: 1238 #cwd = os.getcwd() 1239 cwd = self.def_get_path(os.getcwd()) 1240 cwd1 = self.def_get_path(cwd) 1241 text = " cd %s;" % cwd1 1242 if stdout is None: 1243 stdout = '/dev/null' 1244 else: 1245 stdout = self.def_get_path(stdout) 1246 if stderr is None: 1247 stderr = '/dev/null' 1248 elif stderr == -2: # -2 is subprocess.STDOUT 1249 stderr = stdout 1250 else: 1251 stderr = self.def_get_path(stderr) 1252 1253 if log is None: 1254 log = '/dev/null' 1255 else: 1256 log = self.def_get_path(log) 1257 1258 text += prog 1259 if argument: 1260 text += ' ' + ' '.join(argument) 1261 1262 #if anything slips through argument 1263 #print "!=== inteded change ",text.replace('/srv/nfs','') 1264 #text = text.replace('/srv/nfs','') 1265 homePath = os.getenv("HOME") 1266 if homePath: 1267 text = text.replace(homePath,'$HOME') 1268 1269 logger.debug("!=== input %s" % text) 1270 logger.debug("!=== output %s" % stdout) 1271 logger.debug("!=== error %s" % stderr) 1272 logger.debug("!=== logs %s" % log) 1273 1274 command = ['qsub','-o', stdout, 1275 '-N', me_dir, 1276 '-e', stderr, 1277 '-V'] 1278 1279 if self.cluster_queue and self.cluster_queue != 'None': 1280 command.extend(['-q', self.cluster_queue]) 1281 1282 a = misc.Popen(command, stdout=subprocess.PIPE, 1283 stderr=subprocess.STDOUT, 1284 stdin=subprocess.PIPE, cwd=cwd) 1285 1286 output = a.communicate(text)[0] 1287 id = output.split(' ')[2] 1288 if not id.isdigit(): 1289 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1290 % output 1291 self.submitted += 1 1292 self.submitted_ids.append(id) 1293 logger.debug(output) 1294 1295 return id
1296 1297 @multiple_try()
1298 - def control_one_job(self, id):
1299 """ control the status of a single job with it's cluster id """ 1300 #cmd = 'qstat '+str(id) 1301 cmd = 'qstat ' 1302 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1303 for line in status.stdout: 1304 #print "!==",line 1305 #line = line.strip() 1306 #if 'Unknown' in line: 1307 # return 'F' 1308 #elif line.startswith(str(id)): 1309 # status = line.split()[4] 1310 if str(id) in line: 1311 status = line.split()[4] 1312 #print "!=status", status 1313 if status in self.idle_tag: 1314 return 'I' 1315 elif status in self.running_tag: 1316 return 'R' 1317 return 'F'
1318 1319 @multiple_try()
1320 - def control(self, me_dir):
1321 """ control the status of a single job with it's cluster id """ 1322 cmd = "qstat " 1323 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1324 1325 me_dir = self.get_jobs_identifier(me_dir) 1326 1327 finished = list(self.submitted_ids) 1328 1329 idle, run, fail = 0, 0, 0 1330 for line in status.stdout: 1331 if me_dir in line: 1332 id,_,_,_,status = line.split()[:5] 1333 if status in self.idle_tag: 1334 idle += 1 1335 finished.remove(id) 1336 elif status in self.running_tag: 1337 run += 1 1338 finished.remove(id) 1339 else: 1340 logger.debug(line) 1341 fail += 1 1342 finished.remove(id) 1343 1344 for id in finished: 1345 self.check_termination(id) 1346 1347 return idle, run, self.submitted - (idle+run+fail), fail
1348 1349 1350 1351 @multiple_try()
1352 - def remove(self, *args, **opts):
1353 """Clean the jobs on the cluster""" 1354 1355 if not self.submitted_ids: 1356 return 1357 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1358 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1359 self.submitted_ids = []
1360
1361 1362 -class LSFCluster(Cluster):
1363 """Basic class for dealing with cluster submission""" 1364 1365 name = 'lsf' 1366 job_id = 'LSB_JOBID' 1367 1368 @multiple_try()
1369 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1370 required_output=[], nb_submit=0):
1371 """Submit the job prog to an LSF cluster""" 1372 1373 1374 me_dir = self.get_jobs_identifier(cwd, prog) 1375 1376 text = "" 1377 command = ['bsub', '-C0', '-J', me_dir] 1378 if cwd is None: 1379 cwd = os.getcwd() 1380 else: 1381 text = " cd %s;" % cwd 1382 if stdout and isinstance(stdout, str): 1383 command.extend(['-o', stdout]) 1384 if stderr and isinstance(stdout, str): 1385 command.extend(['-e', stderr]) 1386 elif stderr == -2: # -2 is subprocess.STDOUT 1387 pass 1388 if log is None: 1389 log = '/dev/null' 1390 1391 text += prog 1392 if argument: 1393 text += ' ' + ' '.join(argument) 1394 1395 if self.cluster_queue and self.cluster_queue != 'None': 1396 command.extend(['-q', self.cluster_queue]) 1397 1398 a = misc.Popen(command, stdout=subprocess.PIPE, 1399 stderr=subprocess.STDOUT, 1400 stdin=subprocess.PIPE, cwd=cwd) 1401 1402 output = a.communicate(text)[0] 1403 #Job <nnnn> is submitted to default queue <normal>. 1404 try: 1405 id = output.split('>',1)[0].split('<')[1] 1406 except: 1407 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1408 % output 1409 if not id.isdigit(): 1410 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1411 % output 1412 self.submitted += 1 1413 self.submitted_ids.append(id) 1414 return id
1415 1416 1417 @multiple_try()
1418 - def control_one_job(self, id):
1419 """ control the status of a single job with it's cluster id """ 1420 1421 cmd = 'bjobs '+str(id) 1422 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1423 1424 for line in status.stdout: 1425 line = line.strip().upper() 1426 if 'JOBID' in line: 1427 continue 1428 elif str(id) not in line: 1429 continue 1430 status = line.split()[2] 1431 if status == 'RUN': 1432 return 'R' 1433 elif status == 'PEND': 1434 return 'I' 1435 elif status == 'DONE': 1436 return 'F' 1437 else: 1438 return 'H' 1439 return 'F'
1440 1441 @multiple_try()
1442 - def control(self, me_dir):
1443 """ control the status of a single job with it's cluster id """ 1444 1445 if not self.submitted_ids: 1446 return 0, 0, 0, 0 1447 1448 cmd = "bjobs " + ' '.join(self.submitted_ids) 1449 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1450 1451 jobstatus = {} 1452 for line in status.stdout: 1453 line = line.strip() 1454 if 'JOBID' in line: 1455 continue 1456 splitline = line.split() 1457 id = splitline[0] 1458 if id not in self.submitted_ids: 1459 continue 1460 jobstatus[id] = splitline[2] 1461 1462 idle, run, fail = 0, 0, 0 1463 for id in self.submitted_ids[:]: 1464 if id in jobstatus: 1465 status = jobstatus[id] 1466 else: 1467 status = 'MISSING' 1468 if status == 'RUN': 1469 run += 1 1470 elif status == 'PEND': 1471 idle += 1 1472 else: 1473 status = self.check_termination(id) 1474 if status == 'wait': 1475 run += 1 1476 elif status == 'resubmit': 1477 idle += 1 1478 1479 return idle, run, self.submitted - (idle+run+fail), fail
1480 1481 @multiple_try()
1482 - def remove(self, *args,**opts):
1483 """Clean the jobs on the cluster""" 1484 1485 if not self.submitted_ids: 1486 return 1487 cmd = "bkill %s" % ' '.join(self.submitted_ids) 1488 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1489 self.submitted_ids = []
1490
1491 -class GECluster(Cluster):
1492 """Class for dealing with cluster submission on a GE cluster""" 1493 1494 name = 'ge' 1495 job_id = 'JOB_ID' 1496 idle_tag = ['qw'] 1497 running_tag = ['r'] 1498 1499 @multiple_try()
1500 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1501 required_output=[], nb_submit=0):
1502 """Submit a job prog to a GE cluster""" 1503 1504 text = "" 1505 if cwd is None: 1506 cwd = os.getcwd() 1507 else: 1508 text = " cd %s; bash " % cwd 1509 if stdout is None: 1510 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1]) 1511 if stderr is None: 1512 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1]) 1513 elif stderr == -2: # -2 is subprocess.STDOUT 1514 stderr = stdout 1515 if log is None: 1516 log = '/dev/null' 1517 1518 text += prog 1519 if argument: 1520 text += ' ' + ' '.join(argument) 1521 text += '\n' 1522 tmp_submit = os.path.join(cwd, 'tmp_submit') 1523 open(tmp_submit,'w').write(text) 1524 1525 a = misc.Popen(['qsub','-o', stdout, 1526 '-e', stderr, 1527 tmp_submit], 1528 stdout=subprocess.PIPE, 1529 stderr=subprocess.STDOUT, 1530 stdin=subprocess.PIPE, cwd=cwd) 1531 1532 output = a.communicate()[0] 1533 #Your job 874511 ("test.sh") has been submitted 1534 pat = re.compile("Your job (\d*) \(",re.MULTILINE) 1535 try: 1536 id = pat.search(output).groups()[0] 1537 except: 1538 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1539 % output 1540 self.submitted += 1 1541 self.submitted_ids.append(id) 1542 return id
1543 1544 @multiple_try()
1545 - def control_one_job(self, id):
1546 """ control the status of a single job with it's cluster id """ 1547 cmd = 'qstat | grep '+str(id) 1548 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1549 if not status: 1550 return 'F' 1551 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1552 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s") 1553 stat = '' 1554 for line in status.stdout.read().split('\n'): 1555 if not line: 1556 continue 1557 line = line.strip() 1558 try: 1559 groups = pat.search(line).groups() 1560 except: 1561 raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line 1562 if groups[0] != id: continue 1563 stat = groups[1] 1564 if not stat: 1565 return 'F' 1566 if stat in self.idle_tag: 1567 return 'I' 1568 if stat in self.running_tag: 1569 return 'R'
1570 1571 @multiple_try()
1572 - def control(self, me_dir=None):
1573 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 1574 if not self.submitted_ids: 1575 return 0, 0, 0, 0 1576 idle, run, fail = 0, 0, 0 1577 ongoing = [] 1578 for statusflag in ['p', 'r', 'sh']: 1579 cmd = 'qstat -s %s' % statusflag 1580 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1581 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1582 pat = re.compile("^(\d+)") 1583 for line in status.stdout.read().split('\n'): 1584 line = line.strip() 1585 try: 1586 id = pat.search(line).groups()[0] 1587 except Exception: 1588 pass 1589 else: 1590 if id not in self.submitted_ids: 1591 continue 1592 ongoing.append(id) 1593 if statusflag == 'p': 1594 idle += 1 1595 if statusflag == 'r': 1596 run += 1 1597 if statusflag == 'sh': 1598 fail += 1 1599 for id in list(self.submitted_ids): 1600 if id not in ongoing: 1601 self.check_termination(id) 1602 #self.submitted_ids = ongoing 1603 1604 return idle, run, self.submitted - idle - run - fail, fail
1605 1606 @multiple_try()
1607 - def remove(self, *args, **opts):
1608 """Clean the jobs on the cluster""" 1609 1610 if not self.submitted_ids: 1611 return 1612 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1613 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1614 self.submitted_ids = []
1615
1616 -def asyncrone_launch(exe, cwd=None, stdout=None, argument = [], **opt):
1617 """start a computation and not wait for it to finish. 1618 this fonction returns a lock which is locked as long as the job is 1619 running.""" 1620 1621 mc = MultiCore(1) 1622 mc.submit(exe, argument, cwd, stdout, **opt) 1623 mc.need_waiting = True 1624 return mc.lock
1625
1626 1627 -class SLURMCluster(Cluster):
1628 """Basic class for dealing with cluster submission""" 1629 1630 name = 'slurm' 1631 job_id = 'SLURM_JOBID' 1632 idle_tag = ['Q','PD','S','CF'] 1633 running_tag = ['R', 'CG'] 1634 complete_tag = ['C'] 1635 identifier_length = 8 1636 1637 @multiple_try()
1638 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1639 required_output=[], nb_submit=0):
1640 """Submit a job prog to a SLURM cluster""" 1641 1642 me_dir = self.get_jobs_identifier(cwd, prog) 1643 1644 1645 if cwd is None: 1646 cwd = os.getcwd() 1647 if stdout is None: 1648 stdout = '/dev/null' 1649 if stderr is None: 1650 stderr = '/dev/null' 1651 elif stderr == -2: # -2 is subprocess.STDOUT 1652 stderr = stdout 1653 if log is None: 1654 log = '/dev/null' 1655 1656 command = ['sbatch', '-o', stdout, 1657 '-J', me_dir, 1658 '-e', stderr, prog] + argument 1659 1660 if self.cluster_queue and self.cluster_queue != 'None': 1661 command.insert(1, '-p') 1662 command.insert(2, self.cluster_queue) 1663 1664 a = misc.Popen(command, stdout=subprocess.PIPE, 1665 stderr=subprocess.STDOUT, 1666 stdin=subprocess.PIPE, cwd=cwd) 1667 1668 output = a.communicate() 1669 output_arr = output[0].split(' ') 1670 id = output_arr[3].rstrip() 1671 1672 if not id.isdigit(): 1673 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1674 % (output[0] + '\n' + output[1]) 1675 1676 self.submitted += 1 1677 self.submitted_ids.append(id) 1678 return id
1679 1680 @multiple_try()
1681 - def control_one_job(self, id):
1682 """ control the status of a single job with it's cluster id """ 1683 cmd = 'squeue j'+str(id) 1684 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1685 stderr=open(os.devnull,'w')) 1686 1687 for line in status.stdout: 1688 line = line.strip() 1689 if 'Invalid' in line: 1690 return 'F' 1691 elif line.startswith(str(id)): 1692 status = line.split()[4] 1693 if status in self.idle_tag: 1694 return 'I' 1695 elif status in self.running_tag: 1696 return 'R' 1697 return 'F'
1698 1699 @multiple_try()
1700 - def control(self, me_dir):
1701 """ control the status of a single job with it's cluster id """ 1702 cmd = "squeue" 1703 pstatus = misc.Popen([cmd], stdout=subprocess.PIPE) 1704 1705 me_dir = self.get_jobs_identifier(me_dir) 1706 1707 idle, run, fail = 0, 0, 0 1708 ongoing=[] 1709 for line in pstatus.stdout: 1710 if me_dir in line: 1711 id, _, _,_ , status,_ = line.split(None,5) 1712 ongoing.append(id) 1713 if status in self.idle_tag: 1714 idle += 1 1715 elif status in self.running_tag: 1716 run += 1 1717 elif status in self.complete_tag: 1718 status = self.check_termination(id) 1719 if status == 'wait': 1720 run += 1 1721 elif status == 'resubmit': 1722 idle += 1 1723 else: 1724 fail += 1 1725 1726 #control other finished job 1727 for id in list(self.submitted_ids): 1728 if id not in ongoing: 1729 status = self.check_termination(id) 1730 if status == 'wait': 1731 run += 1 1732 elif status == 'resubmit': 1733 idle += 1 1734 1735 1736 return idle, run, self.submitted - (idle+run+fail), fail
1737 1738 @multiple_try()
1739 - def remove(self, *args, **opts):
1740 """Clean the jobs on the cluster""" 1741 1742 if not self.submitted_ids: 1743 return 1744 cmd = "scancel %s" % ' '.join(self.submitted_ids) 1745 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1746 self.submitted_ids = []
1747
1748 -class HTCaaSCluster(Cluster):
1749 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """ 1750 1751 name= 'htcaas' 1752 job_id = 'HTCAAS_JOBID' 1753 idle_tag = ['waiting'] 1754 running_tag = ['preparing','running'] 1755 complete_tag = ['done'] 1756 1757 @store_input() 1758 @multiple_try()
1759 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1760 log=None, input_files=[], output_files=[], required_output=[], 1761 nb_submit=0):
1762 """Submit the HTCaaS job on the cluster with NO SHARE DISK 1763 input/output file should be given as relative to CWd 1764 """ 1765 # To make workspace name(temp) 1766 cur_usr = os.getenv('USER') 1767 1768 if cwd is None: 1769 cwd = os.getcwd() 1770 1771 cwd_cp = cwd.rsplit("/",2) 1772 1773 if not stdout is None: 1774 print "stdout: %s" % stdout 1775 1776 if not os.path.exists(prog): 1777 prog = os.path.join(cwd, prog) 1778 1779 if not required_output and output_files: 1780 required_output = output_files 1781 1782 logger.debug(prog) 1783 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog : 1784 cwd_arg = cwd+"/arguments" 1785 temp = ' '.join([str(a) for a in argument]) 1786 arg_cmd="echo '"+temp+"' > " + cwd_arg 1787 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)] 1788 if argument : 1789 command.extend(['-a ', '='.join([str(a) for a in argument])]) 1790 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1791 id = a.stdout.read().strip() 1792 1793 else: 1794 cwd_arg = cwd+"/arguments" 1795 temp = ' '.join([str(a) for a in argument]) 1796 temp_file_name = "sub." + os.path.basename(prog) 1797 text = """#!/bin/bash 1798 MYPWD=%(cwd)s 1799 cd $MYPWD 1800 input_files=(%(input_files)s ) 1801 for i in ${input_files[@]} 1802 do 1803 chmod -f +x $i 1804 done 1805 /bin/bash %(prog)s %(arguments)s > %(stdout)s 1806 """ 1807 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog, 1808 'arguments': ' '.join([str(a) for a in argument]), 1809 'program': ' ' if '.py' in prog else 'bash'} 1810 1811 # writing a new script for the submission 1812 new_prog = pjoin(cwd, temp_file_name) 1813 open(new_prog, 'w').write(text % dico) 1814 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 1815 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name] 1816 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1817 id = a.stdout.read().strip() 1818 logger.debug(id) 1819 1820 nb_try=0 1821 nb_limit=5 1822 if not id.isdigit() : 1823 print "[ID is not digit]:" + id 1824 1825 while not id.isdigit() : 1826 nb_try+=1 1827 print "[fail_retry]:"+ nb_try 1828 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1829 id = a.stdout.read().strip() 1830 if nb_try > nb_limit : 1831 raise ClusterManagementError, 'fail to submit to the HTCaaS cluster: \n %s' % id 1832 break 1833 1834 self.submitted += 1 1835 self.submitted_ids.append(id) 1836 1837 return id
1838 1839 @multiple_try(nb_try=10, sleep=5)
1840 - def control_one_job(self, id):
1841 """ control the status of a single job with it's cluster id """ 1842 1843 if id == 0 : 1844 status_out ='C' 1845 else : 1846 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status " 1847 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE, 1848 stderr=subprocess.PIPE) 1849 error = status.stderr.read() 1850 if status.returncode or error: 1851 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error 1852 status_out= status.stdout.read().strip() 1853 status_out= status_out.split(":",1)[1] 1854 if status_out == 'waiting': 1855 status_out='I' 1856 elif status_out == 'preparing' or status_out == 'running': 1857 status_out = 'R' 1858 elif status_out != 'done': 1859 status_out = 'F' 1860 elif status_out == 'done': 1861 status_out = 'C' 1862 1863 return status_out
1864 1865 @multiple_try()
1866 - def control(self, me_dir):
1867 """ control the status of a single job with it's cluster id """ 1868 if not self.submitted_ids: 1869 logger.debug("self.submitted_ids not exists") 1870 return 0, 0, 0, 0 1871 1872 ongoing = [] 1873 idle, run, fail = 0, 0, 0 1874 1875 start = self.submitted_ids[0] 1876 end = self.submitted_ids[-1] 1877 1878 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end)#+" -ac" 1879 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1880 1881 for line in status.stdout: 1882 #ongoing.append(line.split()[0].strip()) 1883 status2 = line.split()[-1] 1884 if status2 is not 'null' or line.split()[0].strip() is not '0': 1885 ongoing.append(line.split()[0].strip()) 1886 logger.debug("["+line.split()[0].strip()+"]"+status2) 1887 if status2 is 'null' or line.split()[0].strip() is '0': 1888 idle += 1 1889 elif status2 in self.idle_tag: 1890 idle += 1 1891 elif status2 in self.running_tag: 1892 run += 1 1893 elif status2 in self.complete_tag: 1894 if not self.check_termination(line.split()[0]): 1895 idle +=1 1896 else: 1897 fail += 1 1898 1899 return idle, run, self.submitted - (idle+run+fail), fail
1900 1901 @multiple_try()
1902 - def remove(self, *args, **opts):
1903 """Clean the jobson the cluster""" 1904 1905 if not self.submitted_ids: 1906 return 1907 for i in range(len(self.submitted_ids)): 1908 cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i] 1909 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1910
1911 -class HTCaaS2Cluster(Cluster):
1912 """Class for dealing with cluster submission on a HTCaaS cluster without GPFS """ 1913 1914 name= 'htcaas2' 1915 job_id = 'HTCAAS2_JOBID' 1916 idle_tag = ['waiting'] 1917 running_tag = ['preparing','running'] 1918 complete_tag = ['done'] 1919 1920 @store_input() 1921 @multiple_try()
1922 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1923 log=None, input_files=[], output_files=[], required_output=[], 1924 nb_submit=0):
1925 1926 """Submit the HTCaaS job on the cluster with NO SHARE DISK 1927 input/output file should be given as relative to CWD 1928 """ 1929 if cwd is None: 1930 cwd = os.getcwd() 1931 1932 if not os.path.exists(prog): 1933 prog = os.path.join(cwd, prog) 1934 1935 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog : 1936 if cwd or prog : 1937 self.submitted_dirs.append(cwd) 1938 self.submitted_exes.append(prog) 1939 else: 1940 logger.debug("cwd and prog not exist->"+cwd+" / "+ os.path.basename(prog)) 1941 1942 if argument : 1943 self.submitted_args.append('='.join([str(a) for a in argument])) 1944 1945 if cwd or prog : 1946 self.submitted += 1 1947 id = self.submitted 1948 self.submitted_ids.append(id) 1949 else: 1950 logger.debug("cwd and prog are not exist! ") 1951 id = 0 1952 1953 else: 1954 temp_file_name = "sub."+ os.path.basename(prog) 1955 text = """#!/bin/bash 1956 MYPWD=%(cwd)s 1957 cd $MYPWD 1958 input_files=(%(input_files)s ) 1959 for i in ${input_files[@]} 1960 do 1961 chmod -f +x $i 1962 done 1963 /bin/bash %(prog)s %(arguments)s > %(stdout)s 1964 """ 1965 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog, 1966 'arguments': ' '.join([str(a) for a in argument]), 1967 'program': ' ' if '.py' in prog else 'bash'} 1968 # writing a new script for the submission 1969 new_prog = pjoin(cwd, temp_file_name) 1970 open(new_prog, 'w').write(text % dico) 1971 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 1972 command = ['htcaas-mgjob-submit','-d',cwd,'-e',new_prog] 1973 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1974 id = a.stdout.read().strip() 1975 logger.debug("[mode2]-["+str(id)+"]") 1976 if cwd and prog : 1977 self.submitted += 1 1978 self.submitted_ids.append(id) 1979 else: 1980 logger.debug("cwd and prog are not exist! ") 1981 id = 0 1982 1983 return id
1984 1985 @multiple_try()
1986 - def metasubmit(self, me_dir=None):
1987 if self.submitted > 1100 and self.submitted == len(self.submitted_ids): 1988 tmp_leng= len(self.submitted_ids)/2 1989 tmp_dirs1= self.submitted_dirs[0:tmp_leng] 1990 tmp_dirs2= self.submitted_dirs[tmp_leng:] 1991 tmp_exes1= self.submitted_exes[0:tmp_leng] 1992 tmp_exes2= self.submitted_exes[tmp_leng:] 1993 command1 = ['htcaas-mgjob-submit','-d',":".join([str(a) for a in tmp_dirs1 if a and a is not ' ']), 1994 '-e', ":".join([str(a) for a in tmp_exes1 if a and a is not ' '])] 1995 command2 = ['htcaas-mgjob-submit','-d',":".join([str(a) for a in tmp_dirs2 if a and a is not ' ']), 1996 '-e', ":".join([str(a) for a in tmp_exes2 if a and a is not ' '])] 1997 if len(self.submitted_args) > 0 : 1998 tmp_args1= self.submitted_args[0:tmp_leng] 1999 tmp_args2= self.submitted_args[tmp_leng:] 2000 command1.extend(['-a', ':'.join([str(a) for a in tmp_args1])]) 2001 command2.extend(['-a', ':'.join([str(a) for a in tmp_args2])]) 2002 result1 = misc.Popen(command1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) 2003 result2 = misc.Popen(command2, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) 2004 me_dir = str(result1.stdout.read().strip())+ "//" + str(result2.stdout.read().strip()) 2005 2006 elif self.submitted > 0 and self.submitted == self.submitted_ids[-1]: 2007 command = ['htcaas-mgjob-submit','-d',":".join([str(a) for a in self.submitted_dirs if a and a is not ' ']), 2008 '-e', ":".join([str(a) for a in self.submitted_exes if a and a is not ' '])] 2009 if len(self.submitted_args) > 0 : 2010 command.extend(['-a', ':'.join([str(a) for a in self.submitted_args])]) 2011 if self.submitted_dirs[0] or self.submitted_exes[0] : 2012 result = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) 2013 me_dir = result.stdout.read().strip() 2014 self.submitted_ids[0]=me_dir 2015 else: 2016 me_dir = self.submitted_ids[-1] 2017 elif self.submitted > 0 and self.submitted != self.submitted_ids[-1]: 2018 me_dir = self.submitted_ids[0] 2019 else: 2020 me_dir = -1 2021 2022 logger.debug("[" + str(me_dir) + "]") 2023 2024 self.submitted_dirs = [] 2025 self.submitted_exes = [] 2026 self.submitted_args = [] 2027 2028 return me_dir
2029 2030 2031 @multiple_try(nb_try=10, sleep=5)
2032 - def control_one_job(self, id):
2033 """ control the status of a single job with it's cluster id """ 2034 #logger.debug("CONTROL ONE JOB MODE") 2035 if self.submitted == self.submitted_ids[-1] : 2036 id = self.metasubmit(self) 2037 tempid = self.submitted_ids[-1] 2038 self.submitted_ids.remove(self.submitted_ids[-1]) 2039 self.submitted_ids.append(id) 2040 logger.debug(str(id)+" // "+str(self.submitted_ids[-1])) 2041 2042 if id == 0 : 2043 status_out ='C' 2044 else: 2045 cmd = 'htcaas-job-status -m '+ str(id) + " -s | grep Status " 2046 status = misc.Popen([cmd],shell=True,stdout=subprocess.PIPE, 2047 stderr=subprocess.PIPE) 2048 error = status.stderr.read() 2049 if status.returncode or error: 2050 raise ClusterManagmentError, 'htcaas-job-status returns error: %s' % error 2051 status_out= status.stdout.read().strip() 2052 status_out= status_out.split(":",1)[1] 2053 logger.debug("[["+str(id)+"]]"+status_out) 2054 if status_out == 'waiting': 2055 status_out='I' 2056 elif status_out == 'preparing' or status_out == 'running': 2057 status_out = 'R' 2058 elif status_out != 'done': 2059 status_out = 'F' 2060 elif status_out == 'done': 2061 status_out = 'C' 2062 self.submitted -= 1 2063 2064 return status_out
2065 2066 @multiple_try()
2067 - def control(self, me_dir):
2068 """ control the status of a single job with it's cluster id """ 2069 if not self.submitted_ids: 2070 logger.debug("self.submitted_ids not exists") 2071 return 0, 0, 0, 0 2072 2073 if "//" in me_dir : 2074 if int(me_dir.split("//")[0]) < int(me_dir.split("//")[1]) : 2075 start = me_dir.split("//")[0] 2076 end = me_dir.split("//")[1] 2077 else : 2078 start = me_dir.split("//")[1] 2079 end = me_dir.split("//")[0] 2080 elif "/" in me_dir : # update 2081 start = 0 2082 end = 0 2083 elif me_dir.isdigit(): 2084 start = me_dir 2085 end = me_dir 2086 elif not me_dir.isdigit(): 2087 me_dir = self.submitted_ids[0] 2088 logger.debug("Meta_ID is not digit(control), self.submitted_ids[0]: "+str(me_dir) ) 2089 2090 ongoing = [] 2091 idle, run, fail, done = 0, 0, 0, 0 2092 2093 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end) +" -ac" 2094 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 2095 2096 for line in status.stdout: 2097 status2 = line.split()[-1] 2098 if status2 is not 'null' or line.split()[0].strip() is not '0': 2099 ongoing.append(str(line.split()[0].strip())+"-"+str(line.split()[1].strip())) 2100 logger.debug("["+line.split()[0].strip()+"-"+line.split()[1].strip()+"]"+status2) 2101 2102 if status2 is 'null' or line.split()[0].strip() is '0': 2103 idle += 1 2104 elif status2 in self.idle_tag: 2105 idle += 1 2106 elif status2 in self.running_tag: 2107 run += 1 2108 elif status2 in self.complete_tag: 2109 done += 1 2110 self.submitted -= 1 2111 if not self.check_termination(line.split()[1]): 2112 idle +=1 2113 else: 2114 fail += 1 2115 2116 return idle, run, self.submitted - (idle+run+fail), fail
2117 2118 @multiple_try()
2119 - def remove(self, *args, **opts):
2120 """Clean the jobson the cluster""" 2121 2122 if not self.submitted_ids: 2123 return 2124 id = self.submitted_ids[0] 2125 if id is not 0 : 2126 cmd = "htcaas-job-cancel -m %s" % str(id) 2127 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
2128 2129 from_name = {'condor':CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster, 2130 'lsf': LSFCluster, 'ge':GECluster, 'slurm': SLURMCluster, 2131 'htcaas':HTCaaSCluster, 'htcaas2':HTCaaS2Cluster} 2132 2133 onecore=MultiCore(1) # create a thread to run simple bash job without having to 2134 #fork the main process 2135