
Source Code for Module madgraph.various.cluster

   1  ################################################################################ 
   2  # Copyright (c) 2009 The MadGraph5_aMC@NLO Development team and Contributors              
   3  # 
   4  # This file is a part of the MadGraph5_aMC@NLO project, an application which            
   5  # automatically generates Feynman diagrams and matrix elements for arbitrary     
   6  # high-energy processes in the Standard Model and beyond.                        
   7  # 
   8  # It is subject to the MadGraph5_aMC@NLO license which should accompany this              
   9  # distribution.                                                                  
  10  #                                                                                
  11  # For more information, visit madgraph.phys.ucl.ac.be and amcatnlo.web.cern.ch             
  12  #                                                                                
  13  ################################################################################ 
  14  from __future__ import absolute_import 
  15  from __future__ import print_function 
  16  import subprocess 
  17  import logging 
  18  import os 
  19  import time 
  20  import re 
  21  import glob 
  22  import inspect 
  23  import sys 
  24  import six 
  25  from six.moves import range 
  26  from six.moves import input 
  27   
  28  logger = logging.getLogger('madgraph.cluster')  
  29   
  30  try: 
  31      from madgraph import MadGraph5Error 
  32      import madgraph.various.misc as misc 
  33  except Exception as error: 
  34      if __debug__: 
  35          print(str(error)) 
  36      from internal import MadGraph5Error 
  37      import internal.misc as misc 
  38   
  39  pjoin = os.path.join 
  40   
  41  class ClusterManagmentError(MadGraph5Error): 
  42      pass 
  43   
  44  class NotImplemented(MadGraph5Error): 
  45      pass 
  46   
  47   
  48  multiple_try = misc.multiple_try 
  49  pjoin = os.path.join 
  50   
  51   
  52  def check_interupt(error=KeyboardInterrupt): 
  53   
  54      def deco_interupt(f): 
  55          def deco_f_interupt(self, *args, **opt): 
  56              try: 
  57                  return f(self, *args, **opt) 
  58              except error: 
  59                  try: 
  60                      self.remove(*args, **opt) 
  61                  except Exception: 
  62                      pass 
  63                  raise error 
  64          return deco_f_interupt 
  65      return deco_interupt 
  66   
  67  def store_input(arg=''): 
  68   
  69      def deco_store(f): 
  70          def deco_f_store(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 
  71                           input_files=[], output_files=[], required_output=[], nb_submit=0): 
  72              frame = inspect.currentframe() 
  73              args, _, _, values = inspect.getargvalues(frame) 
  74              args = dict([(i, values[i]) for i in args if i != 'self']) 
  75              id = f(self, **args) 
  76              if self.nb_retry > 0: 
  77                  self.retry_args[id] = args 
  78              return id 
  79          return deco_f_store 
  80      return deco_store 
  81   
  82  def need_transfer(options): 
  83      """This function checks whether compression of input files is necessary 
  84      given the running options.""" 
  85   
  86      if options['run_mode'] != 1 and options['cluster_temp_path'] is None: 
  87          return False 
  88      else: 
  89          return True 
  90   
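
A minimal sketch of the logic above, using hypothetical options dictionaries (only the 'run_mode' and 'cluster_temp_path' keys are read by need_transfer):

    # Hypothetical option sets; all other keys are irrelevant here.
    multicore_opts = {'run_mode': 2, 'cluster_temp_path': None}
    cluster_opts = {'run_mode': 1, 'cluster_temp_path': '/scratch'}

    assert need_transfer(multicore_opts) is False   # local/shared disk, nothing to ship
    assert need_transfer(cluster_opts) is True      # files must be shipped to the node
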
  91  class Cluster(object): 
  92      """Basic Class for all cluster type submission""" 
  93      name = 'mother class' 
  94      identifier_length = 14 
  95   
96 - def __init__(self,*args, **opts):
97 """Init the cluster""" 98 99 self.submitted = 0 100 self.submitted_ids = [] 101 self.finish = 0 102 self.submitted_dirs = [] #HTCaaS 103 self.submitted_exes = [] #HTCaaS 104 self.submitted_args = [] #HTCaaS 105 106 if 'cluster_queue' in opts: 107 self.cluster_queue = opts['cluster_queue'] 108 else: 109 self.cluster_queue = 'madgraph' 110 if 'cluster_temp_path' in opts: 111 self.temp_dir = opts['cluster_temp_path'] 112 else: 113 self.temp_dir = None 114 self.options = {'cluster_status_update': (600, 30)} 115 for key,value in opts.items(): 116 self.options[key] = value 117 self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0 118 self.cluster_retry_wait = float(opts['cluster_retry_wait']) if 'cluster_retry_wait' in opts else 300 119 self.options = dict(opts) 120 self.retry_args = {} 121 # controlling jobs in controlled type submision 122 self.packet = {} 123 self.id_to_packet = {}
124
 125      def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 
 126                 log=None, required_output=[], nb_submit=0): 
 127          """How to make one submission. Return status id on the cluster.""" 
 128          raise NotImplemented('No implementation of how to submit a job to cluster \'%s\'' % self.name) 
129 130 131 @store_input()
132 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 133 log=None, input_files=[], output_files=[], required_output=[], 134 nb_submit=0):
135 """How to make one submission. Return status id on the cluster. 136 NO SHARE DISK""" 137 138 if cwd is None: 139 cwd = os.getcwd() 140 if not os.path.exists(prog): 141 prog = os.path.join(cwd, prog) 142 143 if not required_output and output_files: 144 required_output = output_files 145 146 if not hasattr(self, 'temp_dir') or not self.temp_dir or \ 147 (input_files == [] == output_files): 148 149 return self.submit(prog, argument, cwd, stdout, stderr, log, 150 required_output=required_output, nb_submit=nb_submit) 151 152 if not input_files and not output_files: 153 # not input/output so not using submit2 154 return self.submit(prog, argument, cwd, stdout, stderr, log, 155 required_output=required_output, nb_submit=nb_submit) 156 157 if cwd is None: 158 cwd = os.getcwd() 159 if not os.path.exists(prog): 160 prog = os.path.join(cwd, prog) 161 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument) 162 163 text = """#!/bin/bash 164 MYTMP=%(tmpdir)s/run$%(job_id)s 165 MYPWD=%(cwd)s 166 mkdir -p $MYTMP 167 cd $MYPWD 168 input_files=( %(input_files)s ) 169 for i in ${input_files[@]} 170 do 171 cp -R -L $i $MYTMP 172 done 173 cd $MYTMP 174 echo '%(arguments)s' > arguments 175 chmod +x ./%(script)s 176 %(program)s ./%(script)s %(arguments)s 177 exit=$? 178 output_files=( %(output_files)s ) 179 for i in ${output_files[@]} 180 do 181 cp -r $MYTMP/$i $MYPWD 182 done 183 # if [ "$exit" -eq "0" ] 184 # then 185 rm -rf $MYTMP 186 # fi 187 """ 188 189 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog), 190 'cwd': cwd, 'job_id': self.job_id, 191 'input_files': ' '.join(input_files + [prog]), 192 'output_files': ' '.join(output_files), 193 'arguments': ' '.join([str(a) for a in argument]), 194 'program': ' ' if '.py' in prog else 'bash'} 195 196 # writing a new script for the submission 197 new_prog = pjoin(cwd, temp_file_name) 198 open(new_prog, 'w').write(text % dico) 199 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 200 201 return self.submit(new_prog, argument, cwd, stdout, stderr, log, 202 required_output=required_output, nb_submit=nb_submit)
203 204
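
The generated wrapper copies the input files to the node-local cluster_temp_path, runs the job there, and copies the declared output files back. A hedged usage sketch, assuming a hypothetical concrete subclass MyCluster (implementing submit/control_one_job) and hypothetical paths:

    # Sketch only: MyCluster, the script and the paths are assumptions.
    cl = MyCluster(cluster_temp_path='/scratch', cluster_queue='madgraph',
                   cluster_nb_retry=1)
    job_id = cl.submit2('run_job.sh',
                        argument=['42'],
                        cwd='/home/user/PROC/SubProcesses/P0_gg_ttx',
                        stdout='job.log',
                        input_files=['input.dat', 'param_card.dat'],
                        output_files=['results.dat'],
                        required_output=['results.dat'])
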
 205      def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 
 206                         log=None, input_files=[], output_files=[], required_output=[], 
 207                         nb_submit=0, packet_member=None): 
 208          """This function wraps the cluster submission in a cluster-independent 
 209          way. It should not be overwritten (except for DAG-type submission).""" 
 210   
 211          id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files, 
 212                            output_files, required_output, nb_submit) 
 213   
 214   
 215          if not packet_member: 
 216              return id 
 217          else: 
 218              if isinstance(packet_member, Packet): 
 219                  self.id_to_packet[id] = packet_member 
 220                  packet_member.put(id) 
 221                  if packet_member.tag not in self.packet: 
 222                      self.packet[packet_member.tag] = packet_member 
 223              else: 
 224                  if packet_member in self.packet: 
 225                      packet = self.packet[packet_member] 
 226                      packet.put(id) 
 227                      self.id_to_packet[id] = packet 
 228              return id 
 229   
 230      def control(self, me_dir=None): 
 231          """Check the status of the jobs associated to directory me_dir. return (idle, run, finish, fail)""" 
 232          if not self.submitted_ids: 
 233              raise NotImplemented('No implementation of how to control the job status to cluster \'%s\'' % self.name) 
 234          idle, run, fail = 0, 0, 0 
 235          for pid in self.submitted_ids[:]: 
 236              status = self.control_one_job(pid) 
 237              if status == 'I': 
 238                  idle += 1 
 239              elif status == 'R': 
 240                  run += 1 
 241              elif status == 'F': 
 242                  self.finish += 1 
 243                  self.submitted_ids.remove(pid) 
 244              else: 
 245                  fail += 1 
 246   
 247          return idle, run, self.finish, fail 
 248   
 249      def control_one_job(self, pid): 
 250          """ control the status of a single job with its cluster id """ 
 251          raise NotImplemented('No implementation of how to control the job status to cluster \'%s\'' % self.name) 
 252   
253 - def get_jobs_identifier(self, path, second_path=None):
254 """get a unique run_name for all the jobs helps to identify the runs 255 in the controller for some cluster.""" 256 257 if second_path: 258 path = os.path.realpath(pjoin(path, second_path)) 259 elif not os.path.exists(path): 260 return path # job already done 261 262 if 'SubProcesses' in path: 263 target = path.rsplit('/SubProcesses',1)[0] 264 elif 'MCatNLO' in path: 265 target = path.rsplit('/MCatNLO',1)[0] 266 elif 'PY8_parallelization' in path: 267 target = path.rsplit('/PY8_parallelization',1)[0] 268 elif second_path: 269 target=path 270 logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problem.") 271 else: 272 target = path 273 274 if target.endswith('/'): 275 target = target[:-1] 276 277 target = misc.digest(target)[-self.identifier_length:] 278 if not target[0].isalpha(): 279 target = 'a' + target[1:] 280 281 return target
282 283 284 @check_interupt()
285 - def wait(self, me_dir, fct, minimal_job=0, update_first=None):
286 """Wait that all job are finish. 287 if minimal_job set, then return if idle + run is lower than that number""" 288 289 290 mode = 1 # 0 is long waiting/ 1 is short waiting 291 nb_iter = 0 292 nb_short = 0 293 change_at = 5 # number of iteration from which we wait longer between update. 294 295 if update_first: 296 idle, run, finish, fail = self.control(me_dir) 297 update_first(idle, run, finish) 298 299 #usefull shortcut for readibility 300 longtime, shorttime = self.options['cluster_status_update'] 301 302 nb_job = 0 303 304 if self.options['cluster_type'] == 'htcaas2': 305 me_dir = self.metasubmit(self) 306 307 while 1: 308 old_mode = mode 309 nb_iter += 1 310 idle, run, finish, fail = self.control(me_dir) 311 if nb_job: 312 if idle + run + finish + fail != nb_job: 313 nb_job = idle + run + finish + fail 314 nb_iter = 1 # since some packet finish prevent to pass in long waiting mode 315 else: 316 nb_job = idle + run + finish + fail 317 if fail: 318 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team') 319 if idle + run == 0: 320 #time.sleep(20) #security to ensure that the file are really written on the disk 321 logger.info('All jobs finished') 322 fct(idle, run, finish) 323 break 324 if idle + run < minimal_job: 325 return 326 fct(idle, run, finish) 327 #Determine how much we have to wait (mode=0->long time, mode=1->short time) 328 if nb_iter < change_at: 329 mode = 1 330 elif idle < run: 331 if old_mode == 0: 332 if nb_short: 333 mode = 0 #we already be back from short to long so stay in long 334 #check if we need to go back to short mode 335 elif idle: 336 if nb_iter > change_at + int(longtime)//shorttime: 337 mode = 0 #stay in long waiting mode 338 else: 339 mode = 1 # pass in short waiting mode 340 nb_short =0 341 else: 342 mode = 1 # pass in short waiting mode 343 nb_short = 0 344 elif old_mode == 1: 345 nb_short +=1 346 if nb_short > 3* max(change_at, int(longtime)//shorttime): 347 mode = 0 #go back in slow waiting 348 else: 349 mode = 0 350 351 #if pass from fast(mode=1) to slow(mode=0) make a print statement: 352 if old_mode > mode: 353 logger.info('''Start to wait %ss between checking status. 354 Note that you can change this time in the configuration file. 355 Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0]) 356 357 #now Waiting! 358 if mode == 0: 359 try: 360 time.sleep(self.options['cluster_status_update'][0]) 361 except KeyboardInterrupt: 362 logger.info('start to update the status') 363 nb_iter = min(0, change_at -2) 364 nb_short = 0 365 else: 366 time.sleep(self.options['cluster_status_update'][1]) 367 368 369 self.submitted = 0 370 self.submitted_ids = []
371
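
wait() repeatedly calls control() and passes (idle, run, finish) to the supplied callback, switching between the short and long polling intervals of 'cluster_status_update'. A hedged sketch, assuming cl is a concrete subclass instance created with the full MG5aMC run options (including 'cluster_type' and 'cluster_status_update') and that jobs have already been submitted:

    # Sketch only: cl and me_dir are assumptions.
    def report(idle, run, finish):
        logger.info('idle: %s  running: %s  finished: %s' % (idle, run, finish))

    cl.wait(me_dir, report, update_first=report)
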
372 - def check_termination(self, job_id):
373 """Check the termination of the jobs with job_id and relaunch it if needed.""" 374 375 376 if job_id not in self.retry_args: 377 if job_id in self.id_to_packet: 378 nb_in_packet = self.id_to_packet[job_id].remove_one() 379 if nb_in_packet == 0: 380 # packet done run the associate function 381 packet = self.id_to_packet[job_id] 382 # fully ensure that the packet is finished (thread safe) 383 packet.queue.join() 384 #running the function 385 packet.fct(*packet.args) 386 del self.id_to_packet[job_id] 387 return 'resubmit' 388 else: 389 return True 390 391 args = self.retry_args[job_id] 392 if 'time_check' in args: 393 time_check = args['time_check'] 394 else: 395 time_check = 0 396 397 for path in args['required_output']: 398 if args['cwd']: 399 path = pjoin(args['cwd'], path) 400 # check that file exists and is not empty. 401 if not (os.path.exists(path) and os.stat(path).st_size != 0) : 402 break 403 else: 404 # all requested output are present 405 if time_check > 0: 406 logger.info('Job %s Finally found the missing output.' % (job_id)) 407 del self.retry_args[job_id] 408 self.submitted_ids.remove(job_id) 409 # check if the job_id is in a packet 410 if job_id in self.id_to_packet: 411 nb_in_packet = self.id_to_packet[job_id].remove_one() 412 if nb_in_packet == 0: 413 # packet done run the associate function 414 packet = self.id_to_packet[job_id] 415 # fully ensure that the packet is finished (thread safe) 416 packet.queue.join() 417 #running the function 418 packet.fct(*packet.args) 419 del self.id_to_packet[job_id] 420 return 'resubmit' 421 422 return 'done' 423 424 if time_check == 0: 425 logger.debug('''Job %s: missing output:%s''' % (job_id,path)) 426 args['time_check'] = time.time() 427 return 'wait' 428 elif self.cluster_retry_wait > time.time() - time_check: 429 return 'wait' 430 431 #jobs failed to be completed even after waiting time!! 432 if self.nb_retry < 0: 433 logger.critical('''Fail to run correctly job %s. 434 with option: %s 435 file missing: %s''' % (job_id, args, path)) 436 input('press enter to continue.') 437 elif self.nb_retry == 0: 438 logger.critical('''Fail to run correctly job %s. 439 with option: %s 440 file missing: %s. 441 Stopping all runs.''' % (job_id, args, path)) 442 self.remove() 443 elif args['nb_submit'] >= self.nb_retry: 444 logger.critical('''Fail to run correctly job %s. 445 with option: %s 446 file missing: %s 447 Fails %s times 448 No resubmition. ''' % (job_id, args, path, args['nb_submit'])) 449 self.remove() 450 else: 451 args['nb_submit'] += 1 452 logger.warning('resubmit job (for the %s times)' % args['nb_submit']) 453 del self.retry_args[job_id] 454 self.submitted_ids.remove(job_id) 455 if 'time_check' in args: 456 del args['time_check'] 457 if job_id in self.id_to_packet: 458 self.id_to_packet[job_id].remove_one() 459 args['packet_member'] = self.id_to_packet[job_id] 460 del self.id_to_packet[job_id] 461 self.cluster_submit(**args) 462 else: 463 self.submit2(**args) 464 return 'resubmit' 465 return 'done'
466 467 @check_interupt()
468 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 469 stderr=None, log=None, required_output=[], nb_submit=0, 470 input_files=[], output_files=[]):
471 """launch one job on the cluster and wait for it""" 472 473 special_output = False # tag for concatenate the error with the output. 474 if stderr == -2 and stdout: 475 #We are suppose to send the output to stdout 476 special_output = True 477 stderr = stdout + '.err' 478 479 id = self.submit2(prog, argument, cwd, stdout, stderr, log, 480 required_output=required_output, input_files=input_files, 481 output_files=output_files) 482 483 if self.options['cluster_type']=='htcaas2': 484 if self.submitted == self.submitted_ids[-1]: 485 id = self.metasubmit(self) 486 487 frame = inspect.currentframe() 488 args, _, _, values = inspect.getargvalues(frame) 489 args = dict([(i, values[i]) for i in args if i != 'self']) 490 self.retry_args[id] = args 491 492 nb_wait=0 493 while 1: 494 nb_wait+=1 495 status = self.control_one_job(id) 496 if not status in ['R','I']: 497 status = self.check_termination(id) 498 if status in ['wait']: 499 time.sleep(30) 500 continue 501 elif status in ['resubmit']: 502 id = self.submitted_ids[0] 503 time.sleep(30) 504 continue 505 #really stop! 506 time.sleep(30) #security to ensure that the file are really written on the disk 507 break 508 time.sleep(self.options['cluster_status_update'][1]) 509 510 if required_output: 511 status = self.check_termination(id) 512 if status == 'wait': 513 run += 1 514 elif status == 'resubmit': 515 idle += 1 516 517 518 if special_output: 519 # combine the stdout and the stderr 520 #wait up to 50 s to see if those files exists 521 for i in range(5): 522 if os.path.exists(stdout): 523 if not os.path.exists(stderr): 524 time.sleep(5) 525 if os.path.exists(stderr): 526 err_text = open(stderr).read() 527 if not err_text: 528 return 529 logger.warning(err_text) 530 text = open(stdout).read() 531 open(stdout,'w').write(text + err_text) 532 else: 533 return 534 time.sleep(10)
535
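
launch_and_wait() submits a single job through submit2 and polls control_one_job until it finishes; passing stderr=-2 appends the error stream to the stdout file at the end. A hedged sketch with a hypothetical script, again assuming cl was built with the full run options:

    # Sketch only: the script and paths are assumptions.
    cl.launch_and_wait('combine_results.sh',
                       cwd='/home/user/PROC/SubProcesses',
                       stdout='combine.log',
                       stderr=-2,                      # merge stderr into combine.log
                       required_output=['results.dat'])
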
 536      def remove(self, *args, **opts): 
 537          """ """ 
 538          logger.warning("""This cluster does not support job removal, 
 539          the jobs are still running on the cluster.""") 
 540   
 541      @store_input() 
 542      def metasubmit(self, me_dir): 
 543          logger.warning("""This cluster does not support metajob submission.""") 
 544          return 0 
 545   
 546      def modify_interface(self, run_interface): 
 547          """Routine which allows one to modify the run_card/mg5cmd object to change the 
 548          default behavior of the runs. 
 549          This is called at the time of the compilation of the run_card. 
 550          Note that this function can be called multiple times per run. 
 551          """ 
 552          #run_card = run_interface.run_card 
 553          return 
 554   
 555  class Packet(object): 
 556      """An object for handling a packet of jobs; it is designed to be thread safe 
 557      """ 
 558   
 559      def __init__(self, name, fct, args, opts={}): 
 560          import six.moves.queue 
 561          import threading 
 562          self.queue = six.moves.queue.Queue() 
 563          self.tag = name 
 564          self.fct = fct 
 565          self.args = args 
 566          self.opts = opts 
 567          self.done = threading.Event() 
 568   
 569      def put(self, *args, **opts): 
 570          self.queue.put(*args, **opts) 
 571   
 572      append = put 
 573   
 574      def remove_one(self): 
 575          self.queue.get(True) 
 576          self.queue.task_done() 
 577          return self.queue.qsize() 
 578   
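
A Packet groups several cluster jobs and triggers a callback once the last of them has been removed from its queue; cluster_submit() does the bookkeeping via id_to_packet and check_termination(). A hedged usage sketch, assuming cl is a concrete Cluster subclass and that the script names, paths and callback below are hypothetical:

    # Sketch only: cl, combine_results and the paths are assumptions.
    def combine_results(channel_dir):
        logger.info('all jobs of %s are done' % channel_dir)

    pack = Packet('P0_gg_ttx', combine_results, ('P0_gg_ttx',))
    for i in range(4):
        cl.cluster_submit('ajob%i' % i, cwd='SubProcesses/P0_gg_ttx',
                          packet_member=pack)
    # check_termination() calls combine_results('P0_gg_ttx') once the
    # fourth job has finished.
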
 579  class MultiCore(Cluster): 
 580      """Class for dealing with submission on a multi-core machine""" 
 581   
 582      job_id = "$" 
 583   
584 - def __init__(self, *args, **opt):
585 """Init the cluster """ 586 587 588 super(MultiCore, self).__init__(self, *args, **opt) 589 590 import six.moves.queue 591 import threading 592 import six.moves._thread 593 self.queue = six.moves.queue.Queue() # list of job to do 594 self.done = six.moves.queue.Queue() # list of job finisned 595 self.submitted = six.moves.queue.Queue() # one entry by job submitted 596 self.stoprequest = threading.Event() #flag to ensure everything to close 597 self.demons = [] 598 self.nb_done =0 599 if 'nb_core' in opt: 600 self.nb_core = opt['nb_core'] 601 elif isinstance(args[0],int): 602 self.nb_core = args[0] 603 else: 604 self.nb_core = 1 605 self.update_fct = None 606 607 self.lock = threading.Event() # allow nice lock of the main thread 608 self.pids = six.moves.queue.Queue() # allow to clean jobs submit via subprocess 609 self.done_pid = [] # list of job finisned 610 self.done_pid_queue = six.moves.queue.Queue() 611 self.fail_msg = None 612 613 # starting the worker node 614 for _ in range(self.nb_core): 615 self.start_demon()
616 617
618 - def start_demon(self):
619 import threading 620 t = threading.Thread(target=self.worker) 621 t.daemon = True 622 t.start() 623 self.demons.append(t)
624 625
626 - def worker(self):
627 import six.moves.queue 628 import six.moves._thread 629 while not self.stoprequest.isSet(): 630 try: 631 args = self.queue.get() 632 tag, exe, arg, opt = args 633 try: 634 # check for executable case 635 if isinstance(exe,str): 636 if os.path.exists(exe) and not exe.startswith('/'): 637 exe = './' + exe 638 if isinstance(opt['stdout'],str): 639 opt['stdout'] = open(opt['stdout'],'w') 640 if opt['stderr'] == None: 641 opt['stderr'] = subprocess.STDOUT 642 if arg: 643 proc = misc.Popen([exe] + arg, **opt) 644 else: 645 proc = misc.Popen(exe, **opt) 646 pid = proc.pid 647 self.pids.put(pid) 648 proc.wait() 649 if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet(): 650 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \ 651 (' '.join([exe]+arg), proc.returncode) 652 logger.warning(fail_msg) 653 self.stoprequest.set() 654 self.remove(fail_msg) 655 # handle the case when this is a python function. Note that 656 # this use Thread so they are NO built-in parralelization this is 657 # going to work on a single core! (but this is fine for IO intensive 658 # function. for CPU intensive fct this will slow down the computation 659 else: 660 pid = tag 661 self.pids.put(pid) 662 # the function should return 0 if everything is fine 663 # the error message otherwise 664 returncode = exe(*arg, **opt) 665 if returncode != 0: 666 logger.warning("fct %s does not return 0. Stopping the code in a clean way. The error was:\n%s", exe, returncode) 667 self.stoprequest.set() 668 self.remove("fct %s does not return 0:\n %s" % (exe, returncode)) 669 except Exception as error: 670 self.fail_msg = sys.exc_info() 671 logger.warning(str(error)) 672 self.stoprequest.set() 673 self.remove(error) 674 675 if __debug__: 676 six.reraise(self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]) 677 678 self.queue.task_done() 679 self.done.put(tag) 680 self.done_pid_queue.put(pid) 681 #release the mother to print the status on the screen 682 try: 683 self.lock.set() 684 except six.moves._thread.error: 685 continue 686 except six.moves.queue.Empty: 687 continue
688 689 690 691
692 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 693 log=None, required_output=[], nb_submit=0):
694 """submit a job on multicore machine""" 695 696 tag = (prog, tuple(argument), cwd, nb_submit) 697 if isinstance(prog, str): 698 699 opt = {'cwd': cwd, 700 'stdout':stdout, 701 'stderr': stderr} 702 703 self.queue.put((tag, prog, argument, opt)) 704 self.submitted.put(1) 705 return tag 706 else: 707 # python function 708 self.queue.put((tag, prog, argument, {})) 709 self.submitted.put(1) 710 return tag
711
 712      def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 
 713                          stderr=None, log=None, **opts): 
 714          """launch one job and wait for it""" 
 715          if isinstance(stdout, str): 
 716              stdout = open(stdout, 'w') 
 717          if isinstance(stderr, str): 
 718              stderr = open(stderr, 'w') 
 719          return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd) 
720
721 - def remove(self, error=None):
722 """Ensure that all thread are killed""" 723 724 # ensure the worker to stop 725 self.stoprequest.set() 726 if error and not self.fail_msg: 727 self.fail_msg = error 728 729 # cleaning the queue done_pid_queue and move them to done_pid 730 while not self.done_pid_queue.empty(): 731 pid = self.done_pid_queue.get() 732 self.done_pid.append(pid) 733 # self.done_pid_queue.task_done() 734 735 while not self.pids.empty(): 736 pid = self.pids.get() 737 self.pids.task_done() 738 if isinstance(pid, tuple): 739 continue 740 if pid in self.done_pid: 741 continue 742 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \ 743 % {'pid':pid} ) 744 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} )
745 746
747 - def wait(self, me_dir, update_status, update_first=None):
748 """Waiting that all the jobs are done. This function also control that 749 the submission by packet are handle correctly (i.e. submit the function)""" 750 751 import six.moves.queue 752 import threading 753 754 try: # to catch KeyBoardInterupt to see which kind of error to display 755 last_status = (0, 0, 0) 756 sleep_time = 1 757 use_lock = True 758 first = True 759 while True: 760 force_one_more_loop = False # some security 761 762 # Loop over the job tagged as done to check if some packet of jobs 763 # are finished in case, put the associate function in the queue 764 while self.done.qsize(): 765 try: 766 tag = self.done.get(True, 1) 767 except six.moves.queue.Empty: 768 pass 769 else: 770 if self.id_to_packet and tuple(tag) in self.id_to_packet: 771 packet = self.id_to_packet[tuple(tag)] 772 remaining = packet.remove_one() 773 if remaining == 0: 774 # fully ensure that the packet is finished (thread safe) 775 packet.queue.join() 776 self.submit(packet.fct, packet.args) 777 force_one_more_loop = True 778 self.nb_done += 1 779 self.done.task_done() 780 781 # Get from the various queue the Idle/Done/Running information 782 # Those variable should be thread safe but approximate. 783 Idle = self.queue.qsize() 784 Done = self.nb_done + self.done.qsize() 785 Running = max(0, self.submitted.qsize() - Idle - Done) 786 787 if Idle + Running <= 0 and not force_one_more_loop: 788 update_status(Idle, Running, Done) 789 # Going the quit since everything is done 790 # Fully Ensure that everything is indeed done. 791 self.queue.join() 792 break 793 794 if (Idle, Running, Done) != last_status: 795 if first and update_first: 796 update_first(Idle, Running, Done) 797 first = False 798 else: 799 update_status(Idle, Running, Done) 800 last_status = (Idle, Running, Done) 801 802 # cleaning the queue done_pid_queue and move them to done_pid 803 while not self.done_pid_queue.empty(): 804 pid = self.done_pid_queue.get() 805 self.done_pid.append(pid) 806 self.done_pid_queue.task_done() 807 808 809 # Define how to wait for the next iteration 810 if use_lock: 811 # simply wait that a worker release the lock 812 use_lock = self.lock.wait(300) 813 self.lock.clear() 814 if not use_lock and Idle > 0: 815 use_lock = True 816 else: 817 # to be sure that we will never fully lock at the end pass to 818 # a simple time.sleep() 819 time.sleep(sleep_time) 820 sleep_time = min(sleep_time + 2, 180) 821 if update_first: 822 update_first(Idle, Running, Done) 823 824 if self.stoprequest.isSet(): 825 if isinstance(self.fail_msg, Exception): 826 raise self.fail_msg 827 elif isinstance(self.fail_msg, str): 828 raise Exception(self.fail_msg) 829 else: 830 misc.sprint(self.fail_msg) 831 six.reraise(self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]) 832 # reset variable for next submission 833 try: 834 self.lock.clear() 835 except Exception: 836 pass 837 self.done = six.moves.queue.Queue() 838 self.done_pid = [] 839 self.done_pid_queue = six.moves.queue.Queue() 840 self.nb_done = 0 841 self.submitted = six.moves.queue.Queue() 842 self.pids = six.moves.queue.Queue() 843 self.stoprequest.clear() 844 845 except KeyboardInterrupt: 846 # if one of the node fails -> return that error 847 if isinstance(self.fail_msg, Exception): 848 raise self.fail_msg 849 elif isinstance(self.fail_msg, str): 850 raise Exception(self.fail_msg) 851 elif self.fail_msg: 852 six.reraise(self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]) 853 # else return orignal error 854 raise
855
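
MultiCore mimics the cluster interface on a single machine: worker threads pop jobs from an internal queue, and both executables and Python callables can be submitted. A hedged usage sketch (the executable and directories below are hypothetical):

    # Sketch only: ./madevent and the G* directories are assumptions.
    mc = MultiCore(nb_core=4)
    for i in range(8):
        mc.submit('./madevent', argument=[str(i)],
                  cwd='SubProcesses/P0_gg_ttx/G%i' % i,
                  stdout='log_G%i.txt' % i)

    def show(idle, running, done):
        logger.info('idle: %s  running: %s  done: %s' % (idle, running, done))

    mc.wait(None, show)   # blocks until the queue is empty
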
 856  class CondorCluster(Cluster): 
 857      """Basic class for dealing with cluster submission""" 
 858   
 859      name = 'condor' 
 860      job_id = 'CONDOR_ID' 
 861   
 862   
 863   
 864      @multiple_try() 
865 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 866 required_output=[], nb_submit=0):
867 """Submit a job prog to a Condor cluster""" 868 869 text = """Executable = %(prog)s 870 output = %(stdout)s 871 error = %(stderr)s 872 log = %(log)s 873 %(argument)s 874 environment = CONDOR_ID=$(Cluster).$(Process) 875 Universe = vanilla 876 notification = Error 877 Initialdir = %(cwd)s 878 %(requirement)s 879 getenv=True 880 queue 1 881 """ 882 883 if self.cluster_queue not in ['None', None]: 884 requirement = 'Requirements = %s=?=True' % self.cluster_queue 885 else: 886 requirement = '' 887 888 if cwd is None: 889 cwd = os.getcwd() 890 if stdout is None: 891 stdout = '/dev/null' 892 if stderr is None: 893 stderr = '/dev/null' 894 if log is None: 895 log = '/dev/null' 896 if not os.path.exists(prog): 897 prog = os.path.join(cwd, prog) 898 if argument: 899 argument = 'Arguments = %s' % ' '.join(argument) 900 else: 901 argument = '' 902 903 904 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 905 'stderr': stderr,'log': log,'argument': argument, 906 'requirement': requirement} 907 908 #open('submit_condor','w').write(text % dico) 909 a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE, 910 stdin=subprocess.PIPE) 911 output, _ = a.communicate((text % dico).encode()) 912 #output = a.stdout.read() 913 #Submitting job(s). 914 #Logging submit event(s). 915 #1 job(s) submitted to cluster 2253622. 916 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 917 output = output.decode() 918 try: 919 id = pat.search(output).groups()[0] 920 except: 921 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 922 % output) 923 self.submitted += 1 924 self.submitted_ids.append(id) 925 return id
926 927 @store_input() 928 @multiple_try()
929 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 930 log=None, input_files=[], output_files=[], required_output=[], 931 nb_submit=0):
932 """Submit the job on the cluster NO SHARE DISK 933 input/output file should be give relative to cwd 934 """ 935 936 if not required_output and output_files: 937 required_output = output_files 938 939 if (input_files == [] == output_files): 940 return self.submit(prog, argument, cwd, stdout, stderr, log, 941 required_output=required_output, nb_submit=nb_submit) 942 943 text = """Executable = %(prog)s 944 output = %(stdout)s 945 error = %(stderr)s 946 log = %(log)s 947 %(argument)s 948 should_transfer_files = YES 949 when_to_transfer_output = ON_EXIT 950 transfer_input_files = %(input_files)s 951 %(output_files)s 952 Universe = vanilla 953 notification = Error 954 Initialdir = %(cwd)s 955 %(requirement)s 956 getenv=True 957 queue 1 958 """ 959 960 if self.cluster_queue not in ['None', None]: 961 requirement = 'Requirements = %s=?=True' % self.cluster_queue 962 else: 963 requirement = '' 964 965 if cwd is None: 966 cwd = os.getcwd() 967 if stdout is None: 968 stdout = '/dev/null' 969 if stderr is None: 970 stderr = '/dev/null' 971 if log is None: 972 log = '/dev/null' 973 if not os.path.exists(prog): 974 prog = os.path.join(cwd, prog) 975 if argument: 976 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument]) 977 else: 978 argument = '' 979 # input/output file treatment 980 if input_files: 981 input_files = ','.join(input_files) 982 else: 983 input_files = '' 984 if output_files: 985 output_files = 'transfer_output_files = %s' % ','.join(output_files) 986 else: 987 output_files = '' 988 989 990 991 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 992 'stderr': stderr,'log': log,'argument': argument, 993 'requirement': requirement, 'input_files':input_files, 994 'output_files':output_files} 995 996 #open('submit_condor','w').write(text % dico) 997 a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE, 998 stdin=subprocess.PIPE) 999 output, _ = a.communicate((text % dico).encode()) 1000 #output = a.stdout.read() 1001 #Submitting job(s). 1002 #Logging submit event(s). 1003 #1 job(s) submitted to cluster 2253622. 1004 output = output.decode() 1005 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 1006 try: 1007 id = pat.search(output).groups()[0] 1008 except: 1009 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 1010 % output) 1011 self.submitted += 1 1012 self.submitted_ids.append(id) 1013 return id
1014 1015 1016 1017 1018 1019 @multiple_try(nb_try=10, sleep=10)
1020 - def control_one_job(self, id):
1021 """ control the status of a single job with it's cluster id """ 1022 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 1023 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1024 stderr=subprocess.PIPE) 1025 1026 error = status.stderr.read().decode() 1027 if status.returncode or error: 1028 raise ClusterManagmentError('condor_q returns error: %s' % error) 1029 1030 return status.stdout.readline().decode().strip()
1031 1032 jobstatus = {'0':'U', '1':'I','2':'R','3':'X','4':'C','5':'H','6':'E'} 1033 @check_interupt() 1034 @multiple_try(nb_try=10, sleep=10)
1035 - def control(self, me_dir):
1036 """ control the status of a single job with it's cluster id """ 1037 1038 if not self.submitted_ids: 1039 return 0, 0, 0, 0 1040 1041 packet = 15000 1042 idle, run, fail = 0, 0, 0 1043 ongoing = [] 1044 for i in range(1+(len(self.submitted_ids)-1)//packet): 1045 start = i * packet 1046 stop = (i+1) * packet 1047 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \ 1048 " -format \"%d \" ClusterId " + \ 1049 " -format \"%d\\n\" JobStatus " 1050 1051 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1052 stderr=subprocess.PIPE) 1053 error = status.stderr.read().decode() 1054 if status.returncode or error: 1055 raise ClusterManagmentError('condor_q returns error: %s' % error) 1056 1057 for line in status.stdout: 1058 id, status = line.decode().strip().split() 1059 status = self.jobstatus[status] 1060 ongoing.append(id) 1061 if status in ['I','U']: 1062 idle += 1 1063 elif status == 'R': 1064 run += 1 1065 elif status != 'C': 1066 fail += 1 1067 1068 for id in list(self.submitted_ids): 1069 if id not in ongoing: 1070 status = self.check_termination(id) 1071 if status == 'wait': 1072 run += 1 1073 elif status == 'resubmit': 1074 idle += 1 1075 1076 return idle, run, self.submitted - (idle+run+fail), fail
1077 1078 @multiple_try()
1079      def remove(self, *args, **opts): 
1080          """Clean the jobs on the cluster""" 
1081   
1082          if not self.submitted_ids: 
1083              return 
1084          cmd = "condor_rm %s" % ' '.join(self.submitted_ids) 
1085   
1086          status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 
1087          self.submitted_ids = [] 
1088
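
The Condor back end pipes a generated submit description into condor_submit and parses the cluster id from its output. A hedged sketch, assuming a working condor_submit in PATH and hypothetical script/paths:

    # Sketch only: the script and directory are assumptions.
    cl = CondorCluster(cluster_queue=None, cluster_nb_retry=1,
                       cluster_retry_wait=300)
    job = cl.submit('ajob1', cwd='SubProcesses/P0_gg_ttx',
                    stdout='ajob1.out', log='ajob1.condor.log')
    print(cl.control_one_job(job))   # 'I', 'R', 'H', ... as reported by condor_q
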
1089 -class PBSCluster(Cluster):
1090 """Basic class for dealing with cluster submission""" 1091 1092 name = 'pbs' 1093 job_id = 'PBS_JOBID' 1094 idle_tag = ['Q'] 1095 running_tag = ['T','E','R'] 1096 complete_tag = ['C'] 1097 1098 maximum_submited_jobs = 2500 1099 1100 @multiple_try()
1101 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1102 required_output=[], nb_submit=0):
1103 """Submit a job prog to a PBS cluster""" 1104 1105 me_dir = self.get_jobs_identifier(cwd, prog) 1106 1107 if len(self.submitted_ids) > self.maximum_submited_jobs: 1108 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish)) 1109 self.wait(me_dir, fct, self.maximum_submited_jobs) 1110 1111 1112 text = "" 1113 if cwd is None: 1114 cwd = os.getcwd() 1115 else: 1116 text = " cd %s;" % cwd 1117 if stdout is None: 1118 stdout = '/dev/null' 1119 if stderr is None: 1120 stderr = '/dev/null' 1121 elif stderr == -2: # -2 is subprocess.STDOUT 1122 stderr = stdout 1123 if log is None: 1124 log = '/dev/null' 1125 1126 if not os.path.isabs(prog): 1127 text += "./%s" % prog 1128 else: 1129 text+= prog 1130 1131 if argument: 1132 text += ' ' + ' '.join(argument) 1133 1134 command = ['qsub','-o', stdout, 1135 '-N', me_dir, 1136 '-e', stderr, 1137 '-V'] 1138 1139 if self.cluster_queue and self.cluster_queue != 'None': 1140 command.extend(['-q', self.cluster_queue]) 1141 1142 a = misc.Popen(command, stdout=subprocess.PIPE, 1143 stderr=subprocess.STDOUT, 1144 stdin=subprocess.PIPE, cwd=cwd) 1145 1146 output = a.communicate(text.encode())[0].decode() 1147 id = output.split('.')[0] 1148 if not id.isdigit() or a.returncode !=0: 1149 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 1150 % output) 1151 1152 self.submitted += 1 1153 self.submitted_ids.append(id) 1154 return id
1155 1156 @multiple_try()
1157 - def control_one_job(self, id):
1158 """ control the status of a single job with it's cluster id """ 1159 cmd = 'qstat '+str(id) 1160 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1161 stderr=subprocess.STDOUT) 1162 1163 for line in status.stdout: 1164 line = line.decode().strip() 1165 if 'cannot connect to server' in line or 'cannot read reply' in line: 1166 raise ClusterManagmentError('server disconnected') 1167 if 'Unknown' in line: 1168 return 'F' 1169 elif line.startswith(str(id)): 1170 jobstatus = line.split()[4] 1171 else: 1172 jobstatus="" 1173 1174 if status.returncode != 0 and status.returncode is not None: 1175 raise ClusterManagmentError('server fails in someway (errorcode %s)' % status.returncode) 1176 if jobstatus in self.idle_tag: 1177 return 'I' 1178 elif jobstatus in self.running_tag: 1179 return 'R' 1180 return 'F'
1181 1182 1183 @multiple_try()
1184 - def control(self, me_dir):
1185 """ control the status of a single job with it's cluster id """ 1186 cmd = "qstat" 1187 status = misc.Popen([cmd], stdout=subprocess.PIPE) 1188 1189 me_dir = self.get_jobs_identifier(me_dir) 1190 1191 ongoing = [] 1192 1193 idle, run, fail = 0, 0, 0 1194 for line in status.stdout: 1195 line = line.decode() 1196 if 'cannot connect to server' in line or 'cannot read reply' in line: 1197 raise ClusterManagmentError('server disconnected') 1198 if me_dir in line: 1199 ongoing.append(line.split()[0].split('.')[0]) 1200 status2 = line.split()[4] 1201 if status2 in self.idle_tag: 1202 idle += 1 1203 elif status2 in self.running_tag: 1204 run += 1 1205 elif status2 in self.complete_tag: 1206 if not self.check_termination(line.split()[0].split('.')[0]): 1207 idle += 1 1208 else: 1209 fail += 1 1210 1211 if status.returncode != 0 and status.returncode is not None: 1212 raise ClusterManagmentError('server fails in someway (errorcode %s)' % status.returncode) 1213 1214 for id in list(self.submitted_ids): 1215 if id not in ongoing: 1216 status2 = self.check_termination(id) 1217 if status2 == 'wait': 1218 run += 1 1219 elif status2 == 'resubmit': 1220 idle += 1 1221 1222 return idle, run, self.submitted - (idle+run+fail), fail
1223 1224 @multiple_try()
1225 - def remove(self, *args, **opts):
1226 """Clean the jobs on the cluster""" 1227 1228 if not self.submitted_ids: 1229 return 1230 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1231 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1232 self.submitted_ids = []
1233
1234 1235 -class SGECluster(Cluster):
1236 """Basic class for dealing with cluster submission""" 1237 # Class written by Arian Abrahantes. 1238 1239 name = 'sge' 1240 job_id = 'JOB_ID' 1241 idle_tag = ['qw', 'hqw','hRqw','w'] 1242 running_tag = ['r','t','Rr','Rt'] 1243 identifier_length = 10 1244
1245 - def def_get_path(self,location):
1246 """replace string for path issues""" 1247 location = os.path.realpath(location) 1248 homePath = os.getenv("HOME") 1249 if homePath: 1250 location = location.replace(homePath,'$HOME') 1251 return location
1252 1253 @multiple_try()
1254 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1255 required_output=[], nb_submit=0):
1256 """Submit a job prog to an SGE cluster""" 1257 1258 me_dir = self.get_jobs_identifier(cwd, prog) 1259 1260 1261 if cwd is None: 1262 #cwd = os.getcwd() 1263 cwd = self.def_get_path(os.getcwd()) 1264 cwd1 = self.def_get_path(cwd) 1265 text = " cd %s;" % cwd1 1266 if stdout is None: 1267 stdout = '/dev/null' 1268 else: 1269 stdout = self.def_get_path(stdout) 1270 if stderr is None: 1271 stderr = '/dev/null' 1272 elif stderr == -2: # -2 is subprocess.STDOUT 1273 stderr = stdout 1274 else: 1275 stderr = self.def_get_path(stderr) 1276 1277 if log is None: 1278 log = '/dev/null' 1279 else: 1280 log = self.def_get_path(log) 1281 1282 text += prog 1283 if argument: 1284 text += ' ' + ' '.join(argument) 1285 1286 #if anything slips through argument 1287 #print "!=== inteded change ",text.replace('/srv/nfs','') 1288 #text = text.replace('/srv/nfs','') 1289 homePath = os.getenv("HOME") 1290 if homePath: 1291 text = text.replace(homePath,'$HOME') 1292 1293 logger.debug("!=== input %s" % text) 1294 logger.debug("!=== output %s" % stdout) 1295 logger.debug("!=== error %s" % stderr) 1296 logger.debug("!=== logs %s" % log) 1297 1298 command = ['qsub','-o', stdout, 1299 '-N', me_dir, 1300 '-e', stderr, 1301 '-V'] 1302 1303 if self.cluster_queue and self.cluster_queue != 'None': 1304 command.extend(['-q', self.cluster_queue]) 1305 1306 a = misc.Popen(command, stdout=subprocess.PIPE, 1307 stderr=subprocess.STDOUT, 1308 stdin=subprocess.PIPE, cwd=cwd) 1309 1310 output = a.communicate(text.encode())[0].decode() 1311 id = output.split(' ')[2] 1312 if not id.isdigit(): 1313 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 1314 % output) 1315 self.submitted += 1 1316 self.submitted_ids.append(id) 1317 logger.debug(output) 1318 1319 return id
1320 1321 @multiple_try()
1322 - def control_one_job(self, id):
1323 """ control the status of a single job with it's cluster id """ 1324 #cmd = 'qstat '+str(id) 1325 cmd = 'qstat ' 1326 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1327 for line in status.stdout: 1328 line = line.decode() 1329 #print "!==",line 1330 #line = line.strip() 1331 #if 'Unknown' in line: 1332 # return 'F' 1333 #elif line.startswith(str(id)): 1334 # status = line.split()[4] 1335 if str(id) in line: 1336 status = line.split()[4] 1337 #print "!=status", status 1338 if status in self.idle_tag: 1339 return 'I' 1340 elif status in self.running_tag: 1341 return 'R' 1342 return 'F'
1343 1344 @multiple_try()
1345 - def control(self, me_dir):
1346 """ control the status of a single job with it's cluster id """ 1347 cmd = "qstat " 1348 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1349 1350 me_dir = self.get_jobs_identifier(me_dir) 1351 1352 finished = list(self.submitted_ids) 1353 1354 idle, run, fail = 0, 0, 0 1355 for line in status.stdout: 1356 line = line.decode() 1357 if me_dir in line: 1358 id,_,_,_,status = line.split()[:5] 1359 if status in self.idle_tag: 1360 idle += 1 1361 finished.remove(id) 1362 elif status in self.running_tag: 1363 run += 1 1364 finished.remove(id) 1365 else: 1366 logger.debug(line) 1367 fail += 1 1368 finished.remove(id) 1369 1370 for id in finished: 1371 self.check_termination(id) 1372 1373 return idle, run, self.submitted - (idle+run+fail), fail
1374 1375 1376 1377 @multiple_try()
1378 - def remove(self, *args, **opts):
1379 """Clean the jobs on the cluster""" 1380 1381 if not self.submitted_ids: 1382 return 1383 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1384 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1385 self.submitted_ids = []
1386
1387 1388 -class LSFCluster(Cluster):
1389 """Basic class for dealing with cluster submission""" 1390 1391 name = 'lsf' 1392 job_id = 'LSB_JOBID' 1393 1394 @multiple_try()
1395 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1396 required_output=[], nb_submit=0):
1397 """Submit the job prog to an LSF cluster""" 1398 1399 1400 me_dir = self.get_jobs_identifier(cwd, prog) 1401 1402 text = "" 1403 command = ['bsub', '-C0', '-J', me_dir] 1404 if cwd is None: 1405 cwd = os.getcwd() 1406 else: 1407 text = " cd %s;" % cwd 1408 if stdout and isinstance(stdout, str): 1409 command.extend(['-o', stdout]) 1410 if stderr and isinstance(stdout, str): 1411 command.extend(['-e', stderr]) 1412 elif stderr == -2: # -2 is subprocess.STDOUT 1413 pass 1414 if log is None: 1415 log = '/dev/null' 1416 1417 text += prog 1418 if argument: 1419 text += ' ' + ' '.join(argument) 1420 1421 if self.cluster_queue and self.cluster_queue != 'None': 1422 command.extend(['-q', self.cluster_queue]) 1423 1424 a = misc.Popen(command, stdout=subprocess.PIPE, 1425 stderr=subprocess.STDOUT, 1426 stdin=subprocess.PIPE, cwd=cwd) 1427 1428 output = a.communicate(text.encode())[0].decode() 1429 #Job <nnnn> is submitted to default queue <normal>. 1430 try: 1431 id = output.split('>',1)[0].split('<')[1] 1432 except: 1433 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 1434 % output) 1435 if not id.isdigit(): 1436 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 1437 % output) 1438 self.submitted += 1 1439 self.submitted_ids.append(id) 1440 return id
1441 1442 1443 @multiple_try()
1444 - def control_one_job(self, id):
1445 """ control the status of a single job with it's cluster id """ 1446 1447 cmd = 'bjobs '+str(id) 1448 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1449 1450 for line in status.stdout: 1451 line = line.decode().strip().upper() 1452 if 'JOBID' in line: 1453 continue 1454 elif str(id) not in line: 1455 continue 1456 status = line.split()[2] 1457 if status == 'RUN': 1458 return 'R' 1459 elif status == 'PEND': 1460 return 'I' 1461 elif status == 'DONE': 1462 return 'F' 1463 else: 1464 return 'H' 1465 return 'F'
1466 1467 @multiple_try()
1468 - def control(self, me_dir):
1469 """ control the status of a single job with it's cluster id """ 1470 1471 if not self.submitted_ids: 1472 return 0, 0, 0, 0 1473 1474 cmd = "bjobs " + ' '.join(self.submitted_ids) 1475 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1476 1477 jobstatus = {} 1478 for line in status.stdout: 1479 line = line.decode().strip() 1480 if 'JOBID' in line: 1481 continue 1482 splitline = line.split() 1483 id = splitline[0] 1484 if id not in self.submitted_ids: 1485 continue 1486 jobstatus[id] = splitline[2] 1487 1488 idle, run, fail = 0, 0, 0 1489 for id in self.submitted_ids[:]: 1490 if id in jobstatus: 1491 status = jobstatus[id] 1492 else: 1493 status = 'MISSING' 1494 if status == 'RUN': 1495 run += 1 1496 elif status == 'PEND': 1497 idle += 1 1498 else: 1499 status = self.check_termination(id) 1500 if status == 'wait': 1501 run += 1 1502 elif status == 'resubmit': 1503 idle += 1 1504 1505 return idle, run, self.submitted - (idle+run+fail), fail
1506 1507 @multiple_try()
1508 - def remove(self, *args,**opts):
1509 """Clean the jobs on the cluster""" 1510 1511 if not self.submitted_ids: 1512 return 1513 cmd = "bkill %s" % ' '.join(self.submitted_ids) 1514 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1515 self.submitted_ids = []
1516
1517 -class GECluster(Cluster):
1518 """Class for dealing with cluster submission on a GE cluster""" 1519 1520 name = 'ge' 1521 job_id = 'JOB_ID' 1522 idle_tag = ['qw'] 1523 running_tag = ['r'] 1524 1525 @multiple_try()
1526 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1527 required_output=[], nb_submit=0):
1528 """Submit a job prog to a GE cluster""" 1529 1530 text = "" 1531 if cwd is None: 1532 cwd = os.getcwd() 1533 else: 1534 text = " cd %s; bash " % cwd 1535 if stdout is None: 1536 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1]) 1537 if stderr is None: 1538 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1]) 1539 elif stderr == -2: # -2 is subprocess.STDOUT 1540 stderr = stdout 1541 if log is None: 1542 log = '/dev/null' 1543 1544 text += prog 1545 if argument: 1546 text += ' ' + ' '.join(argument) 1547 text += '\n' 1548 tmp_submit = os.path.join(cwd, 'tmp_submit') 1549 open(tmp_submit,'w').write(text) 1550 1551 a = misc.Popen(['qsub','-o', stdout, 1552 '-e', stderr, 1553 tmp_submit], 1554 stdout=subprocess.PIPE, 1555 stderr=subprocess.STDOUT, 1556 stdin=subprocess.PIPE, cwd=cwd) 1557 1558 output = a.communicate()[0].decode() 1559 #Your job 874511 ("test.sh") has been submitted 1560 pat = re.compile("Your job (\d*) \(",re.MULTILINE) 1561 try: 1562 id = pat.search(output).groups()[0] 1563 except: 1564 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 1565 % output) 1566 self.submitted += 1 1567 self.submitted_ids.append(id) 1568 return id
1569 1570 @multiple_try()
1571 - def control_one_job(self, id):
1572 """ control the status of a single job with it's cluster id """ 1573 cmd = 'qstat | grep '+str(id) 1574 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1575 if not status: 1576 return 'F' 1577 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1578 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s") 1579 stat = '' 1580 for line in status.stdout.read().decode().split('\n'): 1581 if not line: 1582 continue 1583 line = line.strip() 1584 try: 1585 groups = pat.search(line).groups() 1586 except: 1587 raise ClusterManagmentError('bad syntax for stat: \n\"%s\"' % line) 1588 if groups[0] != id: continue 1589 stat = groups[1] 1590 if not stat: 1591 return 'F' 1592 if stat in self.idle_tag: 1593 return 'I' 1594 if stat in self.running_tag: 1595 return 'R'
1596 1597 @multiple_try()
1598 - def control(self, me_dir=None):
1599 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 1600 if not self.submitted_ids: 1601 return 0, 0, 0, 0 1602 idle, run, fail = 0, 0, 0 1603 ongoing = [] 1604 for statusflag in ['p', 'r', 'sh']: 1605 cmd = 'qstat -s %s' % statusflag 1606 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1607 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1608 pat = re.compile("^(\d+)") 1609 for line in status.stdout.read().decode().split('\n'): 1610 line = line.strip() 1611 try: 1612 id = pat.search(line).groups()[0] 1613 except Exception: 1614 pass 1615 else: 1616 if id not in self.submitted_ids: 1617 continue 1618 ongoing.append(id) 1619 if statusflag == 'p': 1620 idle += 1 1621 if statusflag == 'r': 1622 run += 1 1623 if statusflag == 'sh': 1624 fail += 1 1625 for id in list(self.submitted_ids): 1626 if id not in ongoing: 1627 self.check_termination(id) 1628 #self.submitted_ids = ongoing 1629 1630 return idle, run, self.submitted - idle - run - fail, fail
1631 1632 @multiple_try()
1633 - def remove(self, *args, **opts):
1634 """Clean the jobs on the cluster""" 1635 1636 if not self.submitted_ids: 1637 return 1638 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1639 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1640 self.submitted_ids = []
1641
1642  def asyncrone_launch(exe, cwd=None, stdout=None, argument=[], **opt): 
1643      """Start a computation without waiting for it to finish. 
1644      This function returns a lock which is locked as long as the job is 
1645      running.""" 
1646   
1647      mc = MultiCore(1) 
1648      mc.submit(exe, argument, cwd, stdout, **opt) 
1649      mc.need_waiting = True 
1650      return mc.lock 
1651   
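
The returned lock is the threading.Event of a one-core MultiCore instance, which the worker sets once the job has completed. A hedged usage sketch with a hypothetical executable:

    # Sketch only: ./generate_events is an assumption.
    lock = asyncrone_launch('./generate_events', cwd='PROC', stdout='gen.log')
    # ... do other work while the job runs in the background ...
    lock.wait()        # block until the background job has completed
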
1652 1653 -class SLURMCluster(Cluster):
1654 """Basic class for dealing with cluster submission""" 1655 1656 name = 'slurm' 1657 job_id = 'SLURM_JOBID' 1658 idle_tag = ['Q','PD','S','CF'] 1659 running_tag = ['R', 'CG'] 1660 complete_tag = ['C'] 1661 identifier_length = 8 1662 1663 @multiple_try()
1664 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1665 required_output=[], nb_submit=0):
1666 """Submit a job prog to a SLURM cluster""" 1667 1668 me_dir = self.get_jobs_identifier(cwd, prog) 1669 1670 1671 if cwd is None: 1672 cwd = os.getcwd() 1673 if stdout is None: 1674 stdout = '/dev/null' 1675 if stderr is None: 1676 stderr = '/dev/null' 1677 elif stderr == -2: # -2 is subprocess.STDOUT 1678 stderr = stdout 1679 if log is None: 1680 log = '/dev/null' 1681 1682 command = ['sbatch', '-o', stdout, 1683 '-J', me_dir, 1684 '-e', stderr, prog] + argument 1685 1686 if self.cluster_queue and self.cluster_queue != 'None': 1687 command.insert(1, '-p') 1688 command.insert(2, self.cluster_queue) 1689 1690 a = misc.Popen(command, stdout=subprocess.PIPE, 1691 stderr=subprocess.STDOUT, 1692 stdin=subprocess.PIPE, cwd=cwd) 1693 1694 output = a.communicate() 1695 output_arr = output[0].decode().split(' ') 1696 id = output_arr[3].rstrip() 1697 1698 if not id.isdigit(): 1699 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 1700 % (output[0] + '\n' + output[1])) 1701 1702 self.submitted += 1 1703 self.submitted_ids.append(id) 1704 return id
1705 1706 @multiple_try()
1707 - def control_one_job(self, id):
1708 """ control the status of a single job with it's cluster id """ 1709 cmd = 'squeue j'+str(id) 1710 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1711 stderr=open(os.devnull,'w')) 1712 1713 for line in status.stdout: 1714 line = line.decode().strip() 1715 if 'Invalid' in line: 1716 return 'F' 1717 elif line.startswith(str(id)): 1718 status = line.split()[4] 1719 if status in self.idle_tag: 1720 return 'I' 1721 elif status in self.running_tag: 1722 return 'R' 1723 return 'F'
1724 1725 @multiple_try()
1726 - def control(self, me_dir):
1727 """ control the status of a single job with it's cluster id """ 1728 cmd = "squeue" 1729 pstatus = misc.Popen([cmd], stdout=subprocess.PIPE) 1730 1731 me_dir = self.get_jobs_identifier(me_dir) 1732 1733 idle, run, fail = 0, 0, 0 1734 ongoing=[] 1735 for line in pstatus.stdout: 1736 line = line.decode() 1737 if me_dir in line: 1738 id, _, _,_ , status,_ = line.split(None,5) 1739 ongoing.append(id) 1740 if status in self.idle_tag: 1741 idle += 1 1742 elif status in self.running_tag: 1743 run += 1 1744 elif status in self.complete_tag: 1745 status = self.check_termination(id) 1746 if status == 'wait': 1747 run += 1 1748 elif status == 'resubmit': 1749 idle += 1 1750 else: 1751 fail += 1 1752 1753 #control other finished job 1754 for id in list(self.submitted_ids): 1755 if id not in ongoing: 1756 status = self.check_termination(id) 1757 if status == 'wait': 1758 run += 1 1759 elif status == 'resubmit': 1760 idle += 1 1761 1762 1763 return idle, run, self.submitted - (idle+run+fail), fail
1764 1765 @multiple_try()
1766 - def remove(self, *args, **opts):
1767 """Clean the jobs on the cluster""" 1768 1769 if not self.submitted_ids: 1770 return 1771 cmd = "scancel %s" % ' '.join(self.submitted_ids) 1772 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1773 self.submitted_ids = []
1774
1775 -class HTCaaSCluster(Cluster):
1776 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """ 1777 1778 name= 'htcaas' 1779 job_id = 'HTCAAS_JOBID' 1780 idle_tag = ['waiting'] 1781 running_tag = ['preparing','running'] 1782 complete_tag = ['done'] 1783 1784 @store_input() 1785 @multiple_try()
1786 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1787 log=None, input_files=[], output_files=[], required_output=[], 1788 nb_submit=0):
1789 """Submit the HTCaaS job on the cluster with NO SHARE DISK 1790 input/output file should be given as relative to CWd 1791 """ 1792 # To make workspace name(temp) 1793 cur_usr = os.getenv('USER') 1794 1795 if cwd is None: 1796 cwd = os.getcwd() 1797 1798 cwd_cp = cwd.rsplit("/",2) 1799 1800 if not stdout is None: 1801 print("stdout: %s" % stdout) 1802 1803 if not os.path.exists(prog): 1804 prog = os.path.join(cwd, prog) 1805 1806 if not required_output and output_files: 1807 required_output = output_files 1808 1809 logger.debug(prog) 1810 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog : 1811 cwd_arg = cwd+"/arguments" 1812 temp = ' '.join([str(a) for a in argument]) 1813 arg_cmd="echo '"+temp+"' > " + cwd_arg 1814 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)] 1815 if argument : 1816 command.extend(['-a ', '='.join([str(a) for a in argument])]) 1817 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1818 id = a.stdout.read().strip() 1819 1820 else: 1821 cwd_arg = cwd+"/arguments" 1822 temp = ' '.join([str(a) for a in argument]) 1823 temp_file_name = "sub." + os.path.basename(prog) 1824 text = """#!/bin/bash 1825 MYPWD=%(cwd)s 1826 cd $MYPWD 1827 input_files=(%(input_files)s ) 1828 for i in ${input_files[@]} 1829 do 1830 chmod -f +x $i 1831 done 1832 /bin/bash %(prog)s %(arguments)s > %(stdout)s 1833 """ 1834 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog, 1835 'arguments': ' '.join([str(a) for a in argument]), 1836 'program': ' ' if '.py' in prog else 'bash'} 1837 1838 # writing a new script for the submission 1839 new_prog = pjoin(cwd, temp_file_name) 1840 open(new_prog, 'w').write(text % dico) 1841 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 1842 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name] 1843 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1844 id = a.stdout.read().strip() 1845 logger.debug(id) 1846 1847 nb_try=0 1848 nb_limit=5 1849 if not id.isdigit() : 1850 print("[ID is not digit]:" + id) 1851 1852 while not id.isdigit() : 1853 nb_try+=1 1854 print("[fail_retry]:"+ nb_try) 1855 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1856 id = a.stdout.read().strip() 1857 if nb_try > nb_limit : 1858 raise ClusterManagementError('fail to submit to the HTCaaS cluster: \n %s' % id) 1859 break 1860 1861 self.submitted += 1 1862 self.submitted_ids.append(id) 1863 1864 return id
1865 1866 @multiple_try(nb_try=10, sleep=5)
1867 - def control_one_job(self, id):
1868 """ control the status of a single job with it's cluster id """ 1869 1870 if id == 0 : 1871 status_out ='C' 1872 else : 1873 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status " 1874 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE, 1875 stderr=subprocess.PIPE) 1876 error = status.stderr.read().decode() 1877 if status.returncode or error: 1878 raise ClusterManagmentError('htcaas-job-submit returns error: %s' % error) 1879 status_out= status.stdout.read().decode().strip() 1880 status_out= status_out.split(":",1)[1] 1881 if status_out == 'waiting': 1882 status_out='I' 1883 elif status_out == 'preparing' or status_out == 'running': 1884 status_out = 'R' 1885 elif status_out != 'done': 1886 status_out = 'F' 1887 elif status_out == 'done': 1888 status_out = 'C' 1889 1890 return status_out
1891 1892 @multiple_try()
1893 - def control(self, me_dir):
1894 """ control the status of a single job with it's cluster id """ 1895 if not self.submitted_ids: 1896 logger.debug("self.submitted_ids not exists") 1897 return 0, 0, 0, 0 1898 1899 ongoing = [] 1900 idle, run, fail = 0, 0, 0 1901 1902 start = self.submitted_ids[0] 1903 end = self.submitted_ids[-1] 1904 1905 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end)#+" -ac" 1906 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1907 1908 for line in status.stdout: 1909 #ongoing.append(line.split()[0].strip()) 1910 status2 = line.decode().split()[-1] 1911 if status2 != 'null' or line.split()[0].strip() != '0': 1912 ongoing.append(line.split()[0].strip()) 1913 logger.debug("["+line.split()[0].strip()+"]"+status2) 1914 if status2 != 'null' or line.split()[0].strip() != '0': 1915 idle += 1 1916 elif status2 in self.idle_tag: 1917 idle += 1 1918 elif status2 in self.running_tag: 1919 run += 1 1920 elif status2 in self.complete_tag: 1921 if not self.check_termination(line.split()[0]): 1922 idle +=1 1923 else: 1924 fail += 1 1925 1926 return idle, run, self.submitted - (idle+run+fail), fail
1927 1928 @multiple_try()
1929 - def remove(self, *args, **opts):
1930 """Clean the jobson the cluster""" 1931 1932 if not self.submitted_ids: 1933 return 1934 for i in range(len(self.submitted_ids)): 1935 cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i] 1936 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1937
1938 -class HTCaaS2Cluster(Cluster):
1939 """Class for dealing with cluster submission on a HTCaaS cluster without GPFS """ 1940 1941 name= 'htcaas2' 1942 job_id = 'HTCAAS2_JOBID' 1943 idle_tag = ['waiting'] 1944 running_tag = ['preparing','running'] 1945 complete_tag = ['done'] 1946 1947 @store_input() 1948 @multiple_try()
1949 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1950 log=None, input_files=[], output_files=[], required_output=[], 1951 nb_submit=0):
1952  
1953          """Submit the HTCaaS job on the cluster with NO SHARE DISK 
1954             input/output files should be given as paths relative to CWD 
1955          """ 
1956          if cwd is None: 
1957              cwd = os.getcwd() 
1958  
1959          if not os.path.exists(prog): 
1960              prog = os.path.join(cwd, prog) 
1961  
1962          if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog : 
1963              if cwd or prog : 
1964                  self.submitted_dirs.append(cwd) 
1965                  self.submitted_exes.append(prog) 
1966              else: 
1967                  logger.debug("cwd and prog do not exist -> "+str(cwd)+" / "+os.path.basename(prog)) 
1968  
1969              if argument : 
1970                  self.submitted_args.append('='.join([str(a) for a in argument])) 
1971  
1972              if cwd or prog : 
1973                  self.submitted += 1 
1974                  id = self.submitted 
1975                  self.submitted_ids.append(id) 
1976              else: 
1977                  logger.debug("cwd and prog do not exist!") 
1978                  id = 0 
1979  
1980          else: 
1981              temp_file_name = "sub."+ os.path.basename(prog) 
1982              text = """#!/bin/bash 
1983  MYPWD=%(cwd)s 
1984  cd $MYPWD 
1985  input_files=(%(input_files)s ) 
1986  for i in ${input_files[@]} 
1987  do 
1988      chmod -f +x $i 
1989  done 
1990  /bin/bash %(prog)s %(arguments)s > %(stdout)s 
1991  """ 
1992              dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog, 
1993                      'arguments': ' '.join([str(a) for a in argument]), 
1994                      'program': ' ' if '.py' in prog else 'bash'} 
1995              # writing a new script for the submission 
1996              new_prog = pjoin(cwd, temp_file_name) 
1997              open(new_prog, 'w').write(text % dico) 
1998              misc.Popen(['chmod','+x',new_prog],cwd=cwd) 
1999              command = ['htcaas-mgjob-submit','-d',cwd,'-e',new_prog] 
2000              a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 
2001              id = a.stdout.read().decode().strip() 
2002              logger.debug("[mode2]-["+str(id)+"]") 
2003              if cwd and prog : 
2004                  self.submitted += 1 
2005                  self.submitted_ids.append(id) 
2006              else: 
2007                  logger.debug("cwd and prog do not exist!") 
2008                  id = 0 
2009  
2010          return id
2011 2012 @multiple_try()
2013 - def metasubmit(self, me_dir=None):
2014          if self.submitted > 1100 and self.submitted == len(self.submitted_ids): 
2015              tmp_leng = len(self.submitted_ids) // 2 
2016              tmp_dirs1 = self.submitted_dirs[0:tmp_leng] 
2017              tmp_dirs2 = self.submitted_dirs[tmp_leng:] 
2018              tmp_exes1 = self.submitted_exes[0:tmp_leng] 
2019              tmp_exes2 = self.submitted_exes[tmp_leng:] 
2020              command1 = ['htcaas-mgjob-submit','-d',":".join([str(a) for a in tmp_dirs1 if a and a != ' ']), 
2021                          '-e', ":".join([str(a) for a in tmp_exes1 if a and a != ' '])] 
2022              command2 = ['htcaas-mgjob-submit','-d',":".join([str(a) for a in tmp_dirs2 if a and a != ' ']), 
2023                          '-e', ":".join([str(a) for a in tmp_exes2 if a and a != ' '])] 
2024              if len(self.submitted_args) > 0 : 
2025                  tmp_args1 = self.submitted_args[0:tmp_leng] 
2026                  tmp_args2 = self.submitted_args[tmp_leng:] 
2027                  command1.extend(['-a', ':'.join([str(a) for a in tmp_args1])]) 
2028                  command2.extend(['-a', ':'.join([str(a) for a in tmp_args2])]) 
2029              result1 = misc.Popen(command1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) 
2030              result2 = misc.Popen(command2, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) 
2031              me_dir = str(result1.stdout.read().decode().strip()) + "//" + str(result2.stdout.read().decode().strip()) 
2032  
2033          elif self.submitted > 0 and self.submitted == self.submitted_ids[-1]: 
2034              command = ['htcaas-mgjob-submit','-d',":".join([str(a) for a in self.submitted_dirs if a and a != ' ']), 
2035                         '-e', ":".join([str(a) for a in self.submitted_exes if a and a != ' '])] 
2036              if len(self.submitted_args) > 0 : 
2037                  command.extend(['-a', ':'.join([str(a) for a in self.submitted_args])]) 
2038              if self.submitted_dirs[0] or self.submitted_exes[0] : 
2039                  result = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) 
2040                  me_dir = result.stdout.read().decode().strip() 
2041                  self.submitted_ids[0] = me_dir 
2042              else: 
2043                  me_dir = self.submitted_ids[-1] 
2044          elif self.submitted > 0 and self.submitted != self.submitted_ids[-1]: 
2045              me_dir = self.submitted_ids[0] 
2046          else: 
2047              me_dir = -1 
2048  
2049          logger.debug("[" + str(me_dir) + "]") 
2050  
2051          self.submitted_dirs = [] 
2052          self.submitted_exes = [] 
2053          self.submitted_args = [] 
2054  
2055          return me_dir
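The colon-separated batching performed by `metasubmit` can be pictured with a short, self-contained example; the directories, executables and arguments below are invented for the illustration:

# Stand-alone illustration of how accumulated jobs are packed into a single
# htcaas-mgjob-submit command line (all paths are made up).
dirs = ['/work/P1_gg_ttx/G1', '/work/P1_gg_ttx/G2']
exes = ['ajob1', 'ajob2']
args = ['0=1', '0=2']

command = ['htcaas-mgjob-submit',
           '-d', ':'.join(d for d in dirs if d and d != ' '),
           '-e', ':'.join(e for e in exes if e and e != ' ')]
if args:
    command.extend(['-a', ':'.join(args)])
# command == ['htcaas-mgjob-submit', '-d', '/work/P1_gg_ttx/G1:/work/P1_gg_ttx/G2',
#             '-e', 'ajob1:ajob2', '-a', '0=1:0=2']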
2056 2057 2058 @multiple_try(nb_try=10, sleep=5)
2059 - def control_one_job(self, id):
2060 """ control the status of a single job with it's cluster id """ 2061 #logger.debug("CONTROL ONE JOB MODE") 2062 if self.submitted == self.submitted_ids[-1] : 2063 id = self.metasubmit(self) 2064 tempid = self.submitted_ids[-1] 2065 self.submitted_ids.remove(self.submitted_ids[-1]) 2066 self.submitted_ids.append(id) 2067 logger.debug(str(id)+" // "+str(self.submitted_ids[-1])) 2068 2069 if id == 0 : 2070 status_out ='C' 2071 else: 2072 cmd = 'htcaas-job-status -m '+ str(id) + " -s | grep Status " 2073 status = misc.Popen([cmd],shell=True,stdout=subprocess.PIPE, 2074 stderr=subprocess.PIPE) 2075 error = status.stderr.read().decode() 2076 if status.returncode or error: 2077 raise ClusterManagmentError('htcaas-job-status returns error: %s' % error) 2078 status_out= status.stdout.read().decode().strip() 2079 status_out= status_out.split(":",1)[1] 2080 logger.debug("[["+str(id)+"]]"+status_out) 2081 if status_out == 'waiting': 2082 status_out='I' 2083 elif status_out == 'preparing' or status_out == 'running': 2084 status_out = 'R' 2085 elif status_out != 'done': 2086 status_out = 'F' 2087 elif status_out == 'done': 2088 status_out = 'C' 2089 self.submitted -= 1 2090 2091 return status_out
2092 2093 @multiple_try()
2094 - def control(self, me_dir):
2095 """ control the status of a single job with it's cluster id """ 2096 if not self.submitted_ids: 2097 logger.debug("self.submitted_ids not exists") 2098 return 0, 0, 0, 0 2099 2100 if "//" in me_dir : 2101 if int(me_dir.split("//")[0]) < int(me_dir.split("//")[1]) : 2102 start = me_dir.split("//")[0] 2103 end = me_dir.split("//")[1] 2104 else : 2105 start = me_dir.split("//")[1] 2106 end = me_dir.split("//")[0] 2107 elif "/" in me_dir : # update 2108 start = 0 2109 end = 0 2110 elif me_dir.isdigit(): 2111 start = me_dir 2112 end = me_dir 2113 elif not me_dir.isdigit(): 2114 me_dir = self.submitted_ids[0] 2115 logger.debug("Meta_ID is not digit(control), self.submitted_ids[0]: "+str(me_dir) ) 2116 2117 ongoing = [] 2118 idle, run, fail, done = 0, 0, 0, 0 2119 2120 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end) +" -ac" 2121 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 2122 2123 for line in status.stdout: 2124 line = line.decode() 2125 status2 = line.split()[-1] 2126 if status2 != 'null' or line.split()[0].strip() != '0': 2127 ongoing.append(str(line.split()[0].strip())+"-"+str(line.split()[1].strip())) 2128 logger.debug("["+line.split()[0].strip()+"-"+line.split()[1].strip()+"]"+status2) 2129 2130 if status2 == 'null' or line.split()[0].strip() == '0': 2131 idle += 1 2132 elif status2 in self.idle_tag: 2133 idle += 1 2134 elif status2 in self.running_tag: 2135 run += 1 2136 elif status2 in self.complete_tag: 2137 done += 1 2138 self.submitted -= 1 2139 if not self.check_termination(line.split()[1]): 2140 idle +=1 2141 else: 2142 fail += 1 2143 2144 return idle, run, self.submitted - (idle+run+fail), fail
2145 2146 @multiple_try()
2147 - def remove(self, *args, **opts):
2148 """Clean the jobson the cluster""" 2149 2150 if not self.submitted_ids: 2151 return 2152 id = self.submitted_ids[0] 2153 if id: 2154 cmd = "htcaas-job-cancel -m %s" % str(id) 2155 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
2156  
2157  from_name = {'condor': CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster, 
2158               'lsf': LSFCluster, 'ge': GECluster, 'slurm': SLURMCluster, 
2159               'htcaas': HTCaaSCluster, 'htcaas2': HTCaaS2Cluster} 
2160  
2161  onecore = MultiCore(1)  # create a thread to run simple bash jobs without 
2162                          # having to fork the main process 
2163  
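As a closing illustration, a scheduler back-end could be picked out of `from_name` from an options dictionary along the following lines; the dictionary and its key names are made up for the example and are not taken from this module:

# Hypothetical back-end selection; option names are placeholders and may not
# match a real configuration file.
opts = {'cluster_type': 'slurm',
        'cluster_queue': 'madgraph',
        'cluster_nb_retry': 1,
        'cluster_retry_wait': 300}

cluster_cls = from_name[opts['cluster_type']]     # -> SLURMCluster in this case
mycluster = cluster_cls(**opts)
# mycluster can now be driven through submit2() / control() / remove()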