###############################################################################
 #
 #  This file is part of canu, a software program that assembles whole-genome
 #  sequencing reads into contigs.
 #
 #  This software is based on:
 #    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
 #    the 'kmer package' r1994 (http://kmer.sourceforge.net)
 #
 #  Except as indicated otherwise, this is a 'United States Government Work',
 #  and is released in the public domain.
 #
 #  File 'README.licenses' in the root directory of this distribution
 #  contains full conditions and disclaimers.
 ##

package canu::Execution;

require Exporter;

@ISA    = qw(Exporter);
@EXPORT = qw(stopAfter
             resetIteration
             touch
             makeExecutable
             getJobIDShellCode
             getLimitShellCode
             getBinDirectory
             getBinDirectoryShellCode
             setWorkDirectory
             setWorkDirectoryShellCode
             submitScript
             submitOrRunParallelJob
             runCommand
             runCommandSilently
             findCommand
             findExecutable
             caExit
             caFailure);

use strict;
use warnings "all";
no warnings "uninitialized";

use Config;            #  for @signame
use Cwd qw(getcwd);
use Carp qw(longmess);

use POSIX ":sys_wait_h";  #  For waitpid(..., &WNOHANG)
use File::Basename;
use List::Util qw(min max);
use File::Path 2.08 qw(make_path remove_tree);
use File::Spec;

use canu::Defaults;





#  Log that we've finished a task.
#
#  Writes a "Finished on <date>" banner to STDERR that reports the time elapsed since
#  $startsecs and the value returned by diskSpace(".").  A warning marker is appended
#  to the banner when that value is below 10.
#
#  Parameters:
#    $dir       - accepted for the benefit of callers, but not used here; the disk
#                 space is always measured for ".".
#    $startsecs - value of time() when the task started.
#
sub logFinished ($$) {
    my $dir       = shift @_;
    my $startsecs = shift @_;

    my $diskfree  = diskSpace(".");

    #  Declare and initialize in one unconditional statement.  'my $x = ... if (...)'
    #  leaves the variable in an undefined state when the condition is false.
    my $warning   = ($diskfree < 10) ? " !!! WARNING !!!" : "";
    my $elapsed   = time() - $startsecs;
    my $message;

    my @fast = ("lickety-split",
                "fast as lightning",
                "furiously fast",
                "like a bat out of hell",
                "in the blink of an eye");

    my @slow = ("fashionably late",
                "better late than never",
                "like watching paint dry",
                "at least I didn't crash",
                "it'll be worth it in the end",
                "no bitcoins found either");

    my $rf = int(rand(scalar(@fast)));
    my $rs = int(rand(scalar(@slow)));
    my $rp = int(rand(100));

    $message  = "$elapsed seconds"   if ($elapsed  > 1);
    $message  = "one second"         if ($elapsed == 1);
    $message  = $fast[$rf]           if ($elapsed  < 1);

    #  The longer the task took, the more likely we are to add a quip about it.
    $message .= ", " . $slow[$rs]    if ((($elapsed > 1000)  && ($rp < 1)) ||
                                         (($elapsed > 10000) && ($rp < 50)) ||
                                         (($elapsed > 86400)));

    print STDERR "\n";
    print STDERR "-- Finished on ", scalar(localtime()), " ($message) with $diskfree GB free disk space$warning\n";
    print STDERR "----------------------------------------\n";
}



#
#  Functions for running multiple processes at the same time.  This is private to the module.
#

my $numberOfProcesses       = 0;     #  Number of jobs concurrently running
my $numberOfProcessesToWait = 0;     #  Number of jobs we can leave running at exit
my @processQueue            = ();    #  Commands waiting to be started
my @processesRunning        = ();    #  PIDs of the children we have started
my $printProcessCommand     = 1;     #  Show commands as they run

#  Set the maximum number of processes schedulerRun() will keep running at once.
#
sub schedulerSetNumberOfProcesses {
    $numberOfProcesses = shift @_;
}

#  Add a command (a single shell command line) to the queue of work to run.
#
sub schedulerSubmit ($) {
    my $cmd = shift @_;

    chomp $cmd;

    push @processQueue, $cmd;
}

#  Fork a child that exec()s $process.  Returns the child's PID in the parent.
#  Retries the fork after one second on EAGAIN, and dies on any other fork failure.
#
sub schedulerForkProcess ($) {
    my $process = shift @_;
    my $pid;

    #  From Programming Perl, page 167
  FORK: {
        if ($pid = fork) {
            #  Parent
            #
            return($pid);
        } elsif (defined $pid) {
            #  Child
            #
            #  exec() returns only when it fails.  In that case the child must stop
            #  here; otherwise it would return to the caller and carry on running the
            #  parent's code as a second copy of the scheduler.
            exec($process) or do {
                print STDERR "-- Failed to exec '$process': $!\n";
                POSIX::_exit(127);
            };
        } elsif ($!{EAGAIN}) {
            #  EAGAIN, supposedly a recoverable fork error.  Test the errno itself; the
            #  message text differs between platforms.
            sleep 1;
            redo FORK;
        } else {
            die "Can't fork: $!\n";
        }
    }
}

#  Non-blocking check on a child.  Returns 1 if $pid has exited (and has now been
#  reaped), 0 otherwise.
#
sub schedulerReapProcess ($) {
    my $pid = shift @_;

    if (waitpid($pid, WNOHANG()) > 0) {
        return(1);
    } else {
        return(0);
    }
}

#  Reap finished children, then start queued commands until either the queue is
#  empty or $numberOfProcesses children are running.
#
sub schedulerRun () {
    my @running;

    #  Reap any processes that have finished

    foreach my $i (@processesRunning) {
        push @running, $i  if (schedulerReapProcess($i) == 0);
    }

    @processesRunning = @running;

    #  Run processes in any available slots

    while ((scalar(@processesRunning) < $numberOfProcesses) &&
           (scalar(@processQueue) > 0)) {
        my $process = shift @processQueue;
        print STDERR " $process\n"   if ($printProcessCommand);
        push @processesRunning, schedulerForkProcess($process);
    }
}

#  Run every queued command, at most $numberOfProcesses at a time, then wait until
#  no more than $numberOfProcessesToWait children are still running.
#
#  Parameters:
#    $dir - directory to run the commands in, or undef to stay in the current one.
#    $nam - label used in the log banner.
#
#  The working directory is restored before returning.
#
sub schedulerFinish ($$) {
    my $dir    = shift @_;
    my $nam    = shift @_;
    my $remain = scalar(@processQueue);

    my $startsecs = time();
    my $diskfree  = (defined($dir)) ? (diskSpace($dir)) : (0);

    print STDERR "----------------------------------------\n";
    print STDERR "-- Starting '$nam' concurrent execution on ", scalar(localtime()), " with $diskfree GB free disk space ($remain processes; $numberOfProcesses concurrently)\n"  if (defined($dir));
    print STDERR "-- Starting '$nam' concurrent execution on ", scalar(localtime()), " ($remain processes; $numberOfProcesses concurrently)\n"                                     if (!defined($dir));
    print STDERR "\n";

    my $cwd = getcwd();        #  Remember where we are.

    #  Root the jobs in the correct location, but only if we were given one.
    #  chdir(undef) is never what the caller wants.
    if (defined($dir)) {
        print STDERR " cd $dir\n";
        chdir($dir)  or  print STDERR "-- WARNING: failed to chdir to '$dir': $!\n";
    }

    #  Run all submitted jobs
    #
    while ($remain > 0) {
        schedulerRun();

        $remain = scalar(@processQueue);

        if ($remain > 0) {
            #  Block until any child exits, then forget about it so that the next
            #  schedulerRun() has a free slot.
            my $child = waitpid(-1, 0);

            @processesRunning = grep { $_ != $child } @processesRunning;
        }
    }

    #  Wait for them to finish, if requested
    #
    while (scalar(@processesRunning) > $numberOfProcessesToWait) {
        waitpid(shift @processesRunning, 0);
    }

    logFinished($dir, $startsecs);

    chdir($cwd);
}



#
#  File Management
#

#  Create (or truncate) the file named by the first argument.  If a second argument
#  is supplied it is written to the file, followed by a newline.  Calls caFailure()
#  if the file cannot be opened.
#
sub touch ($@) {
    my ($file, $content) = @_;

    #  Three-argument open, so that characters in the file name can never be
    #  interpreted as part of the open mode.
    open(my $fh, ">", $file)  or  caFailure("failed to touch file '$file'", undef);
    print $fh "$content\n"  if (defined($content));
    close($fh);
}



#  Set the mode of $file to 0755 with the bits of the current umask removed.
#
sub makeExecutable ($) {
    my $file = shift @_;

    chmod(0755 & ~umask(), $file);
}


#
#  State management
#

#  Exit successfully if the just-completed stage is the one named by the global
#  'stopAfter', or if the stage name is 'theend'.  The stage name is compared after
#  being lowercased.  Otherwise return and let the caller carry on.
#
#  Before exiting, runs the command in the global 'onSuccess' (if defined) in the
#  directory 'onExitDir', with 'onExitNam' appended as its argument.
#
sub stopAfter ($) {
    my $stopAfter = shift @_;

    $stopAfter =~ tr/A-Z/a-z/;

    return   if (($stopAfter ne "theend") &&
                 ($stopAfter ne getGlobal("stopAfter")));

    if ($stopAfter ne "theend") {
        print STDERR "--\n";
        print STDERR "-- Stop requested after '$stopAfter'.\n";
    }

    if (defined(getGlobal("onSuccess"))) {
        print STDERR "--\n";
        print STDERR "-- Running user-supplied termination command.\n";

        runCommand(getGlobal("onExitDir"), getGlobal("onSuccess") . " " . getGlobal("onExitNam"));
    }

    print STDERR "--\n";
    print STDERR "-- Bye.\n";

    exit(0);
}


#  Reset the global 'canuIteration' to zero, logging the stage name if one is given.
#
sub resetIteration ($) {
    my $stage = shift @_;

    print STDERR "-- Finished stage '$stage', reset canuIteration.\n"   if (defined($stage));

    setGlobal("canuIteration", 0);
}



#  Decide what bin directory to use.
#
#  When we are running on the grid, the path of this perl script is NOT always the correct
#  architecture.  If the submission host is FreeBSD, but the grid is Linux, the BSD box will submit
#  FreeBSD/bin/canu to the grid.  Unless it knows which grid host it will run on in advance, there
#  is no way to pick the correct one.  The grid host then has to have enough smarts to choose the
#  correct binaries, and that is what we're doing here.
#
#  To make it more trouble, shell scripts need to do all this by themselves.
#
#sub getInstallDirectory () {
#    my $installDir = ;
#
#    if ($installDir =~ m!^(.*)/\w+-\w+/bin$!) {
#        $installDir = $1;
#    }
#
#    return($installDir);
#}


#  Emits a block of shell code to parse the grid task id and offset.
#  Expects zero or one argument, which is interpreted differently in grid and non-grid mode.
#    Off grid - the job to run
#    On grid  - an offset to add to SGE_TASK_ID or SLURM_ARRAY_TASK_ID to compute the job to run
#
#  PBSPro refuses to run an array job with one element.  They're submitted as a normal job.  Here,
#  we check if it is running on the grid and if the task ID (aka, array ID) isn't set.  If so, we
#  assume it is job 1.
#
#  Returns the shell code as a single string.  The name of the environment variable
#  holding the grid task id is read from the global 'gridEngineTaskID'.
#
#  The emitted code sets the shell variables 'baseid', 'offset' and 'jobid'
#  (jobid = baseid + offset), and makes the script exit if no job id can be determined.
#
sub getJobIDShellCode () {
    my $string;
    my $taskenv = getGlobal('gridEngineTaskID');

    $string .= "# Discover the job ID to run, from either a grid environment variable and a\n";
    $string .= "# command line offset, or directly from the command line.\n";
    $string .= "#\n";
    $string .= "if [ x\$$taskenv = x -o x\$$taskenv = xundefined -o x\$$taskenv = x0 ]; then\n";
    $string .= " baseid=\$1\n";           #  Off grid
    $string .= " offset=0\n";
    $string .= "else\n";
    $string .= " baseid=\$$taskenv\n";    #  On Grid
    $string .= " offset=\$1\n";
    $string .= "fi\n";
    $string .= "if [ x\$offset = x ]; then\n";
    $string .= " offset=0\n";
    $string .= "fi\n";
    $string .= "if [ x\$baseid = x ]; then\n";
    $string .= " echo Error: I need $taskenv set, or a job index on the command line.\n";
    $string .= " exit\n";
    $string .= "fi\n";
    $string .= "jobid=`expr -- \$baseid + \$offset`\n";
    $string .= "if [ x\$baseid = x0 ]; then\n";
    $string .= " echo Error: jobid 0 is invalid\\; I need $taskenv set, or a job index on the command line.\n";
    $string .= " exit\n";
    $string .= "fi\n";
    $string .= "if [ x\$$taskenv = x ]; then\n";
    $string .= " echo Running job \$jobid based on command line options.\n";
    $string .= "else\n";
    $string .= " echo Running job \$jobid based on $taskenv=\$$taskenv and offset=\$offset.\n";
    $string .= "fi\n";

    #  Return the code explicitly rather than relying on the value of the last
    #  statement executed.
    return($string);
}


#  Emits a block of shell code to change shell imposed limit on the number of open files and
#  processes.
#
#  Returns the shell code as a single string.  For each of the two limits (processes
#  per user, then open files) the emitted code raises the soft limit to the hard
#  limit when the soft limit is lower, and reports what it did.
#
sub getLimitShellCode () {
    my @lines;

    push @lines, "echo \"\"";
    push @lines, "echo \"Attempting to increase maximum allowed processes and open files.\"";

    #  The two stanzas differ only in the ulimit flag and in the description shown.
    foreach my $limit (["u", "processes per user"], ["n", "open files"]) {
        my ($flag, $what) = @$limit;

        push @lines, "max=`ulimit -H$flag`";
        push @lines, "bef=`ulimit -S$flag`";
        push @lines, "if [ \$bef -lt \$max ] ; then";
        push @lines, " ulimit -S$flag \$max";
        push @lines, " aft=`ulimit -S$flag`";
        push @lines, " echo \" Changed max $what from \$bef to \$aft (max \$max).\"";
        push @lines, "else";
        push @lines, " echo \" Max $what limited to \$bef, no increase possible.\"";
        push @lines, "fi";
        push @lines, "";
    }

    push @lines, "echo \"\"";
    push @lines, "";

    #  Every line, including the last, is newline terminated.
    return(join("\n", @lines) . "\n");
}


#  Used inside canu to find where binaries are located.
#
#  Returns $FindBin::RealBin.  An earlier approach, which derived the path from
#  getInstallDirectory() and preferred "<install dir>/bin" when that directory
#  existed, is no longer used.
#
sub getBinDirectory () {
    my $binDir = $FindBin::RealBin;

    return($binDir);
}


#  Emits a block of shell code to locate binaries during shell scripts.  See comments on
#  getBinDirectory.
#
#  The returned string contains, in order: the user's 'preExec' commands (if that
#  global is defined), an assignment of $FindBin::RealBin to the shell variable 'bin',
#  commands that report the perl, java and canu being used, and exports of the
#  object store settings.
#
sub getBinDirectoryShellCode () {
    my $installDir = $FindBin::RealBin;
    my @script;

    #  First, run any preExec command that might exist.

    if (defined(getGlobal("preExec"))) {
        push @script, "# Pre-execution commands.";
        push @script, "";
        push @script, getGlobal('preExec');
        push @script, "";
    }

    #  Then, setup and report paths.

    my $java     = getGlobal("java");
    my $canuPath = "\$bin/" . basename($0);   #  NOTE: $bin decided at script run time

    push @script, "";
    push @script, "# Path to Canu.";
    push @script, "";
    push @script, "bin=\"$installDir\"";
    push @script, "";
    push @script, "# Report paths.";
    push @script, "";
    push @script, "echo \"\"";
    push @script, "echo \"Found perl:\"";
    push @script, "echo \" \" `which perl`";
    push @script, "echo \" \" `perl --version | grep version`";
    push @script, "echo \"\"";
    push @script, "echo \"Found java:\"";
    push @script, "echo \" \" `which $java`";
    push @script, "echo \" \" `$java -showversion 2>&1 | head -n 1`";
    push @script, "echo \"\"";
    push @script, "echo \"Found canu:\"";
    push @script, "echo \" \" $canuPath";
    push @script, "echo \" \" `$canuPath -version`";
    push @script, "echo \"\"";
    push @script, "";
    push @script, "";
    push @script, "# Environment for any object storage.";
    push @script, "";
    push @script, "export CANU_OBJECT_STORE_CLIENT="    . getGlobal("objectStoreClient");
    push @script, "export CANU_OBJECT_STORE_CLIENT_UA=" . getGlobal("objectStoreClientUA");
    push @script, "export CANU_OBJECT_STORE_CLIENT_DA=" . getGlobal("objectStoreClientDA");
    push @script, "export CANU_OBJECT_STORE_NAMESPACE=" . getGlobal("objectStoreNameSpace");
    push @script, "export CANU_OBJECT_STORE_PROJECT="   . getGlobal("objectStoreProject");
    push @script, "";
    push @script, "";

    #  Every line, including the last, is newline terminated.
    return(join("\n", @script) . "\n");
}




#
#  If running on a cloud system, shell scripts are started in some random location.
#  setWorkDirectory() will create the directory the script is supposed to run in (e.g.,
#  correction/0-mercounts) and move into it.  This will keep the scripts compatible with the way
#  they are run from within canu.pl.
#
#  If you're fine running in 'some random location' do nothing here.
#
#  Note that canu does minimal cleanup.
#

#  Move into the directory the executive should run in, and remember where that is.
#
#  Parameters:
#    $asm     - assembly name; saved in the global 'onExitNam'.
#    $rootdir - assembly directory; created if missing, then entered.  May be undef.
#
#  The directory we end up in is saved in the global 'onExitDir'.
#
sub setWorkDirectory ($$) {
    my $asm     = shift @_;
    my $rootdir = shift @_;

    #  Set the initial directory based on various rules.
    #
    #  For the canu executive, in grid mode, both setWorkDirectoryShellCode and
    #  this (in that order) are called.  TEST is assuming that all (non-executive)
    #  compute jobs are run as arrays.

    if ((getGlobal("objectStore") eq "TEST") && (defined($ENV{"JOB_ID"}))) {
        #  SGE_TASK_ID is 'undefined' here since this isn't an array job, so only
        #  the job id is used to name the directory.
        my $jid = $ENV{'JOB_ID'};

        remove_tree("/assembly/objectstore/job-$jid");
        make_path  ("/assembly/objectstore/job-$jid");
        chdir      ("/assembly/objectstore/job-$jid");
    }

    elsif (getGlobal("objectStore") eq "DNANEXUS") {
    }

    elsif (getGlobal("gridEngine") eq "PBSPRO") {
        chdir($ENV{"PBS_O_WORKDIR"})   if (exists($ENV{"PBS_O_WORKDIR"}));
        delete $ENV{"PBS_O_WORKDIR"};
    }

    #  Now move into the assembly directory.

    if (defined($rootdir)) {
        make_path($rootdir)  if (! -d $rootdir);
        chdir($rootdir);
    }

    #  And save some pieces we need when we quit.

    setGlobal("onExitDir", getcwd());
    setGlobal("onExitNam", $asm);
}



#  Emits a block of shell code that moves a compute script into its working
#  directory.  $path is the directory, relative to the assembly root, that the script
#  expects to run in.  Returns an empty string when no move is needed.
#
sub setWorkDirectoryShellCode ($) {
    my $path = shift @_;
    my $code = "";

    if (getGlobal("objectStore") eq "TEST") {
        $code .= "if [ z\$SGE_TASK_ID != z ] ; then\n";
        $code .= " jid=\$JOB_ID\n";
        $code .= " tid=\$SGE_TASK_ID\n";
        $code .= " if [ x\$tid != xundefined ] ; then\n";
        $code .= " rm -rf /assembly/objectstore/job-\$jid-\$tid/\n";
        $code .= " mkdir -p /assembly/objectstore/job-\$jid-\$tid/$path\n";
        $code .= " cd /assembly/objectstore/job-\$jid-\$tid/$path\n";
        $code .= " fi\n";
        $code .= "fi\n";
    }

    elsif (getGlobal("objectStore") eq "DNANEXUS") {
        #  You're probably fine running in some random location, but if there is faster disk
        #  available, move there.
    }

    elsif (getGlobal("gridEngine") eq "PBSPRO") {
        $code .= "if [ z\$PBS_O_WORKDIR != z ] ; then\n";
        $code .= " cd \$PBS_O_WORKDIR\n";
        $code .= "fi\n";
    }

    return($code);
}



#  Spend too much effort ensuring that the name is unique in the system.  For 'canu' jobs, we don't
#  care.

#  Returns a string of $length characters drawn at random from the digits and letters,
#  excluding the easily confused 'l', 'I' and 'O'.  Returns undef if $length is not
#  positive.
#
sub makeRandomSuffix ($) {
    my $length = shift @_;
    my @chars  = ('0'..'9', 'a'..'k', 'm'..'z', 'A'..'H', 'J'..'N', 'P'..'Z');    #  Remove 'l', 'I' and 'O'
    my $suffix;

    while ($length-- > 0) {
        #  Index a single element, and size the draw from the list itself so the two
        #  can never disagree.
        $suffix .= $chars[int(rand(scalar(@chars)))];
    }

    return($suffix);
}


#  Returns the name to submit a grid job under: "<jobType>_<asm>", with
#  "_<gridOptionsJobName>" appended when that global is defined.
#
#  NOTE: although the names of existing SGE jobs are collected, and a random suffix is
#  chosen on a collision, the suffix is currently NOT appended to the returned name
#  (see the comment at the end of the function).
#
sub makeUniqueJobName ($$) {
    my $jobType = shift @_;
    my $asm     = shift @_;

    my $userName   = getGlobal("gridOptionsJobName");
    my $nameSuffix = (defined($userName)) ? ("_" . $userName) : ("");

    #  If a canu job, just return the standard name.  No uniquification needed.

    if ($jobType eq "canu") {
        return("canu_" . $asm . $nameSuffix);
    }

    #  For all other jobs, we need to ensure the name is unique.  We do this by adding digits at the end.

    my $jobName = "${jobType}_" . $asm . $nameSuffix;
    my %jobs;

    #  First, find the list of all jobs that exist.  Only SGE is implemented; nothing
    #  is looked up for PBS, PBSPro, LSF or DNANEXUS.
    #
    #  This is best-effort: if qstat cannot be run, we carry on with no known jobs.

    if (uc(getGlobal("gridEngine")) eq "SGE") {
        if (open(my $qstat, "-|", "qstat -xml")) {
            while (my $line = <$qstat>) {
                $jobs{$1}++   if ($line =~ m/^\s*<JB_name>(.*)<\/JB_name>$/);
            }
            close($qstat);
        }
    }

    #  If the jobName doesn't exist, we can use it.

    return($jobName)   if (! exists($jobs{$jobName}));

    #  Otherwise, find a unique random 2-letter suffix.

    my $jobIdx = makeRandomSuffix(2);

    while (exists($jobs{"${jobName}_$jobIdx"})) {
        $jobIdx = makeRandomSuffix(2);
    }

    #  And return it!  Simple!

    #  this was breaking dependencies when multiple jobs were submitted like for a failed consensus run, turn off for now
    return("${jobName}");
    #return("${jobName}_$jobIdx");
}




#  Submit ourself back to the grid.  If the one argument is defined, make us hold on jobs with that
#  name.
#
#  The previous version (CA) would use "gridPropagateHold" to reset holds on existing jobs so that
#  they would also hold on this job.
#
#  Parameters:
#    $asm     - assembly name, used to build the job name.
#    $jobHold - grid option(s) placed first on the submit command line, or undef.
#
#  Returns (doing nothing) when grid use is not requested, no grid engine is known, or
#  we are already running under the grid with no hold requested.  Otherwise this does
#  NOT return: it exits 0 once the submission succeeds, or exits 1 after two failed
#  attempts.
#
sub submitScript ($$) {
    my $asm     = shift @_;
    my $jobHold = shift @_;

    return   if (getGlobal("useGrid") ne "1");                       #  If not requested to run on the grid,

    my $gridEngine = getGlobal("gridEngine");

    return   if (!defined($gridEngine) || ($gridEngine eq ""));      #  or can't run on the grid, don't run on the grid.

    #  If no job hold, and we are already on the grid, do NOT resubmit ourself.
    #
    #  When the user launches canu on the head node, a call to submitScript() is made to launch canu
    #  under grid control.  That results in a restart of canu, and another call to submitScript(),
    #  but this time, the environment variable is set, so we can skip the resubmission, and continue
    #  with canu execution.

    return   if ((!defined($jobHold) || ($jobHold eq "")) &&
                 (exists($ENV{getGlobal("gridEngineJobID")})));

    #  Figure out the name of the script we want to be making, and a place for it to write output.
    #  The index is a string so that '++' keeps its leading zero ("01" -> "02").

    my $idx = "01";

    while ((-e "canu-scripts/canu.$idx.sh") ||
           (-e "canu-scripts/canu.$idx.out")) {
        $idx++;
    }

    my $script    = "canu-scripts/canu.$idx.sh";
    my $scriptOut = "canu-scripts/canu.$idx.out";

    #  Make a script for us to submit.

    open(my $fh, ">", $script)  or  caFailure("failed to open '$script' for writing", undef);
    print $fh "#!" . getGlobal("shell") . "\n";
    print $fh "\n";

    if ($gridEngine eq "SGE") {
        print $fh "# Attempt to (re)configure SGE. For unknown reasons, jobs submitted\n";
        print $fh "# to SGE, and running under SGE, fail to read the shell init scripts,\n";
        print $fh "# and so they don't set up SGE (or ANY other paths, etc) properly.\n";
        print $fh "# For the record, interactive logins (qlogin) DO set the environment.\n";
        print $fh "\n";
        print $fh "if [ \"x\$SGE_ROOT\" != \"x\" -a \\\n";
        print $fh " -e \$SGE_ROOT/\$SGE_CELL/common/settings.sh ]; then\n";
        print $fh " . \$SGE_ROOT/\$SGE_CELL/common/settings.sh\n";
        print $fh "fi\n";
    }

    print $fh "\n";
    print $fh getBinDirectoryShellCode();
    print $fh "\n";
    print $fh setWorkDirectoryShellCode(".");
    print $fh "\n";
    print $fh "rm -f canu.out\n";
    print $fh "ln -s $scriptOut canu.out\n";
    print $fh "\n";
    print $fh "/usr/bin/env perl \\\n";
    print $fh "\$bin/" . basename($0) . " " . getCommandLineOptions() . " canuIteration=" . getGlobal("canuIteration") . "\n";
    close($fh);

    makeExecutable("$script");

    #  Construct a submission command line.

    my $jobName = makeUniqueJobName("canu", $asm);

    #  The canu.pl script isn't expected to take resources.  We'll default to 4gb and one thread.

    my $mem = getGlobal("executiveMemory");
    my $thr = getGlobal("executiveThreads");

    my $resOption = buildResourceOption($mem, $thr);

    my $gridOpts;

    $gridOpts  = $jobHold;
    $gridOpts .= " "   if (defined($gridOpts));

    #  LSF ignores all but the first option, so options need to be reversed.
    #  DNAnexus doesn't use threads, and memory is the instance type.

    if (uc($gridEngine) eq "LSF") {
        $gridOpts .= getGlobal("gridOptionsExecutive")   if (defined(getGlobal("gridOptionsExecutive")));
        $gridOpts .= " "                                 if (defined($gridOpts));
        $gridOpts .= getGlobal("gridOptions")            if (defined(getGlobal("gridOptions")));
        $gridOpts .= " "                                 if (defined($gridOpts));
        $gridOpts .= $resOption                          if (defined($resOption));
    }

    elsif (uc($gridEngine) eq "DNANEXUS") {
        $gridOpts .= getGlobal("gridOptions")            if (defined(getGlobal("gridOptions")));
        $gridOpts .= " "                                 if (defined($gridOpts));
        $gridOpts .= getGlobal("gridOptionsExecutive")   if (defined(getGlobal("gridOptionsExecutive")));
    }

    else {
        $gridOpts .= $resOption                          if (defined($resOption));
        $gridOpts .= " "                                 if (defined($gridOpts));
        $gridOpts .= getGlobal("gridOptions")            if (defined(getGlobal("gridOptions")));
        $gridOpts .= " "                                 if (defined($gridOpts));
        $gridOpts .= getGlobal("gridOptionsExecutive")   if (defined(getGlobal("gridOptionsExecutive")));
    }

    my $submitCommand = getGlobal("gridEngineSubmitCommand");
    my $nameOption    = getGlobal("gridEngineNameOption");
    my $outputOption  = getGlobal("gridEngineOutputOption");

    my $qcmd = "$submitCommand $gridOpts";
    $qcmd   .= " $nameOption '$jobName'"     if defined($nameOption);
    $qcmd   .= " $outputOption $scriptOut"   if defined($outputOption);

    #  DNAnexus doesn't submit scripts; all parameters are passed through
    #  '-i' options.  The 'fetch_and_run' magic is in dx-canu/src/canu-job-launcher.sh.
    #  It will download the requested shell script and execute said function
    #  in it.
    #
    #  NOTE(review): the global is spelled 'objectStoreNamespace' here but
    #  'objectStoreNameSpace' in getBinDirectoryShellCode() -- confirm that getGlobal()
    #  treats names case-insensitively.

    if (uc($gridEngine) eq "DNANEXUS") {
        $qcmd .= " \\\n";
        $qcmd .= " -ioutput_folder:string=\"" . getGlobal("objectStoreNamespace") . "\" \\\n";
        $qcmd .= " -iscript_path:string=\"\" \\\n";
        $qcmd .= " -iscript_name:string=\"canu-executive.sh\" \\\n";
        $qcmd .= " -icanu_iteration:int=" . getGlobal("canuIteration") . " \\\n";
        $qcmd .= " -icanu_iteration_max:int=" . getGlobal("canuIterationMax") . " \\\n";
        $qcmd .= " fetch_and_run \\\n";
    }

    else {
        $qcmd .= " $script";
    }

    if (runCommand(getcwd(), $qcmd) == 0) {      #  Exit successfully if we've submitted
        exit(0);                                 #  the next part successfully.
    }

    print STDERR "-- Failed to submit Canu executive. Delay 10 seconds and try again.\n";

    sleep(10);

    if (runCommand(getcwd(), $qcmd) == 0) {
        exit(0);
    }

    print STDERR "-- Failed to submit Canu executive. Giving up after two tries.\n";

    exit(1);
}



#  Fill in the ARRAY_NAME or ARRAY_JOBS placeholder of a grid option template.
#
#  Parameters:
#    $name - job name; replaces ARRAY_NAME.
#    $bgn  - first job index.
#    $end  - last job index.
#    $opt  - the option template.
#    $thr  - threads per job; only used to turn a Slurm core limit into a task limit.
#
#  Returns ($opt, $off): the filled in option, and the offset (or, when the job is not
#  submitted as an array, the job index) to pass to the script on its command line.
#
sub buildGridArray (@) {
    my ( $name, $bgn, $end, $opt, $thr ) = @_;
    my $off = 0;

    #  In some grids (SGE) this is the maximum size of an array job.
    #  In some grids (Slurm) this is the maximum index of an array job.
    #
    #  So, here, we just don't let any index be above the value.  Both types will be happy.

    if ($end > getGlobal('gridEngineArrayMaxJobs')) {
        $off  = $bgn - 1;
        $bgn -= $off;
        $end -= $off;
    }

    #  PBSPro requires array jobs to have bgn < end.  When $bgn == $end, we
    #  just remove the array qualifier.  But only if this option is setting
    #  the number of jobs, not if it is setting the name.
    #  New versions of PBS have this behavior too
    #
    #  With no array there is no task id, so the script treats its argument as the job
    #  to run.  That must be the real job index, which is $bgn plus whatever offset was
    #  removed above (zero unless the indices were shifted).

    if (uc(getGlobal("gridEngine")) eq "PBSPRO" || uc(getGlobal("gridEngine")) eq "PBS") {
        if (($bgn == $end) && ($opt =~ m/ARRAY_JOBS/)) {
            $opt = "";
            $off = $bgn + $off;
        }
    }

    #  DNA nexus doesn't have arrays and only supports 1 job, which we use to pass the identifier
    #  Set the offset to blank since it is not supported as well

    if (uc(getGlobal("gridEngine")) eq "DNANEXUS" && ($bgn == $end) && ($opt =~ m/ARRAY_JOBS/)) {
        my $jid = $bgn + $off;
        $opt =~ s/ARRAY_JOBS/$jid/g;
        $off = "";
    }

    #  Further, PBS/Torque won't let scripts be passed options unless they
    #  are prefixed with a -F....and PBSPro doesn't need this.

    if (uc(getGlobal("gridEngine")) eq "PBS") {
        $off = "-F \"$off\"";
    }

    if ($opt =~ m/(ARRAY_NAME)/) {
        $opt =~ s/$1/$name/;          #  Replace ARRAY_NAME with 'job name'
    }
    elsif ($opt =~ m/(ARRAY_JOBS)/) {
        $opt =~ s/$1/$bgn-$end/;      #  Replace ARRAY_JOBS with 'bgn-end'

        #  On Slurm, append a '%N' limit to the array specification.  The first limit
        #  that is both applicable to this job name and defined is the one used.

        if (lc(getGlobal('gridEngine')) eq 'slurm' && $end > 1) {
            if ($name =~ m/^cormhap_/i && defined getGlobal('slurmCormhapCoreLimit')) {
                $opt .= '%' . int(getGlobal('slurmCormhapCoreLimit') / $thr);
            }
            elsif ($name =~ m/^ovb_/i && defined getGlobal('slurmOvbCoreLimit')) {
                $opt .= '%' . getGlobal('slurmOvbCoreLimit');
            }
            elsif ($name =~ m/^ovs_/i && defined getGlobal('slurmOvsCoreLimit')) {
                $opt .= '%' . getGlobal('slurmOvsCoreLimit');
            }
            elsif ($name =~ m/^red_/i && defined getGlobal('slurmRedCoreLimit')) {
                $opt .= '%' . int(getGlobal('slurmRedCoreLimit') / $thr);
            }
            elsif (defined getGlobal('slurmArrayTaskLimit')) {
                $opt .= '%' . getGlobal('slurmArrayTaskLimit');
            }
            elsif (defined getGlobal('slurmArrayCoreLimit')) {
                $opt .= '%' . int(getGlobal('slurmArrayCoreLimit') / $thr);
            }
        }
    }

    return($opt, $off);
}


#  Returns the name of the output file for task $tid of $script, with the task id
#  zero-padded to six digits.  The file goes in 'logs/' when "$path/logs" exists and
#  $script contains 'scripts/'; otherwise it sits next to the script.
#
sub buildOutputName ($$$) {
    my $path   = shift @_;
    my $script = shift @_;
    my $tid    = substr("000000" . (shift @_), -6);
    my $o;

    #  When this function is called, canu.pl is running in the assembly directory.
    #  But, when the script is executed, it is rooted in '$path'.  To get the
    #  'logs' working, we need to check if the directory relative to the assembly root exists,
    #  but set it relative to $path (which is also where $script is relative to).

    $o = "$script.$tid.out";
    $o = "logs/$1.$tid.out"   if ((-e "$path/logs") && ($script =~ m/scripts\/(.*)/));

    return($o);
}


#  Returns the grid option that sends a job's output to a per-task file, built from
#  the globals 'gridEngineOutputOption' and 'gridEngineArraySubmitID'.  Returns undef
#  if either of those is undefined.  The file is named as in buildOutputName().
#
sub buildOutputOption ($$) {
    my $path   = shift @_;
    my $script = shift @_;
    my $tid    = getGlobal("gridEngineArraySubmitID");
    my $opt    = getGlobal("gridEngineOutputOption");

    if (defined($tid) && defined($opt)) {
        my $o;

        $o = "$script.$tid.out";
        $o = "logs/$1.$tid.out"   if ((-e "$path/logs") && ($script =~ m/scripts\/(.*)/));

        return("$opt $o");
    }

    return(undef);
}


#  Returns the global 'gridEngineStageOption' with DISK_SPACE replaced by $d, but only
#  for job types cor, cormhap, obtmhap and utgmhap.  Returns undef for any other type.
#
sub buildStageOption ($$) {
    my $t = shift @_;
    my $d = shift @_;
    my $r;

    if ($t eq "cor" || $t eq "cormhap" || $t eq "obtmhap" || $t eq "utgmhap") {
        $r =  getGlobal("gridEngineStageOption");
        $r =~ s/DISK_SPACE/${d}/g;
    }

    return($r);
}


#  Returns the global 'gridEngineResourceOption' with MEMORY and THREADS filled in.
#
#  Parameters:
#    $m - memory for the job; treated as gigabytes.
#    $t - number of threads for the job.
#
sub buildResourceOption ($$) {
    my $m = shift @_;
    my $t = shift @_;
    my $u = "g";

    #  Increase memory slightly if this is a retry.

    if (getGlobal("canuIteration") > 0) {
        $m *= 1.25 ** (getGlobal("canuIteration")-1);
    }

    #  Massage the memory requested into a format the grid is happy with.

    if (getGlobal("gridEngineMemoryPerJob") != "1") {    #  If anything but "1", divide the memory request
        $m /= $t;                                        #  by the number of slots we request.  Default behavior
    }                                                    #  for SGE and Slurm when mem-per-cpu is used.

    #  But then reset for LSF, because LSF wants to enforce the units used.  The value
    #  is converted to those units and rounded to the nearest integer.

    if (uc(getGlobal("gridEngine")) eq "LSF") {
        $m = int($m / 1024 + 0.5)          if (getGlobal("gridEngineMemoryUnits") =~ m/t/i);
        $m = int($m * 1 + 0.5)             if (getGlobal("gridEngineMemoryUnits") =~ m/g/i);
        $m = int($m * 1024 + 0.5)          if (getGlobal("gridEngineMemoryUnits") =~ m/m/i);
        $m = int($m * 1024 * 1024 + 0.5)   if (getGlobal("gridEngineMemoryUnits") =~ m/k/i);
        $u = "";
    }

    if (uc(getGlobal("gridEngine")) eq "DNANEXUS") {
        $m = canu::Grid_DNANexus::getDNANexusInstance($m, $t);
        $u = "";
    }

    if (($u eq "g") &&           #  If we're not an integral number of gigabytes,
        (int($m) != $m)) {       #  switch over to megabytes.
        $m = int($m * 1024);     #  But only if we're still gigabytes!
        $u = "m";                #  In particular, both LSF and DNANEXUS set units to "".
    }

    #  Replace MEMORY and THREADS with actual values.

    my $r = getGlobal("gridEngineResourceOption");

    $r =~ s/MEMORY/${m}${u}/g;
    $r =~ s/THREADS/$t/g;

    return($r);
}


#  Remove the submit scripts "$path/$script.jobSubmit-NN.sh", starting at index 01 and
#  stopping at the first index that does not exist.
#
sub purgeGridJobSubmitScripts ($$) {
    my $path    = shift @_;
    my $script  = shift @_;
    my $idx     = "01";

    while (-e "$path/$script.jobSubmit-$idx.sh") {
        unlink "$path/$script.jobSubmit-$idx.sh";
        $idx++;
    }
}


#  Write an executable shell script that submits one (array) job to the grid.
#
#  Parameters:
#    $asm     - assembly name.
#    $jobType - job type; selects the 'gridOptions<jobType>' global and the name prefix.
#    $path    - directory holding $script.
#    $script  - script to run, without the '.sh'.
#    $mem     - memory for the job.
#    $thr     - threads for the job.
#    $dsk     - stage space for the job.
#    $bgnJob  - first job index, or a 'bgn-end' range.
#    $endJob  - last job index; defaults to $bgnJob.
#
#  Returns the submit script's name without the '.sh' (relative to $path), and the job
#  name it will be submitted with.
#
sub buildGridJob ($$$$$$$$$) {
    my $asm     = shift @_;
    my $jobType = shift @_;
    my $path    = shift @_;
    my $script  = shift @_;
    my $mem     = shift @_;
    my $thr     = shift @_;
    my $dsk     = shift @_;
    my $bgnJob  = shift @_;
    my $endJob  = shift @_;

    #  Unpack the job range if needed.

    if ($bgnJob =~ m/^(\d+)-(\d+)$/) {
        $bgnJob = $1;
        $endJob = $2;
    }

    if (!defined($endJob)) {
        $endJob = $bgnJob;
    }

    #  Figure out the command and options needed to run the job.

    my $submitCommand          = getGlobal("gridEngineSubmitCommand");
    my $nameOption             = getGlobal("gridEngineNameOption");

    my $jobNameT               = makeUniqueJobName($jobType, $asm);

    my ($jobName,  $jobOff)    = buildGridArray($jobNameT, $bgnJob, $endJob, getGlobal("gridEngineArrayName"));
    my ($arrayOpt, $arrayOff)  = buildGridArray($jobNameT, $bgnJob, $endJob, getGlobal("gridEngineArrayOption"), $thr);

    my $outputOption           = buildOutputOption($path, $script);

    my $stageOption            = buildStageOption($jobType, $dsk);
    my $resOption              = buildResourceOption($mem, $thr);
    my $globalOptions          = getGlobal("gridOptions");
    my $jobOptions             = getGlobal("gridOptions$jobType");

    my $opts;

    $opts  = "$stageOption "    if (defined($stageOption));
    $opts .= "$resOption "      if (defined($resOption));
    $opts .= "$globalOptions "  if (defined($globalOptions));
    $opts .= "$jobOptions "     if (defined($jobOptions));
    $opts .= "$outputOption "   if (defined($outputOption));
    $opts =~ s/\s+$//;

    #  Find a unique file name to save the command.

    my $idx = "01";

    while (-e "$path/$script.jobSubmit-$idx.sh") {
        $idx++;
    }

    #  Build and save the command line.  Return the command PREFIX (we'll be adding .sh and .out as
    #  appropriate), and the job name it will be submitted with (which isn't expected to be used).

    my $submitScript = "$path/$script.jobSubmit-$idx.sh";

    open(my $fh, ">", $submitScript)  or  caFailure("failed to open '$submitScript' for writing", undef);
    print $fh "#!/bin/sh\n";
    print $fh "\n";
    print $fh "$submitCommand \\\n";
    print $fh " $opts \\\n"   if (defined($opts));
    print $fh " $nameOption \"$jobName\" \\\n";
    print $fh " $arrayOpt \\\n";

    if (uc(getGlobal("gridEngine")) eq "PBSPRO") {    #  PBSpro needs '--' to tell it to
        print $fh " -- ";                             #  stop parsing the command line.
    }

    #  DNAnexus wants job parameters via options to the submit command;
    #  everyone else wants the script itself.

    if (uc(getGlobal("gridEngine")) eq "DNANEXUS") {
        print $fh " -ioutput_folder:string=\"" . getGlobal("objectStoreNamespace") . "\" \\\n";
        print $fh " -iscript_path:string=\"$path\" \\\n";
        print $fh " -iscript_name:string=\"$script.sh\" \\\n";
        print $fh " fetch_and_run \\\n";
    } else {
        print $fh " `pwd`/$script.sh $arrayOff \\\n";
    }

    print $fh "> ./$script.jobSubmit-$idx.out 2>&1\n";
    close($fh);

    makeExecutable($submitScript);

    return("$script.jobSubmit-$idx", $jobName);
}




#  Convert @jobs to a list of ranges, 1-4, 5, 10-20, etc.  These will be directly submitted to the
#  grid, or run one-by-one locally.
#
#  If we're SGE, we can combine everything to one job range:  1-4, 5, 10-20.  Except that
#  buildGridJob() doesn't know how to handle that.
#
#  Each input is either a job id or a 'bgn-end' range; leading zeros are ignored.
#  Anything else is a fatal error.  An empty input list gives an empty result.

sub convertToJobRange (@) {
    my @jobs;

    #  Expand the ranges into a simple list of job ids.

    foreach my $j (@_) {
        if        ($j =~ m/^0*(\d+)-0*(\d+)$/) {
            my ($first, $last) = ($1, $2);

            for (my $id=$first; $id<=$last; $id++) {
                push @jobs, $id;
            }

        } elsif ($j =~ m/^0*(\d+)$/) {
            push @jobs, $1;

        } else {
            caFailure("invalid job format in '$j'", undef);
        }
    }

    #  With no jobs there are no ranges; don't manufacture an empty one.

    return()   if (scalar(@jobs) == 0);

    #  Sort.

    my @jobsA = sort { $a <=> $b } @jobs;

    undef @jobs;

    #  Merge adjacent ids into a range.

    my $st = $jobsA[0];
    my $ed = $jobsA[0];

    shift @jobsA;

    foreach my $j (@jobsA) {
        if ($ed + 1 == $j) {
            $ed = $j;
        } else {
            push @jobs, ($st == $ed) ? "$st" : "$st-$ed";
            $st = $j;
            $ed = $j;
        }
    }

    push @jobs, ($st == $ed) ? "$st" : "$st-$ed";


    #  In some grids (SGE) this is the maximum size of an array job.
    #  In some grids (Slurm) this is the maximum index of an array job.
    #
    #  So, here, we make blocks that have at most that many jobs.  When we submit the job, we'll
    #  offset the indices to be 1..Max.

    my $maxSpan = getGlobal("gridEngineArrayMaxJobs") - 1;

    if ($maxSpan >= 0) {
        @jobsA = @jobs;
        undef @jobs;

        foreach my $j (@jobsA) {
            if ($j =~ m/^0*(\d+)-0*(\d+)$/) {
                my $blockBgn = $1;
                my $rangeEnd = $2;

                while ($blockBgn <= $rangeEnd) {
                    my $blockEnd = ($blockBgn + $maxSpan < $rangeEnd) ? ($blockBgn + $maxSpan) : $rangeEnd;
                    push @jobs, "$blockBgn-$blockEnd";
                    $blockBgn += $maxSpan + 1;
                }
            } else {
                push @jobs, $j
            }
        }

        undef @jobsA;
    }

    return(@jobs);
}



#  Returns the number of jobs described by a list in which each element is either a
#  'bgn-end' range or stands for a single job.
#
sub countJobsInRange (@) {
    my @jobs  = @_;
    my $nJobs = 0;

    foreach my $j (@jobs) {
        if ($j =~ m/^(\d+)-(\d+)$/) {
            $nJobs += $2 - $1 + 1;
        } else {
            $nJobs++;
        }
    }

    return($nJobs);
}



#  Expects
#    job type ("ovl", etc)
#    output directory
#    script name with no directory or .sh
#    number of jobs in the task
#
#  If under grid control, submit grid jobs.  Otherwise, run in parallel locally.
#
sub submitOrRunParallelJob ($$$$@) {
    my $asm          = shift @_;  #  Name of the assembly

    my $jobType      = shift @_;  #  E.g., ovl, cns, ... - populates 'gridOptionsXXX
                                  #    - also becomes the grid job name prefix, so three letters suggested

    my $path         = shift @_;  #  Location of script to run
    my $script       = shift @_;  #  Runs $path/$script.sh > $path/$script.######.out

    my $mem          = getGlobal("${jobType}Memory");
    my $thr          = getGlobal("${jobType}Threads");
    my $dsk          = getGlobal("${jobType}StageSpace");

    my @jobs         = convertToJobRange(@_);
    my $nJobs        = countJobsInRange(@jobs);

    my $runDirectly  = 0;

    #  The script MUST be executable.

    makeExecutable("$path/$script.sh");

    #  If the job can fit in the task running the executive, run it right here.
1204 1205 if (($nJobs * $mem + 0.5 <= getGlobal("executiveMemory")) && 1206 ($nJobs * $thr <= getGlobal("executiveThreads"))) { 1207 $runDirectly = 1; 1208 } 1209 1210 # Report what we're doing. 1211 1212 #my $t = localtime(); 1213 #print STDERR "----------------------------------------GRIDSTART $t\n"; 1214 #print STDERR "$path/$script.sh with $mem gigabytes memory and $thr threads.\n"; 1215 1216 # Break infinite loops. If the grid jobs keep failing, give up after a few attempts. 1217 # 1218 # submitScript() passes canuIteration on to the next call. 1219 # canuIteration is reset to zero if the Check() for any parallel step succeeds. 1220 # 1221 # Assuming grid jobs die on each attempt: 1222 # 0) canu run from the command line submits iteration 1; canuIteration is NOT incremented 1223 # because no parallel jobs have been submitted. 1224 # 1) Iteration 1 - canu.pl submits jobs, increments the iteration count, and submits itself as iteration 2 1225 # 2) Iteration 2 - canu.pl submits jobs, increments the iteration count, and submits itself as iteration 3 1226 # 3) Iteration 3 - canu.pl fails with the error below 1227 # 1228 # If the jobs succeed in Iteration 2, the canu in iteration 3 will pass the Check(), never call 1229 # this function, and continue the pipeline. 1230 1231 my $iter = getGlobal("canuIteration"); 1232 my $max = getGlobal("canuIterationMax"); 1233 1234 if ($iter >= $max) { 1235 caExit("canu iteration count too high, stopping pipeline (most likely a problem in the grid-based computes)", undef); 1236 } elsif ($iter == 0) { 1237 $iter = "First"; 1238 } elsif ($iter == 1) { 1239 $iter = "Second"; 1240 } elsif ($iter == 2) { 1241 $iter = "Third"; 1242 } elsif ($iter == 3) { 1243 $iter = "Fourth"; 1244 } elsif ($iter == 4) { 1245 $iter = "Fifth"; 1246 } else { 1247 $iter = "${iter}th"; 1248 } 1249 1250 print STDERR "--\n"; 1251 print STDERR "-- Running jobs. 
$iter attempt out of $max.\n"; 1252 1253 setGlobal("canuIteration", getGlobal("canuIteration") + 1); 1254 1255 # If 'gridEngineJobID' environment variable exists (SGE: JOB_ID; LSF: LSB_JOBID) then we are 1256 # currently running under grid crontrol. If so, run the grid command to submit more jobs, then 1257 # submit ourself back to the grid. If not, tell the user to run the grid command by hand. 1258 1259 # Jobs under grid control, and we submit them 1260 1261 if (defined(getGlobal("gridEngine")) && 1262 (getGlobal("useGrid") eq "1") && 1263 (getGlobal("useGrid$jobType") eq "1") && 1264 (exists($ENV{getGlobal("gridEngineJobID")})) && 1265 ($runDirectly == 0)) { 1266 my @jobsSubmitted; 1267 1268 print STDERR "--\n"; 1269 1270 purgeGridJobSubmitScripts($path, $script); 1271 1272 if (getGlobal("showNext")) { 1273 print STDERR "--\n"; 1274 print STDERR "-- NEXT COMMANDS\n"; 1275 print STDERR "--\n"; 1276 print STDERR "\n"; 1277 print STDERR prettifyCommand("cd $path"), "\n"; 1278 foreach my $j (@jobs) { 1279 my ($cmd, $jobName) = buildGridJob($asm, $jobType, $path, $script, $mem, $thr, $dsk, $j, undef); 1280 1281 print STDERR prettifyCommand("./$cmd.sh") . "\n"; 1282 } 1283 exit(0); 1284 } 1285 1286 foreach my $j (@jobs) { 1287 my ($cmd, $jobName) = buildGridJob($asm, $jobType, $path, $script, $mem, $thr, $dsk, $j, undef); 1288 1289 if (runCommandSilently($path, "./$cmd.sh", 0)) { 1290 print STDERR "-- Failed to submit compute jobs. Delay 10 seconds and try again.\n"; 1291 sleep(10); 1292 1293 runCommandSilently($path, "./$cmd.sh", 0) and caFailure("Failed to submit compute jobs", "$path/$cmd.out"); 1294 } 1295 1296 # Parse the stdout/stderr from the submit command to find the id of the job 1297 # we just submitted. We'll use this to hold the next iteration until all these 1298 # jobs have completed. 
1299 1300 open(F, "< $path/$cmd.out"); 1301 while (<F>) { 1302 chomp; 1303 1304 if (uc(getGlobal("gridEngine")) eq "SGE") { 1305 # Your job 148364 ("canu_asm") has been submitted 1306 if (m/Your\sjob\s(\d+)\s/) { 1307 $jobName = $1; 1308 } 1309 # Your job-array 148678.1500-1534:1 ("canu_asm") has been submitted 1310 if (m/Your\sjob-array\s(\d+).\d+-\d+:\d\s/) { 1311 $jobName = $1; 1312 } 1313 } 1314 1315 if (uc(getGlobal("gridEngine")) eq "LSF") { 1316 # Job <759810> is submitted to queue <14>. 1317 if (m/Job\s<(\d+)>\sis/) { 1318 $jobName = "ended($1)"; 1319 } 1320 } 1321 1322 if (uc(getGlobal("gridEngine")) eq "PBS") { 1323 # 123456.qm2 1324 $jobName = $_; 1325 } 1326 1327 if (uc(getGlobal("gridEngine")) eq "PBSPRO") { 1328 # ?? 1329 $jobName = $_; 1330 } 1331 1332 if (uc(getGlobal("gridEngine")) eq "SLURM") { 1333 # BPW has seen Slurm report "ERROR" instead of something 1334 # useful here. If that is seen, report the error to the 1335 # screen and ignore this job. We'll redo it on the next 1336 # iteration (unless this is the second iteration, then 1337 # we're screwed either way). 1338 if (m/Submitted\sbatch\sjob\s(\d+)/) { 1339 $jobName = $1; 1340 } elsif (m/ERROR/) { 1341 $jobName = undef; 1342 } else { 1343 $jobName = $_; 1344 } 1345 } 1346 1347 if (uc(getGlobal("gridEngine")) eq "DNANEXUS") { 1348 $jobName = $_; 1349 } 1350 } 1351 close(F); 1352 1353 if (!defined($jobName)) { 1354 print STDERR "-- '$cmd.sh' -> returned an error; job not submitted.\n"; 1355 } elsif ($j =~ m/^\d+$/) { 1356 print STDERR "-- '$cmd.sh' -> job $jobName task $j.\n"; 1357 } else { 1358 print STDERR "-- '$cmd.sh' -> job $jobName tasks $j.\n"; 1359 } 1360 1361 if (defined($jobName)) { 1362 push @jobsSubmitted, $jobName; 1363 } 1364 } 1365 1366 print STDERR "--\n"; 1367 1368 # All jobs submitted. Make an option to hold the executive on those jobs. 1369 1370 my $jobHold; 1371 1372 if (uc(getGlobal("gridEngine")) eq "SGE") { 1373 $jobHold = "-hold_jid " . 
join ",", @jobsSubmitted; 1374 } 1375 1376 if (uc(getGlobal("gridEngine")) eq "LSF") { 1377 $jobHold = "-w \"" . (join "&&", @jobsSubmitted) . "\""; 1378 } 1379 1380 if (uc(getGlobal("gridEngine")) eq "PBS") { 1381 # new PBS versions dont have 1-task arrays like PBSPro but still have afteranyarray (which doesn't work on a not-array task) 1382 # so we need to check if we are waiting for a regular job or array 1383 my $holdType = (join ":", @jobsSubmitted) =~ m/^(\d+)\[(.*)\]/ ? "afteranyarray" : "afterany"; 1384 $jobHold = "-W depend=$holdType:" . join ":", @jobsSubmitted; 1385 } 1386 1387 if (uc(getGlobal("gridEngine")) eq "PBSPRO") { 1388 $jobHold = "-W depend=afterany:" . join ":", @jobsSubmitted; 1389 } 1390 1391 if (uc(getGlobal("gridEngine")) eq "SLURM") { 1392 $jobHold = "--depend=afterany:" . join ":", @jobsSubmitted; 1393 } 1394 1395 if (uc(getGlobal("gridEngine")) eq "DNANEXUS") { 1396 $jobHold = "--depends-on " . join " ", @jobsSubmitted; 1397 } 1398 1399 submitScript($asm, $jobHold); 1400 1401 # submitScript() should never return. If it does, then a parallel step was attempted too many time. 1402 1403 caExit("Too many attempts to run a parallel stage on the grid. Stop.", undef); 1404 } 1405 1406 # Jobs under grid control, but the user must submit them 1407 1408 if (defined(getGlobal("gridEngine")) && 1409 (getGlobal("useGrid") ne "0") && 1410 (getGlobal("useGrid$jobType") eq "1") && 1411 (! exists($ENV{getGlobal("gridEngineJobID")})) && 1412 ($runDirectly == 0)) { 1413 my $cwd = getcwd(); 1414 my $s = (scalar(@jobs) == 1) ? 
"" : "s"; 1415 1416 print STDERR "\n"; 1417 print STDERR "Please run the following command$s to submit tasks to the grid for execution.\n"; 1418 print STDERR "Each task will use $mem gigabytes memory and $thr threads.\n"; 1419 print STDERR "\n"; 1420 print STDERR " cd $cwd/$path\n"; 1421 1422 purgeGridJobSubmitScripts($path, $script); 1423 1424 foreach my $j (@jobs) { 1425 my ($cmd, $jobName) = buildGridJob($asm, $jobType, $path, $script, $mem, $thr, $dsk, $j, undef); 1426 1427 print " ./$cmd.sh\n"; 1428 } 1429 1430 print STDERR "\n"; 1431 print STDERR "When all tasks are finished, restart canu as before. The output of the grid\n"; 1432 print STDERR "submit command$s will be in *jobSubmit*out.\n"; 1433 print STDERR "\n"; 1434 1435 exit(0); 1436 } 1437 1438 # Standard jobs, run locally. 1439 1440 foreach my $j (@jobs) { 1441 my $st; 1442 my $ed; 1443 1444 if ($j =~ m/^(\d+)-(\d+)$/) { 1445 $st = $1; 1446 $ed = $2; 1447 } else { 1448 $st = $ed = $j; 1449 } 1450 1451 if (getGlobal("showNext")) { 1452 print STDERR "--\n"; 1453 print STDERR "-- NEXT COMMANDS\n"; 1454 print STDERR "--\n"; 1455 print STDERR "\n"; 1456 print STDERR prettifyCommand("cd $path") . "\n"; 1457 for (my $i=$st; $i<=$ed; $i++) { 1458 print STDERR prettifyCommand("./$script.sh $i") . "\n"; 1459 } 1460 exit(0); 1461 } 1462 1463 for (my $i=$st; $i<=$ed; $i++) { 1464 schedulerSubmit("./$script.sh $i > ./" . buildOutputName($path, $script, $i) . 
" 2>&1"); 1465 } 1466 } 1467 1468 # compute limit based on # of cpus 1469 my $nCParallel = getGlobal("${jobType}Concurrency"); 1470 $nCParallel = int(getGlobal("maxThreads") / $thr) if ((!defined($nCParallel)) || ($nCParallel == 0)); 1471 $nCParallel = 1 if ((!defined($nCParallel)) || ($nCParallel == 0)); 1472 1473 # compute limit based on physical memory 1474 my $nMParallel = getGlobal("${jobType}Concurrency"); 1475 $nMParallel = int(getGlobal("maxMemory") / getGlobal("${jobType}Memory")) if ((!defined($nMParallel)) || ($nMParallel == 0)); 1476 $nMParallel = 1 if ((!defined($nMParallel)) || ($nMParallel == 0)); 1477 1478 # run min of our limits 1479 my $nParallel = $nCParallel < $nMParallel ? $nCParallel : $nMParallel; 1480 1481 schedulerSetNumberOfProcesses($nParallel); 1482 schedulerFinish($path, $jobType); 1483} 1484 1485 1486 1487 1488# Pretty-ify the command. If there are no newlines already in it, break 1489# before every switch and before file redirects. 1490 1491sub prettifyCommand ($) { 1492 my $dis = shift @_; 1493 1494 if (($dis =~ tr/\n/\n/) == 0) { 1495 $dis =~ s/\s-/ \\\n -/g; # Replace ' -' with '\n -' (newline, two spaces, then the dash) 1496 $dis =~ s/\s>\s/ \\\n> /; # Replace ' > ' with '\n> ' 1497 $dis =~ s/\s2>\s/ \\\n2> /; # Replace ' 2> ' with '\n2> ' 1498 } 1499 1500 $dis = " " . $dis; # Indent the command by four spaces. 1501 $dis =~ s/\n/\n /g; 1502 1503 return($dis); 1504} 1505 1506 1507sub reportRunError ($) { 1508 my $rc = shift @_; 1509 1510 # Bunch of busy work to get the names of signals. Is it really worth it?! 1511 1512 my @signame; 1513 if (defined($Config{sig_name})) { 1514 my $i = 0; 1515 foreach my $n (split('\s+', $Config{sig_name})) { 1516 $signame[$i] = $n; 1517 $i++; 1518 } 1519 } else { 1520 for (my $i=0; $i<127; $i++) { 1521 $signame[$i] = "signal $i"; 1522 } 1523 } 1524 1525 # The rest is rather straightforward at least. 
1526 1527 print STDERR "\n"; 1528 print STDERR "ERROR:\n"; 1529 1530 if ($rc == -1) { 1531 print STDERR "ERROR: Failed to run the command. (rc=$rc)\n"; 1532 } elsif ($rc & 127) { 1533 print STDERR "ERROR: Failed with signal $signame[$rc & 127]. (rc=$rc)\n"; 1534 } else { 1535 print STDERR "ERROR: Failed with exit code ", $rc >> 8 , ". (rc=$rc)\n"; 1536 } 1537 1538 print STDERR "ERROR:\n"; 1539} 1540 1541 1542# Utility to run a command and check the exit status, report time used. 1543# 1544sub runCommand ($$) { 1545 my $dir = shift @_; 1546 my $cmd = shift @_; 1547 my $dis = prettifyCommand($cmd); 1548 1549 return(0) if ($cmd eq ""); 1550 1551 # Check if the directory exists. 1552 1553 if (! -d $dir) { 1554 caFailure("Directory '$dir' doesn't exist, can't run command", ""); 1555 } 1556 1557 # If only showing the next command, show it and stop. 1558 1559 if (getGlobal("showNext")) { 1560 print STDERR "--\n"; 1561 print STDERR "-- NEXT COMMAND\n"; 1562 print STDERR "--\n"; 1563 print STDERR "\n"; 1564 print STDERR prettifyCommand("cd $dir") . "\n"; 1565 print STDERR "$dis\n"; 1566 exit(0); 1567 } 1568 1569 # Log that we're starting, and show the pretty-ified command. 1570 1571 my $cwd = getcwd(); # Remember where we are. 1572 chdir($dir); # So we can root the jobs in the correct location. 1573 1574 my $startsecs = time(); 1575 my $diskfree = diskSpace("."); 1576 1577 print STDERR "----------------------------------------\n"; 1578 print STDERR "-- Starting command on ", scalar(localtime()), " with $diskfree GB free disk space\n"; 1579 print STDERR "\n"; 1580 print STDERR " cd $dir\n"; 1581 print STDERR "$dis\n"; 1582 1583 my $rc = 0xffff & system($cmd); 1584 1585 logFinished(".", $startsecs); 1586 1587 chdir($cwd); 1588 1589 # Pretty much copied from Programming Perl page 230 1590 1591 return(0) if ($rc == 0); 1592 1593 reportRunError($rc); 1594 1595 return(1); 1596} 1597 1598 1599 1600# Duplicated in Grid_Cloud.pm to get around recursive 'use' statements. 
# Run $cmd with $dir as the working directory, without logging anything on success.
#
# Returns 0 on success (or if $cmd is empty) and 1 on failure.  The failure is
# reported on STDERR only if $critical is non-zero.
#
# A failure to change into $dir is not checked; the command then runs in the
# current directory.

sub runCommandSilently ($$$) {
    my $dir      = shift @_;
    my $cmd      = shift @_;
    my $critical = shift @_;

    return(0)   if ($cmd eq "");

    my $cwd = getcwd();        # Remember where we are.
    chdir($dir);               # So we can root the jobs in the correct location.

    my $rc = 0xffff & system($cmd);

    chdir($cwd);

    return(0)   if ($rc == 0);         # No errors, return no error.
    return(1)   if ($critical == 0);   # If not critical, return that it failed, otherwise, report error and fail.

    # Only now is the pretty-ified command needed.

    print STDERR prettifyCommand($cmd), "\n";

    reportRunError($rc);

    return(1);
}



# Search the directories of the PATH (as returned by File::Spec->path) for an
# executable file named $cmd.  Returns the first match as 'directory/cmd', or
# undef if there is none.  Directories are never returned, even though they
# usually pass a plain -x test.

sub findCommand ($) {
    my $cmd = shift @_;

    foreach my $dir (File::Spec->path) {
        my $candidate = "$dir/$cmd";

        return($candidate)   if ((-f $candidate) && (-x _));
    }

    return(undef);
}



# Return the path of the executable $exec, or undef if it can't be found.
#
# A name containing a '/' is tested as given; anything else is searched for
# in the PATH with findCommand().  No shell or external command is involved,
# so $exec can safely contain quotes or other shell metacharacters.

sub findExecutable ($) {
    my $exec = shift @_;

    return(undef)   if ((!defined($exec)) || ($exec eq ""));

    if ($exec =~ m!/!) {
        return(((-f $exec) && (-x _)) ? $exec : undef);
    }

    return(findCommand($exec));
}


# Copy the last 50 lines of $log to STDERR, prefixing every line with "$tag: ".
# 'tail' is run directly, not through a shell, so $log is never interpreted
# by one.  If 'tail' can't be started only the header and footer are shown.

sub caShowLogTail {
    my ($tag, $log) = @_;

    print STDERR "$tag: Last 50 lines of the relevant log file ($log):\n";
    print STDERR "$tag:\n";

    if (open(my $tail, "-|", "tail", "-n", "50", "--", $log)) {
        while (my $line = <$tail>) {
            print STDERR "$tag: $line";
        }
        close($tail);
    }

    print STDERR "$tag:\n";
}


# Use caExit() for transient errors, like not opening files, processes that die, etc.
#
# Prints $msg (if any), the free disk space and the tail of $log (if $log is an
# existing file) to STDERR, runs the 'onFailure' command (if set), then exits
# with status 1.  Never returns.

sub caExit ($$) {
    my $asm     = getGlobal("onExitNam");
    my $msg     = shift @_;
    my $log     = shift @_;
    my $version = getGlobal("version");

    $msg = undef   if ($msg eq "");
    $log = undef   if ($log eq "");

    print STDERR "\n";
    print STDERR "ABORT:\n";
    print STDERR "ABORT: $version\n";
    print STDERR "ABORT: Don't panic, but a mostly harmless error occurred and Canu stopped.\n";
    print STDERR "ABORT: Try restarting. If that doesn't work, ask for help.\n";
    print STDERR "ABORT:\n";
    print STDERR "ABORT: $msg.\n"   if (defined($msg));
    print STDERR "ABORT:\n"         if (defined($msg));

    if (defined($log) && (-e $log)) {
        my $df = diskSpace($log);

        print STDERR "ABORT: Disk space available: $df GB\n";
        print STDERR "ABORT:\n";

        caShowLogTail("ABORT", $log);
    }

    my $fail = getGlobal('onFailure');
    if (defined($fail)) {
        runCommandSilently(getGlobal("onExitDir"), "$fail $asm", 0);
    }

    exit(1);
}


# Use caFailure() for errors that definitely will require code changes to fix.
#
# Prints $msg, a stack trace and the tail of $log (if $log is an existing file)
# to STDERR, runs the 'onFailure' command (if set), then exits with status 1.
# Never returns.

sub caFailure ($$) {
    my $asm     = getGlobal("onExitNam");
    my $msg     = shift @_;
    my $log     = shift @_;
    my $version = getGlobal("version");
    my $trace   = longmess("Failed");

    $trace =~ s/\n/\nCRASH: /g;

    print STDERR "\n";
    print STDERR "CRASH:\n";
    print STDERR "CRASH: $version\n";
    print STDERR "CRASH: Please panic, this is abnormal.\n";
    print STDERR "CRASH:\n";
    print STDERR "CRASH: $msg.\n";
    print STDERR "CRASH:\n";
    print STDERR "CRASH: $trace\n";
    #print STDERR "CRASH:\n";   # $trace has an extra CRASH: at the end

    if (defined($log) && ($log ne "") && (-e $log)) {
        caShowLogTail("CRASH", $log);
    } else {
        print STDERR "CRASH: No log file supplied.\n";
        print STDERR "CRASH:\n";
    }

    my $fail = getGlobal('onFailure');
    if (defined($fail)) {
        runCommandSilently(getGlobal("onExitDir"), "$fail $asm", 0);
    }

    exit(1);
}


1;
