1
2###############################################################################
3 #
4 #  This file is part of canu, a software program that assembles whole-genome
5 #  sequencing reads into contigs.
6 #
7 #  This software is based on:
8 #    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 #    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 #
11 #  Except as indicated otherwise, this is a 'United States Government Work',
12 #  and is released in the public domain.
13 #
14 #  File 'README.licenses' in the root directory of this distribution
15 #  contains full conditions and disclaimers.
16 ##
17
18package canu::Execution;
19
20require Exporter;
21
22@ISA    = qw(Exporter);
23@EXPORT = qw(stopAfter
24             resetIteration
25             touch
26             makeExecutable
27             getJobIDShellCode
28             getLimitShellCode
29             getBinDirectory
30             getBinDirectoryShellCode
31             setWorkDirectory
32             setWorkDirectoryShellCode
33             submitScript
34             submitOrRunParallelJob
35             runCommand
36             runCommandSilently
37             findCommand
38             findExecutable
39             caExit
40             caFailure);
41
42use strict;
43use warnings "all";
44no  warnings "uninitialized";
45
46use Config;            #  for @signame
47use Cwd qw(getcwd);
48use Carp qw(longmess);
49
50use POSIX ":sys_wait_h";  #  For waitpid(..., &WNOHANG)
51use File::Basename;
52use List::Util qw(min max);
53use File::Path 2.08 qw(make_path remove_tree);
54use File::Spec;
55
56use canu::Defaults;
57
58
59
60
61
#  Log that we've finished a task.
#
#  Writes a short report to STDERR: the current time, how long the task ran,
#  and the free disk space of the current directory as reported by
#  diskSpace() (printed as GB), followed by a warning when that is below 10.
#
#    dir       - accepted, but not used; space is always measured in ".".
#    startsecs - the value of time() when the task started.
#
sub logFinished ($$) {
    my $dir       = shift @_;
    my $startsecs = shift @_;

    my $diskfree  = diskSpace(".");
    my $elapsed   = time() - $startsecs;

    #  Assigned with a ternary instead of 'my $warning = ... if (...)'.  The
    #  value of a 'my' variable declared under a statement modifier is
    #  undefined whenever the condition is false.

    my $warning   = ($diskfree < 10) ? "  !!! WARNING !!!" : "";

    my @fast = ("lickety-split",
                "fast as lightning",
                "furiously fast",
                "like a bat out of hell",
                "in the blink of an eye");

    my @slow = ("fashionably late",
                "better late than never",
                "like watching paint dry",
                "at least I didn't crash",
                "it'll be worth it in the end",
                "no bitcoins found either");

    my $rf = int(rand(scalar(@fast)));
    my $rs = int(rand(scalar(@slow)));
    my $rp = int(rand(100));

    #  Describe the elapsed time; anything under a second gets a random quip.

    my $message;

    $message  = "$elapsed seconds" if ($elapsed  > 1);
    $message  = "one second"       if ($elapsed == 1);
    $message  = $fast[$rf]         if ($elapsed  < 1);

    #  The longer the task ran, the more likely we are to complain about it:
    #  1% of the time above 1000 seconds, 50% above 10000, always above a day.

    $message .= ", " . $slow[$rs]  if ((($elapsed > 1000)  && ($rp < 1)) ||
                                       (($elapsed > 10000) && ($rp < 50)) ||
                                       (($elapsed > 86400)));

    print STDERR "\n";
    print STDERR "-- Finished on ", scalar(localtime()), " ($message) with $diskfree GB free disk space$warning\n";
    print STDERR "----------------------------------------\n";
}
107
108
109
110#
111#  Functions for running multiple processes at the same time.  This is private to the module.
112#
113
#  Scheduler state, shared by the scheduler*() functions below.

my @processQueue            = ();    #  Commands submitted but not yet started
my @processesRunning        = ();    #  Process IDs of the commands started and not yet reaped
my $numberOfProcesses       = 0;     #  Number of jobs concurrently running
my $numberOfProcessesToWait = 0;     #  Number of jobs we can leave running at exit
my $printProcessCommand     = 1;     #  Show commands as they run

#  Set the number of commands the scheduler may run at the same time.
#
sub schedulerSetNumberOfProcesses {
    my ($limit) = @_;

    $numberOfProcesses = $limit;
}
123
#  Add a command to the end of the queue of commands waiting to be run.  A
#  trailing newline on the command is removed first.
#
sub schedulerSubmit ($) {
    my ($command) = @_;

    chomp $command;

    push @processQueue, $command;
}
131
#  Start a command in a child process and return the child's process ID to
#  the parent.
#
#  The command is handed to exec() as a single string.  If the fork fails with
#  EAGAIN (a recoverable error) we wait a second and try again; any other fork
#  failure is fatal.
#
#  If the exec itself fails, the child reports the failure and exits with
#  status 127.  Without that, the child would return from this function and
#  carry on executing the caller's code as a second copy of the parent.
#
sub schedulerForkProcess ($) {
    my $process = shift @_;

    while (1) {
        my $pid = fork();

        return($pid)   if ($pid);                      #  Parent.

        if (defined($pid)) {                           #  Child.
            exec($process)   or   print STDERR "Failed to exec '$process': $!\n";

            POSIX::_exit(127);                         #  Skip END blocks and destructors inherited from the parent.
        }

        #  Fork failed.  %! tests the error code itself, instead of matching
        #  the text of $!, which differs between platforms and locales.

        die "Can't fork: $!\n"   if (!$!{EAGAIN});

        sleep(1);
    }
}
155
#  Check, without blocking, if a child process has finished.  Returns 1 if
#  the process was reaped, 0 if not.
#
sub schedulerReapProcess ($) {
    my ($pid) = @_;

    my $reaped = waitpid($pid, &WNOHANG);

    return(($reaped > 0) ? 1 : 0);
}
165
#  Do one pass of scheduling: forget any running processes that have
#  finished, then start queued commands until either the queue is empty or
#  $numberOfProcesses commands are running.  Each command is printed to
#  STDERR as it is started.
#
sub schedulerRun () {

    #  Keep only the processes that could not be reaped.

    @processesRunning = grep { schedulerReapProcess($_) == 0 } @processesRunning;

    #  Fill any free slots from the front of the queue.

    while ((scalar(@processQueue) > 0) &&
           (scalar(@processesRunning) < $numberOfProcesses)) {
        my $command = shift @processQueue;

        print STDERR "    $command\n";

        push @processesRunning, schedulerForkProcess($command);
    }
}
186
#  Run every queued command, at most $numberOfProcesses at a time, and wait
#  for them to finish (except that up to $numberOfProcessesToWait may be left
#  running).
#
#    dir - directory to run the commands in; if undefined, the commands run
#          in the current directory.
#    nam - a label for this batch of commands, used only in the log.
#
#  The working directory is restored before returning.
#
sub schedulerFinish ($$) {
    my $dir = shift @_;
    my $nam = shift @_;

    my $remain    = scalar(@processQueue);
    my $startsecs = time();

    print STDERR "----------------------------------------\n";

    if (defined($dir)) {
        my $diskfree = diskSpace($dir);

        print STDERR "-- Starting '$nam' concurrent execution on ", scalar(localtime()), " with $diskfree GB free disk space ($remain processes; $numberOfProcesses concurrently)\n";
        print STDERR "\n";
        print STDERR "    cd $dir\n";
    } else {
        print STDERR "-- Starting '$nam' concurrent execution on ", scalar(localtime()), " ($remain processes; $numberOfProcesses concurrently)\n";
        print STDERR "\n";
    }

    my $cwd = getcwd();                   #  Remember where we are.

    #  Root the jobs in the correct location.  Only when a directory was
    #  supplied: chdir(undef) either fails or goes somewhere unintended.

    chdir($dir)   if (defined($dir));

    #  Run all submitted jobs.  While commands remain in the queue, block
    #  until some child finishes, drop it from the running list, and go around
    #  again to start the next one.
    #
    #  NOTE(review): if $numberOfProcesses is less than one, nothing is ever
    #  started and this loop does not terminate - confirm callers always set it.

    while ($remain > 0) {
        schedulerRun();

        $remain = scalar(@processQueue);

        if ($remain > 0) {
            my $child = waitpid(-1, 0);

            @processesRunning = grep { $_ != $child } @processesRunning;
        }
    }

    #  Wait for them to finish, if requested.

    while (scalar(@processesRunning) > $numberOfProcessesToWait) {
        waitpid(shift @processesRunning, 0);
    }

    logFinished($dir, $startsecs);

    chdir($cwd);
}
237
238
239
240#
241#  File Management
242#
243
#  Create (or truncate) a file.  If a second argument is supplied, it is
#  written to the file, followed by a newline.
#
#  The file is opened with the three-argument form of open() so that the name
#  is used exactly as given; the two-argument form strips leading and trailing
#  whitespace from the name and interprets mode characters in it.
#
#  Failure to open the file is reported through caFailure().
#
sub touch ($@) {
    my ($file, $content) = @_;

    open(my $fh, ">", $file)   or caFailure("failed to touch file '$file'", undef);

    print $fh "$content\n"     if (defined($content));

    close($fh);
}
249
250
251
#  Make a file readable and executable by everyone, and writable by its
#  owner, less whatever permissions the current umask removes.
#
sub makeExecutable ($) {
    my ($path) = @_;
    my $mode   = 0755 & ~umask();

    chmod($mode, $path);
}
257
258
259#
260#  State management
261#
262
#  Stop canu, successfully, if the stage just completed is the one named by
#  the 'stopAfter' global, or if it is the special stage 'theend'.  The stage
#  name is compared after converting upper case letters to lower case.
#
#  Before exiting, the user-supplied 'onSuccess' command, if any, is run in
#  the 'onExitDir' directory with 'onExitNam' as its argument.
#
#  Returns (nothing) if this is not the stage to stop after; otherwise does
#  not return.
#
sub stopAfter ($) {
    my $stopAfter = shift @_;

    $stopAfter =~ tr/A-Z/a-z/;

    if ($stopAfter ne "theend") {
        return   if ($stopAfter ne getGlobal("stopAfter"));

        print STDERR "--\n";
        print STDERR "--  Stop requested after '$stopAfter'.\n";
    }

    my $onSuccess = getGlobal("onSuccess");

    if (defined($onSuccess)) {
        print STDERR "--\n";
        print STDERR "--  Running user-supplied termination command.\n";

        runCommand(getGlobal("onExitDir"), $onSuccess . " " . getGlobal("onExitNam"));
    }

    print STDERR "--\n";
    print STDERR "-- Bye.\n";

    exit(0);
}
288
289
#  Set the 'canuIteration' global back to zero.  If a stage name is supplied,
#  also log that the stage has finished.
#
sub resetIteration ($) {
    my ($stage) = @_;

    if (defined($stage)) {
        print STDERR "-- Finished stage '$stage', reset canuIteration.\n";
    }

    setGlobal("canuIteration", 0);
}
297
298
299
300#  Decide what bin directory to use.
301#
302#  When we are running on the grid, the path of this perl script is NOT always the correct
303#  architecture.  If the submission host is FreeBSD, but the grid is Linux, the BSD box will submit
304#  FreeBSD/bin/canu to the grid.  Unless it knows which grid host it will run on in advance, there
305#  is no way to pick the correct one.  The grid host then has to have enough smarts to choose the
306#  correct binaries, and that is what we're doing here.
307#
#  To make matters worse, shell scripts need to do all this by themselves.
309#
310#sub getInstallDirectory () {
311#    my $installDir = ;
312#
313#    if ($installDir =~ m!^(.*)/\w+-\w+/bin$!) {
314#        $installDir = $1;
315#    }
316#
317#    return($installDir);
318#}
319
320
#  Emits a block of shell code to parse the grid task id and offset.
#  Expects zero or one argument, which is interpreted differently in grid and non-grid mode.
#    Off grid - the job to run
#    On grid  - an offset to add to SGE_TASK_ID or SLURM_ARRAY_TASK_ID to compute the job to run
#
#  PBSPro refuses to run an array job with one element.  They're submitted as a normal job.  Here,
#  we check if it is running on the grid and if the task ID (aka, array ID) isn't set.  If so, we
#  assume it is job 1.
#
#  The name of the environment variable holding the task ID comes from the 'gridEngineTaskID'
#  global.  The emitted code leaves the job to run in the shell variable 'jobid'.
#
sub getJobIDShellCode () {
    my $taskenv = getGlobal('gridEngineTaskID');
    my $taskval = "\$" . $taskenv;               #  The shell expression for the value of that variable.

    my @lines = (
        '#  Discover the job ID to run, from either a grid environment variable and a',
        '#  command line offset, or directly from the command line.',
        '#',
        "if [ x$taskval = x -o x$taskval = xundefined -o x$taskval = x0 ]; then",
        '  baseid=$1',                           #  Off grid
        '  offset=0',
        'else',
        "  baseid=$taskval",                     #  On Grid
        '  offset=$1',
        'fi',
        'if [ x$offset = x ]; then',
        '  offset=0',
        'fi',
        'if [ x$baseid = x ]; then',
        "  echo Error: I need $taskenv set, or a job index on the command line.",
        '  exit',
        'fi',
        'jobid=`expr -- $baseid + $offset`',
        'if [ x$baseid = x0 ]; then',
        "  echo Error: jobid 0 is invalid\\; I need $taskenv set, or a job index on the command line.",
        '  exit',
        'fi',
        "if [ x$taskval = x ]; then",
        '  echo Running job $jobid based on command line options.',
        'else',
        "  echo Running job \$jobid based on $taskenv=$taskval and offset=\$offset.",
        'fi',
    );

    return(join("\n", @lines) . "\n");
}
362
363
#  Emits a block of shell code to change shell imposed limit on the number of open files and
#  processes.
#
#  For each of the two limits, the emitted code raises the soft limit to the hard limit when the
#  soft limit is lower, and reports what it did.
#
sub getLimitShellCode () {
    my @limits = (["u", "processes per user"],      #  The 'ulimit' flag, and what it limits.
                  ["n", "open files"]);

    my $code;

    $code  = "echo \"\"\n";
    $code .= "echo \"Attempting to increase maximum allowed processes and open files.\"\n";

    foreach my $limit (@limits) {
        my ($flag, $what) = @$limit;

        $code .= join("\n",
                      "max=`ulimit -H$flag`",
                      "bef=`ulimit -S$flag`",
                      'if [ $bef -lt $max ] ; then',
                      "  ulimit -S$flag \$max",
                      "  aft=`ulimit -S$flag`",
                      "  echo \"  Changed max $what from \$bef to \$aft (max \$max).\"",
                      'else',
                      "  echo \"  Max $what limited to \$bef, no increase possible.\"",
                      'fi',
                      '',                           #  These two end the 'fi' line and
                      '');                          #  add a blank line after it.
    }

    $code .= "echo \"\"\n";
    $code .= "\n";

    return($code);
}
398
399
#  Used inside canu to find where binaries are located.
#
#  This is just the directory reported in $FindBin::RealBin.
#
sub getBinDirectory () {
    my $bindir = $FindBin::RealBin;

    return($bindir);
}
412
413
#  Emits a block of shell code to locate binaries during shell scripts.  See comments on
#  getBinDirectory.
#
#  The emitted code, in order:
#    - runs the 'preExec' command, if one is set;
#    - sets the shell variable 'bin' to $FindBin::RealBin;
#    - reports the perl, java and canu that will be used;
#    - exports the object store settings as CANU_OBJECT_STORE_* variables.
#
sub getBinDirectoryShellCode () {
    my $bindir = $FindBin::RealBin;
    my @lines;

    #  First, run any preExec command that might exist.

    my $preExec = getGlobal("preExec");

    if (defined($preExec)) {
        push @lines, "#  Pre-execution commands.", "", $preExec, "";
    }

    #  Then, setup and report paths.

    my $java = getGlobal("java");
    my $canu = "\$bin/" . basename($0);   #  NOTE: $bin decided at script run time

    push @lines, (
        "",
        "#  Path to Canu.",
        "",
        qq{bin="$bindir"},
        "",
        "#  Report paths.",
        "",
        q{echo ""},
        q{echo "Found perl:"},
        q{echo "  " `which perl`},
        q{echo "  " `perl --version | grep version`},
        q{echo ""},
        q{echo "Found java:"},
        qq{echo "  " `which $java`},
        qq{echo "  " `$java -showversion 2>&1 | head -n 1`},
        q{echo ""},
        q{echo "Found canu:"},
        qq{echo "  " $canu},
        qq{echo "  " `$canu -version`},
        q{echo ""},
        "",
        "",
        "#  Environment for any object storage.",
        "",
    );

    #  Each pair is the tail of the environment variable name and the global supplying its value.

    foreach my $export (["CLIENT",    "objectStoreClient"],
                        ["CLIENT_UA", "objectStoreClientUA"],
                        ["CLIENT_DA", "objectStoreClientDA"],
                        ["NAMESPACE", "objectStoreNameSpace"],
                        ["PROJECT",   "objectStoreProject"]) {
        my ($tail, $global) = @$export;

        push @lines, "export CANU_OBJECT_STORE_" . $tail . "=" . getGlobal($global);
    }

    push @lines, "", "";

    return(join("\n", @lines) . "\n");
}
469
470
471
472
473#
#  If running on a cloud system, shell scripts are started in some random location.
#  setWorkDirectory() will create the directory the script is supposed to run in (e.g.,
#  correction/0-mercounts) and move into it.  This will keep the scripts compatible with the way
#  they are run from within canu.pl.
#
#  If you're fine running in 'some random location' do nothing here.
#
#  Note that canu does minimal cleanup.
#
#    asm     - saved in the 'onExitNam' global.
#    rootdir - if defined, created if needed, then moved into.
#
#  The directory we end up in is saved in the 'onExitDir' global.
#

sub setWorkDirectory ($$) {
    my ($asm, $rootdir) = @_;

    #  Set the initial directory based on various rules.
    #
    #  For the canu executive, in grid mode, both setWorkDirectoryShellCode and
    #  this (in that order) are called.  TEST is assuming that all (non-executive)
    #  compute jobs are run as arrays.

    my $store = getGlobal("objectStore");

    if    (($store eq "TEST") && (defined($ENV{"JOB_ID"}))) {
        my $jobdir = "/assembly/objectstore/job-" . $ENV{'JOB_ID'};

        remove_tree($jobdir);       #  Start from an empty directory.
        make_path  ($jobdir);
        chdir      ($jobdir);
    }

    elsif ($store eq "DNANEXUS") {
        #  Stay where we are.
    }

    elsif (getGlobal("gridEngine") eq "PBSPRO") {
        if (exists($ENV{"PBS_O_WORKDIR"})) {
            chdir($ENV{"PBS_O_WORKDIR"});
        }

        delete $ENV{"PBS_O_WORKDIR"};
    }

    #  Now move into the assembly directory.

    if (defined($rootdir)) {
        if (! -d $rootdir) {
            make_path($rootdir);
        }

        chdir($rootdir);
    }

    #  And save some pieces we need when we quit.

    setGlobal("onExitDir", getcwd());
    setGlobal("onExitNam", $asm);
}
523
524
525
#  Emits a block of shell code to move a script into the directory it should run in.  This is
#  the shell equivalent of the first part of setWorkDirectory().
#
#    TEST object store - for array jobs, create an empty per-task directory and move into
#                        'path' inside it.
#    DNANEXUS          - nothing is emitted.
#    PBSPRO            - move into PBS_O_WORKDIR, if it is set.
#    Otherwise         - nothing is emitted.
#
sub setWorkDirectoryShellCode ($) {
    my ($path) = @_;
    my @lines;

    my $store = getGlobal("objectStore");

    if    ($store eq "TEST") {
        @lines = ('if [ z$SGE_TASK_ID != z ] ; then',
                  '  jid=$JOB_ID',
                  '  tid=$SGE_TASK_ID',
                  '  if [ x$tid != xundefined ] ; then',
                  '    rm   -rf /assembly/objectstore/job-$jid-$tid/',
                  '    mkdir -p /assembly/objectstore/job-$jid-$tid/' . $path,
                  '    cd       /assembly/objectstore/job-$jid-$tid/' . $path,
                  '  fi',
                  'fi');
    }

    elsif ($store eq "DNANEXUS") {
        #  You're probably fine running in some random location, but if there is faster disk
        #  available, move there.
    }

    elsif (getGlobal("gridEngine") eq "PBSPRO") {
        @lines = ('if [ z$PBS_O_WORKDIR != z ] ; then',
                  '  cd $PBS_O_WORKDIR',
                  'fi');
    }

    return(join("", map { "$_\n" } @lines));
}
555
556
557
#  Spend too much effort ensuring that the name is unique in the system.  For 'canu' jobs, we don't
#  care.

#  Return a string of 'length' random letters and digits.  Characters that are easily confused
#  with others are never used.  A length of zero (or less) returns the empty string.
#
sub makeRandomSuffix ($) {
    my $length = shift @_;
    my @chars  = ('0'..'9', 'a'..'k', 'm'..'z', 'A'..'H', 'J'..'N', 'P'..'Z');    #  Remove 'l', 'I' and 'O'
    my $suffix = "";

    #  Pick from however many characters are in the list, rather than from a count that needs
    #  to be kept in sync with it.

    while ($length-- > 0) {
        $suffix .= $chars[int(rand(scalar(@chars)))];
    }

    return($suffix);
}
572
573
#  Return the name to give a job submitted to the grid: the job type, the assembly name and, if
#  set, the 'gridOptionsJobName' global, joined with underscores.
#
#  Despite the function name, the result is NOT made unique.  Adding a random suffix to names
#  already in use was breaking dependencies when multiple jobs were submitted, like for a failed
#  consensus run, so it is turned off for now.
#
sub makeUniqueJobName ($$) {
    my $jobType = shift @_;
    my $asm     = shift @_;

    my $userName = getGlobal("gridOptionsJobName");
    my $jobName  = "${jobType}_" . $asm . ((defined($userName)) ? ("_" . $userName) : (""));

    #  If a canu job, just return the standard name.  No uniquification needed.

    return($jobName)   if ($jobType eq "canu");

    #  For all other jobs, find the list of all jobs that exist.  Only SGE is queried; the names
    #  of existing jobs are not collected for any other grid.
    #
    #  The pipe is opened in list form, which runs qstat directly with no shell involved.  If
    #  qstat can't be run, we carry on as if there are no jobs.

    my %jobs;

    if (uc(getGlobal("gridEngine")) eq "SGE") {
        if (open(my $qstat, "-|", "qstat", "-xml")) {
            while (my $line = <$qstat>) {
                $jobs{$1}++   if ($line =~ m/^\s*<JB_name>(.*)<\/JB_name>$/);
            }
            close($qstat);
        }
    }

    #  If the jobName doesn't exist, we can use it.

    return($jobName)   if (! exists($jobs{$jobName}));

    #  Otherwise, this is where a unique random 2-letter suffix - makeRandomSuffix(2), retried
    #  until "${jobName}_$suffix" is not an existing job - used to be added.  With that turned
    #  off, the name is returned as is.

    return($jobName);
}
629
630
631
632
#  Submit ourself back to the grid.
#
#    asm     - the assembly name, used to name the job.
#    jobHold - if defined, placed first in the options given to the submit command.
#              NOTE(review): presumably the grid option that makes us hold on other jobs - confirm
#              against callers.
#
#  Returns (nothing) if we are not using the grid, or if we are already running on the grid and
#  there is nothing to hold on.  Otherwise, does not return: a script that restarts canu is
#  written to canu-scripts/, submitted, and we exit - with 0 if the submission succeeded (on the
#  first or second try), with 1 if not.
#
#  The previous version (CA) would use "gridPropagateHold" to reset holds on existing jobs so that
#  they would also hold on this job.
#
sub submitScript ($$) {
    my $asm         = shift @_;
    my $jobHold     = shift @_;

    my $engine      = getGlobal("gridEngine");

    return   if (getGlobal("useGrid") ne "1");                #  If not requested to run on the grid,
    return   if (!defined($engine) || ($engine eq ""));       #  or can't run on the grid, don't run on the grid.

    #  If no job hold, and we are already on the grid, do NOT resubmit ourself.
    #
    #  When the user launches canu on the head node, a call to submitScript() is made to launch canu
    #  under grid control.  That results in a restart of canu, and another call to submitScript(),
    #  but this time, the environment variable is set, so we can skip the resubmission, and continue
    #  with canu execution.
    #
    #  An empty jobHold counts as no job hold.

    my $holding = (defined($jobHold) && ($jobHold ne ""));

    return   if (!$holding && (exists($ENV{getGlobal("gridEngineJobID")})));

    #  Figure out the name of the script we want to be making, and a place for it to write output.
    #  Incrementing the string "01" gives "02", and so on.

    my $idx = "01";

    while ((-e "canu-scripts/canu.$idx.sh") ||
           (-e "canu-scripts/canu.$idx.out")) {
        $idx++;
    }

    my $script    = "canu-scripts/canu.$idx.sh";
    my $scriptOut = "canu-scripts/canu.$idx.out";

    #  Make a script for us to submit.

    open(my $fh, ">", $script) or caFailure("failed to open '$script' for writing", undef);

    print $fh "#!" . getGlobal("shell") . "\n";
    print $fh "\n";

    if ($engine eq "SGE") {
        print $fh "#  Attempt to (re)configure SGE.  For unknown reasons, jobs submitted\n";
        print $fh "#  to SGE, and running under SGE, fail to read the shell init scripts,\n";
        print $fh "#  and so they don't set up SGE (or ANY other paths, etc) properly.\n";
        print $fh "#  For the record, interactive logins (qlogin) DO set the environment.\n";
        print $fh "\n";
        print $fh "if [ \"x\$SGE_ROOT\" != \"x\" -a \\\n";
        print $fh "     -e  \$SGE_ROOT/\$SGE_CELL/common/settings.sh ]; then\n";
        print $fh "  . \$SGE_ROOT/\$SGE_CELL/common/settings.sh\n";
        print $fh "fi\n";
    }

    print $fh "\n";
    print $fh getBinDirectoryShellCode();
    print $fh "\n";
    print $fh setWorkDirectoryShellCode(".");
    print $fh "\n";
    print $fh "rm -f canu.out\n";
    print $fh "ln -s $scriptOut canu.out\n";
    print $fh "\n";
    print $fh "/usr/bin/env perl \\\n";
    print $fh "\$bin/" . basename($0) . " " . getCommandLineOptions() . " canuIteration=" . getGlobal("canuIteration") . "\n";

    close($fh);

    makeExecutable("$script");

    #  Construct a submission command line.

    my $jobName   = makeUniqueJobName("canu", $asm);

    #  The canu.pl script isn't expected to take resources.  We'll default to 4gb and one thread.

    my $mem = getGlobal("executiveMemory");
    my $thr = getGlobal("executiveThreads");

    my $resOption = buildResourceOption($mem, $thr);

    my $gridOpts;

    $gridOpts  = $jobHold;
    $gridOpts .= " "                                     if (defined($gridOpts));

    #  LSF ignores all but the first option, so options need to be reversed.
    #  DNAnexus doesn't use threads, and memory is the instance type.

    if    (uc($engine) eq "LSF") {
        $gridOpts .= getGlobal("gridOptionsExecutive")   if (defined(getGlobal("gridOptionsExecutive")));
        $gridOpts .= " "                                 if (defined($gridOpts));
        $gridOpts .= getGlobal("gridOptions")            if (defined(getGlobal("gridOptions")));
        $gridOpts .= " "                                 if (defined($gridOpts));
        $gridOpts .= $resOption                          if (defined($resOption));
    }

    elsif (uc($engine) eq "DNANEXUS") {
        $gridOpts .= getGlobal("gridOptions")            if (defined(getGlobal("gridOptions")));
        $gridOpts .= " "                                 if (defined($gridOpts));
        $gridOpts .= getGlobal("gridOptionsExecutive")   if (defined(getGlobal("gridOptionsExecutive")));
    }

    else {
        $gridOpts .= $resOption                          if (defined($resOption));
        $gridOpts .= " "                                 if (defined($gridOpts));
        $gridOpts .= getGlobal("gridOptions")            if (defined(getGlobal("gridOptions")));
        $gridOpts .= " "                                 if (defined($gridOpts));
        $gridOpts .= getGlobal("gridOptionsExecutive")   if (defined(getGlobal("gridOptionsExecutive")));
    }

    my $submitCommand        = getGlobal("gridEngineSubmitCommand");
    my $nameOption           = getGlobal("gridEngineNameOption");
    my $outputOption         = getGlobal("gridEngineOutputOption");

    my $qcmd = "$submitCommand $gridOpts";
    $qcmd   .= " $nameOption '$jobName'"   if defined($nameOption);
    $qcmd   .= " $outputOption $scriptOut" if defined($outputOption);

    #  DNAnexus doesn't submit scripts; all parameters are passed through
    #  '-i' options.  The 'fetch_and_run' magic is in dx-canu/src/canu-job-launcher.sh.
    #  It will download the requested shell script and execute said function
    #  in it.
    #
    #  NOTE(review): the global is spelled 'objectStoreNamespace' here, but
    #  'objectStoreNameSpace' in getBinDirectoryShellCode() - confirm that
    #  getGlobal() ignores case.

    if (uc($engine) eq "DNANEXUS") {
        $qcmd .= " \\\n";
        $qcmd .= " -ioutput_folder:string=\"" . getGlobal("objectStoreNamespace") . "\" \\\n";
        $qcmd .= " -iscript_path:string=\"\" \\\n";
        $qcmd .= " -iscript_name:string=\"canu-executive.sh\" \\\n";
        $qcmd .= " -icanu_iteration:int="     . getGlobal("canuIteration")        .   " \\\n";
        $qcmd .= " -icanu_iteration_max:int=" . getGlobal("canuIterationMax")     .   " \\\n";
        $qcmd .= " fetch_and_run \\\n";
    }

    else {
        $qcmd .= "  $script";
    }

    if (runCommand(getcwd(), $qcmd) == 0) {      #  Exit successfully if we've submitted
        exit(0);                                 #  the next part successfully.
    }

    print STDERR "-- Failed to submit Canu executive.  Delay 10 seconds and try again.\n";

    sleep(10);

    if (runCommand(getcwd(), $qcmd) == 0) {
        exit(0);
    }

    print STDERR "-- Failed to submit Canu executive.  Giving up after two tries.\n";

    exit(1);
}
779
780
781
#  Fill in the array-job placeholders of a grid option.
#
#    name - the job name; replaces ARRAY_NAME in the option.
#    bgn  - first job index to run.
#    end  - last job index to run.
#    opt  - the grid option; contains either ARRAY_NAME or ARRAY_JOBS.
#    thr  - threads per job; used to turn a Slurm core limit into a task limit.
#
#  Returns the option, and the offset to pass to the job script as its only argument.  The
#  script adds the offset to the array task ID it is given to find the job to run.
#
sub buildGridArray (@) {
    my ($name, $bgn, $end, $opt, $thr) = @_;

    my $off    = 0;
    my $engine = uc(getGlobal("gridEngine"));

    #  In some grids (SGE)   this is the maximum size of an array job.
    #  In some grids (Slurm) this is the maximum index of an array job.
    #
    #  So, here, we just don't let any index be above the value.  Both types will be happy.

    if ($end > getGlobal('gridEngineArrayMaxJobs')) {
        $off  = $bgn - 1;
        $bgn -= $off;
        $end -= $off;
    }

    #  Is this option setting the number of jobs, for exactly one job?

    my $single = (($bgn == $end) && ($opt =~ m/ARRAY_JOBS/));

    #  PBSPro requires array jobs to have bgn < end.  When $bgn == $end, we
    #  just remove the array qualifier.  But only if this option is setting
    #  the number of jobs, not if it is setting the name.
    #  New versions of PBS have this behavior too
    #
    #  With no array there is no task ID, so the offset must be the job index itself.  That is
    #  $bgn plus any shift applied above; using $bgn alone would run job 1 whenever the index
    #  was above the array limit.

    if ($single && (($engine eq "PBSPRO") || ($engine eq "PBS"))) {
        $opt = "";
        $off = $bgn + $off;
    }

    # DNA nexus doesn't have arrays and only supports 1 job, which we use to pass the identifier
    # Set the offset to blank since it is not supported as well

    if ($single && ($engine eq "DNANEXUS")) {
        my $jid = $bgn + $off;

        $opt =~ s/ARRAY_JOBS/$jid/g;
        $off = "";
    }

    #  Further, PBS/Torque won't let scripts be passed options unless they
    #  are prefixed with a -F....and PBSPro doesn't need this.

    if ($engine eq "PBS") {
        $off = "-F \"$off\"";
    }

    if ($opt =~ m/ARRAY_NAME/) {
        $opt =~ s/ARRAY_NAME/$name/;         #  Replace ARRAY_NAME with 'job name'
    }

    elsif ($opt =~ m/ARRAY_JOBS/) {
        $opt =~ s/ARRAY_JOBS/$bgn-$end/;     #  Replace ARRAY_JOBS with 'bgn-end'

        #  On Slurm, limit how many tasks of the array run at once.  The first limit that
        #  applies to this job (by name) and is set, wins.  Core limits are divided by the
        #  number of threads per job to get a task limit.

        if (($engine eq "SLURM") && ($end > 1)) {
            my @limits = ([ qr/^cormhap_/i, 'slurmCormhapCoreLimit', 1 ],     #  name pattern (undef for any),
                          [ qr/^ovb_/i,     'slurmOvbCoreLimit',     0 ],     #  global holding the limit,
                          [ qr/^ovs_/i,     'slurmOvsCoreLimit',     0 ],     #  divide by threads?
                          [ qr/^red_/i,     'slurmRedCoreLimit',     1 ],
                          [ undef,          'slurmArrayTaskLimit',   0 ],
                          [ undef,          'slurmArrayCoreLimit',   1 ]);

            foreach my $entry (@limits) {
                my ($pattern, $global, $perThread) = @$entry;

                next   if (defined($pattern) && ($name !~ $pattern));

                my $limit = getGlobal($global);

                next   if (!defined($limit));

                $opt .= '%' . (($perThread) ? int($limit / $thr) : $limit);
                last;
            }
        }
    }

    return($opt, $off);
}
862
863
#  Return the name of the file a job of a script should log to.
#
#    path   - directory the script runs in, relative to the assembly root.
#    script - name of the script, relative to 'path'.
#    tid    - the job index; zero-padded to (or cut down to the last) six digits.
#
#  When this function is called, canu.pl is running in the assembly directory.
#  But, when the script is executed, it is rooted in 'path'.  So we check for
#  'logs' relative to the assembly root, but return a name relative to 'path':
#  under logs/ if that exists and the script is under scripts/, next to the
#  script otherwise.
#
sub buildOutputName ($$$) {
    my ($path, $script, $index) = @_;

    my $tid = substr("000000" . $index, -6);

    if ((-e "$path/logs") && ($script =~ m/scripts\/(.*)/)) {
        return("logs/$1.$tid.out");
    }

    return("$script.$tid.out");
}
880
881
#  Return the grid option that sends the output of an array job to a log file, or undef if the
#  grid has no output option ('gridEngineOutputOption') or no placeholder for the task ID at
#  submission time ('gridEngineArraySubmitID').
#
#  The log file is named as in buildOutputName(), but with that placeholder for the task ID.
#
sub buildOutputOption ($$) {
    my ($path, $script) = @_;

    my $tid = getGlobal("gridEngineArraySubmitID");
    my $opt = getGlobal("gridEngineOutputOption");

    return(undef)   if (!defined($tid) || !defined($opt));

    my $log = "$script.$tid.out";

    if ((-e "$path/logs") && ($script =~ m/scripts\/(.*)/)) {
        $log = "logs/$1.$tid.out";
    }

    return("$opt $log");
}
899
900
#  Return the grid option ('gridEngineStageOption') that requests local disk space for a job,
#  with DISK_SPACE replaced by the amount requested.
#
#    t - the type of job.  Only 'cor', 'cormhap', 'obtmhap' and 'utgmhap' jobs get the
#        option; undef is returned for anything else.
#    d - the amount of disk space to request.
#
sub buildStageOption ($$) {
    my ($t, $d) = @_;

    my %staged = map { $_ => 1 } qw(cor cormhap obtmhap utgmhap);

    return(undef)   if (!$staged{$t});

    my $option = getGlobal("gridEngineStageOption");

    $option =~ s/DISK_SPACE/${d}/g;

    return($option);
}
913
914
#  Return the grid option ('gridEngineResourceOption') that requests memory and threads for a
#  job, with MEMORY and THREADS replaced by actual values.
#
#    m - memory needed by the job, in gigabytes.
#    t - threads needed by the job.
#
#  The memory in the option is normally in gigabytes, with a 'g' suffix, or, if that isn't a
#  whole number, in megabytes, with an 'm' suffix.  LSF gets a bare number in the units it is
#  configured to use, and DNAnexus gets whatever getDNANexusInstance() returns.
#
sub buildResourceOption ($$) {
    my ($memory, $threads) = @_;

    my $units  = "g";
    my $engine = uc(getGlobal("gridEngine"));

    #  Increase memory slightly if this is a retry.  The first iteration gets what was asked
    #  for; each one after gets 25% more than the one before.

    my $iteration = getGlobal("canuIteration");

    if ($iteration > 0) {
        $memory *= 1.25 ** ($iteration - 1);
    }

    #  Massage the memory requested into a format the grid is happy with.
    #
    #  If memory is requested per slot (gridEngineMemoryPerJob is anything but "1"), divide the
    #  memory request by the number of slots we request.  Default behavior for SGE and Slurm
    #  when mem-per-cpu is used.

    if (getGlobal("gridEngineMemoryPerJob") != "1") {
        $memory /= $threads;
    }

    #  LSF wants to enforce the units used, so convert to those, rounding to the nearest
    #  whole unit, and drop the suffix.  Each conversion is applied, in this order, if its
    #  letter appears anywhere in gridEngineMemoryUnits.

    if ($engine eq "LSF") {
        my $lsfUnits = getGlobal("gridEngineMemoryUnits");

        foreach my $conversion ([ qr/t/i, 1 / 1024    ],
                                [ qr/g/i, 1           ],
                                [ qr/m/i, 1024        ],
                                [ qr/k/i, 1024 * 1024 ]) {
            my ($letter, $scale) = @$conversion;

            $memory = int($memory * $scale + 0.5)   if ($lsfUnits =~ $letter);
        }

        $units = "";
    }

    if ($engine eq "DNANEXUS") {
        $memory = canu::Grid_DNANexus::getDNANexusInstance($memory, $threads);
        $units  = "";
    }

    #  If we're not an integral number of gigabytes, switch over to megabytes.  But only if
    #  we're still gigabytes!  In particular, both LSF and DNANEXUS set units to "".

    if (($units eq "g") && (int($memory) != $memory)) {
        $memory = int($memory * 1024);
        $units  = "m";
    }

    #  Replace MEMORY and THREADS with actual values.

    my $option = getGlobal("gridEngineResourceOption");

    $option =~ s/MEMORY/${memory}${units}/g;
    $option =~ s/THREADS/$threads/g;

    return($option);
}
961
962
#  Remove the grid submit scripts '$path/$script.jobSubmit-NN.sh' left behind
#  by a previous attempt.
#
#  Scripts are removed starting at index "01" and stopping at the first index
#  that does not exist; scripts after a gap in the numbering are not touched.
sub purgeGridJobSubmitScripts ($$) {
    my ($path, $script) = @_;

    #  The index is a string, so '++' keeps the leading zero: "01", "02", ..., "09", "10".

    for (my $idx = "01"; -e "$path/$script.jobSubmit-$idx.sh"; $idx++) {
        unlink "$path/$script.jobSubmit-$idx.sh";
    }
}
973
974
#  Write a small shell script, '$path/$script.jobSubmit-NN.sh', that submits
#  one job (or one array of jobs) running '$script.sh' to the grid, and make
#  that script executable.  NN is the first index not already in use.
#
#  Parameters:
#    asm     - assembly name, used to build the grid job name
#    jobType - job type; selects 'gridOptions$jobType' and the stage option
#    path    - directory holding the script; the submit script is written here
#    script  - script name, without directory or '.sh'
#    mem     - memory request, passed to buildResourceOption()
#    thr     - thread request, passed to buildResourceOption() and buildGridArray()
#    dsk     - staging disk space, passed to buildStageOption()
#    bgnJob  - first job to run, or a range 'N-M' (which overrides endJob)
#    endJob  - last job to run; defaults to bgnJob when undefined
#
#  Returns the submit script name without the '.sh' (relative to path; the
#  submit command writes its output to the same name with '.out'), and the
#  job name the job will be submitted with.
#
#  Dies if the submit script cannot be created or written.
sub buildGridJob ($$$$$$$$$) {
    my ($asm, $jobType, $path, $script, $mem, $thr, $dsk, $bgnJob, $endJob) = @_;

    #  Unpack the job range if needed.

    if ($bgnJob =~ m/^(\d+)-(\d+)$/) {
        $bgnJob = $1;
        $endJob = $2;
    }

    if (!defined($endJob)) {
        $endJob = $bgnJob;
    }

    #  Figure out the command and options needed to run the job.

    my $submitCommand          = getGlobal("gridEngineSubmitCommand");
    my $nameOption             = getGlobal("gridEngineNameOption");

    my $jobNameT               = makeUniqueJobName($jobType, $asm);

    my ($jobName,  $jobOff)    = buildGridArray($jobNameT, $bgnJob, $endJob, getGlobal("gridEngineArrayName"));
    my ($arrayOpt, $arrayOff)  = buildGridArray($jobNameT, $bgnJob, $endJob, getGlobal("gridEngineArrayOption"), $thr);

    my $outputOption           = buildOutputOption($path, $script);

    my $stageOption            = buildStageOption($jobType, $dsk);
    my $resOption              = buildResourceOption($mem, $thr);
    my $globalOptions          = getGlobal("gridOptions");
    my $jobOptions             = getGlobal("gridOptions$jobType");

    #  Collect whichever options are defined.  $opts stays undefined if none are.

    my $opts;

    $opts  = "$stageOption "    if (defined($stageOption));
    $opts .= "$resOption "      if (defined($resOption));
    $opts .= "$globalOptions "  if (defined($globalOptions));
    $opts .= "$jobOptions "     if (defined($jobOptions));
    $opts .= "$outputOption "   if (defined($outputOption));
    $opts =~ s/\s+$//;

    #  Find a unique file name to save the command.  The index is a string,
    #  so '++' keeps the leading zero ("01", "02", ...).

    my $idx = "01";

    while (-e "$path/$script.jobSubmit-$idx.sh") {
        $idx++;
    }

    my $submitScript = "$path/$script.jobSubmit-$idx.sh";

    #  Build and save the command line.  Return the command PREFIX (we'll be adding .sh and .out as
    #  appropriate), and the job name it will be submitted with (which isn't expected to be used).

    open(my $fh, ">", $submitScript) or die "Failed to open '$submitScript' for writing: $!";

    print $fh "#!/bin/sh\n";
    print $fh "\n";
    print $fh "$submitCommand \\\n";
    print $fh "  $opts \\\n"  if (defined($opts));
    print $fh "  $nameOption \"$jobName\" \\\n";
    print $fh "  $arrayOpt \\\n";

    if (uc(getGlobal("gridEngine")) eq "PBSPRO") {    #  PBSpro needs '--' to tell it to
        print $fh " -- ";                             #  stop parsing the command line.
    }

    #  DNAnexus wants job parameters via options to the submit command;
    #  everyone else wants the script itself.

    if (uc(getGlobal("gridEngine")) eq "DNANEXUS") {
        print $fh "  -ioutput_folder:string=\"" . getGlobal("objectStoreNamespace") . "\" \\\n";
        print $fh "  -iscript_path:string=\"$path\" \\\n";
        print $fh "  -iscript_name:string=\"$script.sh\" \\\n";
        print $fh "  fetch_and_run \\\n";
    } else {
        print $fh "  `pwd`/$script.sh $arrayOff \\\n";
    }

    print $fh "> ./$script.jobSubmit-$idx.out 2>&1\n";

    #  Buffered write errors (disk full, quota) only show up at close.

    close($fh) or die "Failed to write '$submitScript': $!";

    makeExecutable($submitScript);

    return("$script.jobSubmit-$idx", $jobName);
}
1066
1067
1068
1069
1070#  Convert @jobs to a list of ranges, 1-4, 5, 10-20, etc.  These will be directly submitted to the
1071#  grid, or run one-by-one locally.
1072#
1073#  If we're SGE, we can combine everything to one job range: 1-4, 5, 10-20.  Except that
1074#  buildGridJob() doesn't know how to handle that.
1075
sub convertToJobRange (@) {

    #  Input:  a list of job ids ("7", "0007") and/or ranges ("1-4").
    #  Output: a sorted list of ids and ranges covering each requested job
    #          exactly once, with no block larger than gridEngineArrayMaxJobs
    #          (if that is set).  Returns an empty list if no jobs were
    #          requested.  Calls caFailure() on a malformed id or range.

    #  Expand the ranges into a simple list of job ids.  Leading zeros are dropped.

    my @ids;

    foreach my $j (@_) {
        if      ($j =~ m/^0*(\d+)-0*(\d+)$/) {
            push @ids, ($1 .. $2);

        } elsif ($j =~ m/^0*(\d+)$/) {
            push @ids, $1;

        } else {
            caFailure("invalid job format in '$j'", undef);
        }
    }

    #  Drop duplicates (overlapping ranges, or an id listed twice) so no job
    #  is run more than once, then sort numerically.

    my %seen;
    my @sorted = sort { $a <=> $b } grep { !$seen{$_}++ } @ids;

    #  Nothing requested: return nothing, instead of inventing an empty job id.

    return()   if (scalar(@sorted) == 0);

    #  Merge adjacent ids into a range.

    my @ranges;

    my $st = shift @sorted;
    my $ed = $st;

    foreach my $j (@sorted) {
        if ($ed + 1 == $j) {
            $ed = $j;
        } else {
            push @ranges, ($st == $ed) ? "$st" : "$st-$ed";
            $st = $j;
            $ed = $j;
        }
    }

    push @ranges, ($st == $ed) ? "$st" : "$st-$ed";

    #  In some grids (SGE)   this is the maximum size of an array job.
    #  In some grids (Slurm) this is the maximum index of an array job.
    #
    #  So, here, we make blocks that have at most that many jobs.  When we submit the job, we'll
    #  offset the indices to be 1..Max.
    #
    #  $extra is the number of jobs in a block beyond the first one; it is
    #  negative (and no splitting is done) when the limit is unset or zero.

    my $extra = getGlobal("gridEngineArrayMaxJobs") - 1;

    if ($extra >= 0) {
        my @blocks;

        foreach my $r (@ranges) {
            if ($r =~ m/^0*(\d+)-0*(\d+)$/) {
                my $bgn = $1;
                my $end = $2;

                while ($bgn <= $end) {
                    my $last = ($bgn + $extra < $end) ? ($bgn + $extra) : $end;
                    push @blocks, "$bgn-$last";
                    $bgn += $extra + 1;
                }
            } else {
                push @blocks, $r;
            }
        }

        @ranges = @blocks;
    }

    return(@ranges);
}
1153
1154
1155
#  Count the jobs described by a list of job ids and 'N-M' ranges.
#
#  A range 'N-M' counts as M - N + 1 jobs; anything else counts as one job.
sub countJobsInRange (@) {
    my $total = 0;

    for my $entry (@_) {
        $total += ($entry =~ m/^(\d+)-(\d+)$/) ? ($2 - $1 + 1) : 1;
    }

    return($total);
}
1170
1171
1172
#  Expects
#    assembly name
#    job type ("ovl", etc)
#    output directory
#    script name with no directory or .sh
#    list of the jobs in the task, as job ids ("7") and/or ranges ("1-4")
#
#  If under grid control, submit grid jobs.  Otherwise, run in parallel locally.
#
sub submitOrRunParallelJob ($$$$@) {
    my $asm          = shift @_;  #  Name of the assembly

    my $jobType      = shift @_;  #  E.g., ovl, cns, ... - populates 'gridOptionsXXX'
                                  #                      - also becomes the grid job name prefix, so three letters suggested

    my $path         = shift @_;  #  Location of script to run
    my $script       = shift @_;  #  Runs $path/$script.sh > $path/$script.######.out

    my $mem          = getGlobal("${jobType}Memory");
    my $thr          = getGlobal("${jobType}Threads");
    my $dsk          = getGlobal("${jobType}StageSpace");

    my @jobs         = convertToJobRange(@_);   #  Everything left is a job id or a range of job ids.
    my $nJobs        = countJobsInRange(@jobs);

    my $runDirectly  = 0;

    #  This function does one of three things:
    #    - submits the jobs to the grid, then resubmits canu itself to wait for them (never returns);
    #    - tells the user which commands to run to submit the jobs, and exits;
    #    - runs the jobs locally, in parallel, and returns when they finish.

    #  The script MUST be executable.

    makeExecutable("$path/$script.sh");

    #  If the job can fit in the task running the executive, run it right here.

    if (($nJobs * $mem + 0.5 <= getGlobal("executiveMemory")) &&
        ($nJobs * $thr       <= getGlobal("executiveThreads"))) {
        $runDirectly = 1;
    }

    #  Report what we're doing.

    #my $t = localtime();
    #print STDERR "----------------------------------------GRIDSTART $t\n";
    #print STDERR "$path/$script.sh with $mem gigabytes memory and $thr threads.\n";

    #  Break infinite loops.  If the grid jobs keep failing, give up after a few attempts.
    #
    #  submitScript() passes canuIteration on to the next call.
    #  canuIteration is reset to zero if the Check() for any parallel step succeeds.
    #
    #  Assuming grid jobs die on each attempt:
    #    0) canu run from the command line submits iteration 1; canuIteration is NOT incremented
    #       because no parallel jobs have been submitted.
    #    1) Iteration 1 - canu.pl submits jobs, increments the iteration count, and submits itself as iteration 2
    #    2) Iteration 2 - canu.pl submits jobs, increments the iteration count, and submits itself as iteration 3
    #    3) Iteration 3 - canu.pl fails with the error below
    #
    #  If the jobs succeed in Iteration 2, the canu in iteration 3 will pass the Check(), never call
    #  this function, and continue the pipeline.

    my $iter = getGlobal("canuIteration");
    my $max  = getGlobal("canuIterationMax");

    #  canuIteration counts from zero, so attempt number N is iteration N-1.

    if ($iter >= $max) {
        caExit("canu iteration count too high, stopping pipeline (most likely a problem in the grid-based computes)", undef);
    } elsif ($iter == 0) {
        $iter = "First";
    } elsif ($iter == 1) {
        $iter = "Second";
    } elsif ($iter == 2) {
        $iter = "Third";
    } elsif ($iter == 3) {
        $iter = "Fourth";
    } elsif ($iter == 4) {
        $iter = "Fifth";
    } else {
        $iter = ($iter + 1) . "th";
    }

    print STDERR "--\n";
    print STDERR "-- Running jobs.  $iter attempt out of $max.\n";

    setGlobal("canuIteration", getGlobal("canuIteration") + 1);

    #  If 'gridEngineJobID' environment variable exists (SGE: JOB_ID; LSF: LSB_JOBID) then we are
    #  currently running under grid control.  If so, run the grid command to submit more jobs, then
    #  submit ourself back to the grid.  If not, tell the user to run the grid command by hand.

    #  Jobs under grid control, and we submit them

    if (defined(getGlobal("gridEngine")) &&
        (getGlobal("useGrid") eq "1") &&
        (getGlobal("useGrid$jobType") eq "1") &&
        (exists($ENV{getGlobal("gridEngineJobID")})) &&
        ($runDirectly == 0)) {
        my $engine = uc(getGlobal("gridEngine"));
        my @jobsSubmitted;

        print STDERR "--\n";

        purgeGridJobSubmitScripts($path, $script);

        if (getGlobal("showNext")) {
            print STDERR "--\n";
            print STDERR "-- NEXT COMMANDS\n";
            print STDERR "--\n";
            print STDERR "\n";
            print STDERR prettifyCommand("cd $path"), "\n";
            foreach my $j (@jobs) {
                my ($cmd, $jobName) = buildGridJob($asm, $jobType, $path, $script, $mem, $thr, $dsk, $j, undef);

                print STDERR prettifyCommand("./$cmd.sh") . "\n";
            }
            exit(0);
        }

        foreach my $j (@jobs) {
            my ($cmd, $jobName) = buildGridJob($asm, $jobType, $path, $script, $mem, $thr, $dsk, $j, undef);

            if (runCommandSilently($path, "./$cmd.sh", 0)) {
                print STDERR "-- Failed to submit compute jobs.  Delay 10 seconds and try again.\n";
                sleep(10);

                runCommandSilently($path, "./$cmd.sh", 0) and caFailure("Failed to submit compute jobs", "$path/$cmd.out");
            }

            #  Parse the stdout/stderr from the submit command to find the id of the job
            #  we just submitted.  We'll use this to hold the next iteration until all these
            #  jobs have completed.
            #
            #  If nothing useful is found, $jobName keeps the name buildGridJob() returned.

            my $submitOut = "$path/$cmd.out";

            if (open(my $out, "<", $submitOut)) {
                while (my $line = <$out>) {
                    chomp $line;

                    if ($engine eq "SGE") {
                        #  Your job 148364 ("canu_asm") has been submitted
                        if ($line =~ m/Your\sjob\s(\d+)\s/) {
                            $jobName = $1;
                        }
                        #  Your job-array 148678.1500-1534:1 ("canu_asm") has been submitted
                        if ($line =~ m/Your\sjob-array\s(\d+)\.\d+-\d+:\d\s/) {
                            $jobName = $1;
                        }
                    }

                    if ($engine eq "LSF") {
                        #  Job <759810> is submitted to queue <14>.
                        if ($line =~ m/Job\s<(\d+)>\sis/) {
                            $jobName = "ended($1)";
                        }
                    }

                    if ($engine eq "PBS") {
                        #  123456.qm2
                        $jobName = $line;
                    }

                    if ($engine eq "PBSPRO") {
                        #  ??
                        $jobName = $line;
                    }

                    if ($engine eq "SLURM") {
                        #  BPW has seen Slurm report "ERROR" instead of something
                        #  useful here.  If that is seen, report the error to the
                        #  screen and ignore this job.  We'll redo it on the next
                        #  iteration (unless this is the second iteration, then
                        #  we're screwed either way).
                        if ($line =~ m/Submitted\sbatch\sjob\s(\d+)/) {
                            $jobName = $1;
                        } elsif ($line =~ m/ERROR/) {
                            $jobName = undef;
                        } else {
                            $jobName = $line;
                        }
                    }

                    if ($engine eq "DNANEXUS") {
                        $jobName = $line;
                    }
                }
                close($out);
            } else {
                print STDERR "-- WARNING: failed to open '$submitOut' to find the id of the submitted job: $!\n";
            }

            if      (!defined($jobName)) {
                print STDERR "-- '$cmd.sh' -> returned an error; job not submitted.\n";
            } elsif ($j =~ m/^\d+$/) {
                print STDERR "-- '$cmd.sh' -> job $jobName task $j.\n";
            } else {
                print STDERR "-- '$cmd.sh' -> job $jobName tasks $j.\n";
            }

            if (defined($jobName)) {
                push @jobsSubmitted, $jobName;
            }
        }

        print STDERR "--\n";

        #  All jobs submitted.  Make an option to hold the executive on those jobs.

        my $jobHold;

        if ($engine eq "SGE") {
            $jobHold = "-hold_jid " . join ",", @jobsSubmitted;
        }

        if ($engine eq "LSF") {
            $jobHold = "-w \"" . (join "&&", @jobsSubmitted) . "\"";
        }

        if ($engine eq "PBS") {
            # new PBS versions dont have 1-task arrays like PBSPro but still have afteranyarray (which doesn't work on a not-array task)
            # so we need to check if we are waiting for a regular job or array
            my $holdType = (join ":", @jobsSubmitted)  =~ m/^(\d+)\[(.*)\]/ ? "afteranyarray" : "afterany";
            $jobHold = "-W depend=$holdType:" . join ":", @jobsSubmitted;
        }

        if ($engine eq "PBSPRO") {
            $jobHold = "-W depend=afterany:" . join ":", @jobsSubmitted;
        }

        if ($engine eq "SLURM") {
            $jobHold = "--depend=afterany:" . join ":", @jobsSubmitted;
        }

        if ($engine eq "DNANEXUS") {
            $jobHold = "--depends-on " . join " ", @jobsSubmitted;
        }

        submitScript($asm, $jobHold);

        #  submitScript() should never return.  If it does, then a parallel step was attempted too many times.

        caExit("Too many attempts to run a parallel stage on the grid.  Stop.", undef);
    }

    #  Jobs under grid control, but the user must submit them

    if (defined(getGlobal("gridEngine")) &&
        (getGlobal("useGrid") ne "0") &&
        (getGlobal("useGrid$jobType") eq "1") &&
        (! exists($ENV{getGlobal("gridEngineJobID")})) &&
        ($runDirectly == 0)) {
        my $cwd = getcwd();
        my $s   = (scalar(@jobs) == 1) ? "" : "s";

        print STDERR "\n";
        print STDERR "Please run the following command$s to submit tasks to the grid for execution.\n";
        print STDERR "Each task will use $mem gigabytes memory and $thr threads.\n";
        print STDERR "\n";
        print STDERR "  cd $cwd/$path\n";

        purgeGridJobSubmitScripts($path, $script);

        #  The commands go to the same stream as the rest of the instructions.

        foreach my $j (@jobs) {
            my ($cmd, $jobName) = buildGridJob($asm, $jobType, $path, $script, $mem, $thr, $dsk, $j, undef);

            print STDERR "  ./$cmd.sh\n";
        }

        print STDERR "\n";
        print STDERR "When all tasks are finished, restart canu as before.  The output of the grid\n";
        print STDERR "submit command$s will be in *jobSubmit*out.\n";
        print STDERR "\n";

        exit(0);
    }

    #  Standard jobs, run locally.
    #
    #  If only showing the next commands, show the commands for ALL the jobs, then stop.

    if (getGlobal("showNext") && (scalar(@jobs) > 0)) {
        print STDERR "--\n";
        print STDERR "-- NEXT COMMANDS\n";
        print STDERR "--\n";
        print STDERR "\n";
        print STDERR prettifyCommand("cd $path") . "\n";

        foreach my $j (@jobs) {
            my ($st, $ed) = ($j =~ m/^(\d+)-(\d+)$/) ? ($1, $2) : ($j, $j);

            for (my $i=$st; $i<=$ed; $i++) {
                print STDERR prettifyCommand("./$script.sh $i") . "\n";
            }
        }

        exit(0);
    }

    #  Queue up every job, one process per job id.

    foreach my $j (@jobs) {
        my ($st, $ed) = ($j =~ m/^(\d+)-(\d+)$/) ? ($1, $2) : ($j, $j);

        for (my $i=$st; $i<=$ed; $i++) {
            schedulerSubmit("./$script.sh $i > ./" . buildOutputName($path, $script, $i) . " 2>&1");
        }
    }

    #  An explicit '${jobType}Concurrency' sets the number of jobs to run at
    #  once.  Otherwise, run as many as fit in both the threads and the memory
    #  available, but always at least one.

    # compute limit based on # of cpus
    my $nCParallel  = getGlobal("${jobType}Concurrency");
    $nCParallel     = int(getGlobal("maxThreads") / $thr)  if ((!defined($nCParallel)) || ($nCParallel == 0));
    $nCParallel     = 1                                    if ((!defined($nCParallel)) || ($nCParallel == 0));

    # compute limit based on physical memory
    my $nMParallel = getGlobal("${jobType}Concurrency");
    $nMParallel    = int(getGlobal("maxMemory") / getGlobal("${jobType}Memory")) if ((!defined($nMParallel)) || ($nMParallel == 0));
    $nMParallel    = 1                                                           if ((!defined($nMParallel)) || ($nMParallel == 0));

    # run min of our limits
    my $nParallel  = $nCParallel < $nMParallel ? $nCParallel : $nMParallel;

    schedulerSetNumberOfProcesses($nParallel);
    schedulerFinish($path, $jobType);
}
1484
1485
1486
1487
1488#  Pretty-ify the command.  If there are no newlines already in it, break
1489#  before every switch and before file redirects.
1490
sub prettifyCommand ($) {
    my ($cmd) = @_;

    #  A command that already spans several lines is assumed to be formatted
    #  by hand and is only indented.  Otherwise, add a backslash-newline:
    #    - before every switch (whitespace followed by a dash),
    #    - before the first ' > ' redirect,
    #    - before the first ' 2> ' redirect.

    unless ($cmd =~ m/\n/) {
        $cmd =~ s/\s-/ \\\n  -/g;
        $cmd =~ s/\s>\s/ \\\n> /;
        $cmd =~ s/\s2>\s/ \\\n2> /;
    }

    #  Indent every line of the command, including the first, by four spaces.

    $cmd =~ s/\n/\n    /g;

    return("    " . $cmd);
}
1505
1506
#  Report, on STDERR, why a command run with system() failed.
#
#  Parameters:
#    rc - the status returned by system(), either as returned (-1 when the
#         command could not be started) or masked with 0xffff as the callers
#         in this file do (which turns that -1 into 0xffff).
#
#  Reports one of: the command could not be run, the command was killed by a
#  signal (low seven bits of the status), or the command exited with a
#  non-zero exit code (high byte of the status).
sub reportRunError ($) {
    my $rc  = shift @_;

    #  Names of the signals, indexed by signal number, if perl knows them.

    my @signame;

    if (defined($Config{sig_name})) {
        @signame = split(' ', $Config{sig_name});
    } else {
        @signame = map { "signal $_" } (0 .. 127);
    }

    my $sig  = $rc & 127;
    my $name = defined($signame[$sig]) ? $signame[$sig] : "number $sig";

    print STDERR "\n";
    print STDERR "ERROR:\n";

    #  0xffff is what -1 becomes after the caller masks the status; it cannot
    #  be a real wait status, so it must not be decoded as 'signal 127'.

    if      (($rc == -1) || ($rc == 0xffff)) {
        print STDERR "ERROR:  Failed to run the command.  (rc=$rc)\n";
    } elsif ($sig) {
        print STDERR "ERROR:  Failed with signal $name.  (rc=$rc)\n";
    } else {
        print STDERR "ERROR:  Failed with exit code ", $rc >> 8 , ".  (rc=$rc)\n";
    }

    print STDERR "ERROR:\n";
}
1540
1541
1542#  Utility to run a command and check the exit status, report time used.
1543#
sub runCommand ($$) {
    my ($dir, $cmd) = @_;

    #  Runs $cmd, through system(), from directory $dir, logging the command,
    #  the free disk space and the time used.  Returns 0 on success (or if
    #  there is no command to run) and 1 if the command failed.

    return(0)  if ($cmd eq "");

    #  Check if the directory exists.

    unless (-d $dir) {
        caFailure("Directory '$dir' doesn't exist, can't run command", "");
    }

    my $pretty = prettifyCommand($cmd);

    #  If only showing the next command, show it and stop.

    if (getGlobal("showNext")) {
        print STDERR "--\n";
        print STDERR "-- NEXT COMMAND\n";
        print STDERR "--\n";
        print STDERR "\n";
        print STDERR prettifyCommand("cd $dir") . "\n";
        print STDERR "$pretty\n";
        exit(0);
    }

    #  Run from $dir, so the job is rooted in the correct location, but
    #  remember where we were so we can get back.

    my $origDir = getcwd();
    chdir($dir);

    my $startTime = time();
    my $freeGB    = diskSpace(".");

    #  Log that we're starting, and show the pretty-ified command.

    print STDERR "----------------------------------------\n";
    print STDERR "-- Starting command on ", scalar(localtime()), " with $freeGB GB free disk space\n";
    print STDERR "\n";
    print STDERR "    cd $dir\n";
    print STDERR "$pretty\n";

    #  Keep only the low 16 bits of the status.

    my $status = 0xffff & system($cmd);

    logFinished(".", $startTime);

    chdir($origDir);

    if ($status != 0) {
        reportRunError($status);
        return(1);
    }

    return(0);
}
1597
1598
1599
1600#  Duplicated in Grid_Cloud.pm to get around recursive 'use' statements.
1601
sub runCommandSilently ($$$) {
    my ($dir, $cmd, $critical) = @_;

    #  Runs $cmd, through system(), from directory $dir, without logging
    #  anything.  Returns 0 on success (or if there is no command to run) and
    #  1 if the command failed.  A failure is reported on STDERR only when
    #  $critical is non-zero.

    return(0)   if ($cmd eq "");

    #  Run from $dir, so the job is rooted in the correct location, then
    #  return to where we were.

    my $origDir = getcwd();
    chdir($dir);

    my $status = 0xffff & system($cmd);

    chdir($origDir);

    if ($status == 0) {       #  No errors, return no error.
        return(0);
    }

    if ($critical == 0) {     #  Not critical: just say that it failed.
        return(1);
    }

    #  Critical: show the command and why it failed.

    print STDERR prettifyCommand($cmd) . "\n";

    reportRunError($status);

    return(1);
}
1626
1627
1628
#  Search the directories in the PATH, in order, for an executable named $cmd.
#
#  Returns 'DIRECTORY/$cmd' for the first directory that has one, or undef if
#  none do.
sub findCommand ($) {
    my ($cmd) = @_;

    foreach my $dir (File::Spec->path) {
        my $candidate = "$dir/$cmd";

        return($candidate)   if (-x $candidate);
    }

    return(undef);
}
1641
1642
1643
#  Ask the shell's 'which' for the location of $exec.
#
#  Returns whatever 'which' printed, with leading and trailing whitespace
#  removed, or undef if it printed nothing.
sub findExecutable ($) {
    my ($exec) = @_;

    my $found = `which \"$exec\" 2> /dev/null`;

    $found =~ s/^\s+//;
    $found =~ s/\s+$//;

    return(($found eq "") ? undef : $found);
}
1655
1656
1657#  Use caExit() for transient errors, like not opening files, processes that die, etc.
sub caExit ($$) {
    my  $asm     = getGlobal("onExitNam");
    my  $msg     = shift @_;
    my  $log     = shift @_;
    my  $version = getGlobal("version");

    #  Parameters:
    #    msg - what went wrong; may be undef or empty
    #    log - a log file to show the end of; may be undef or empty
    #
    #  Reports the problem on STDERR, runs the 'onFailure' command (if set)
    #  from the 'onExitDir' directory, and exits with status 1.  Never returns.

    $msg = undef   if ($msg eq "");
    $log = undef   if ($log eq "");

    print STDERR "\n";
    print STDERR "ABORT:\n";
    print STDERR "ABORT: $version\n";
    print STDERR "ABORT: Don't panic, but a mostly harmless error occurred and Canu stopped.\n";
    print STDERR "ABORT: Try restarting.  If that doesn't work, ask for help.\n";
    print STDERR "ABORT:\n";
    print STDERR "ABORT:   $msg.\n"     if (defined($msg));
    print STDERR "ABORT:\n"             if (defined($msg));

    if (defined($log) && (-e $log)) {
        my  $df = diskSpace($log);

        print STDERR "ABORT: Disk space available:  $df GB\n";
        print STDERR "ABORT:\n";

        print STDERR "ABORT: Last 50 lines of the relevant log file ($log):\n";
        print STDERR "ABORT:\n";

        #  Run 'tail' directly, not through a shell, so a log name with
        #  spaces or shell metacharacters is passed through untouched.

        if (open(my $tail, "-|", "tail", "-n", "50", $log)) {
            while (my $line = <$tail>) {
                print STDERR "ABORT:   $line";
            }
            close($tail);
        }

        print STDERR "ABORT:\n";
    }

    my $fail = getGlobal('onFailure');
    if (defined($fail)) {
        runCommandSilently(getGlobal("onExitDir"), "$fail $asm", 0);
    }

    exit(1);
}
1703
1704
1705#  Use caFailure() for errors that definitely will require code changes to fix.
sub caFailure ($$) {
    my  $asm     = getGlobal("onExitNam");
    my  $msg     = shift @_;
    my  $log     = shift @_;
    my  $version = getGlobal("version");
    my  $trace   = longmess("Failed");

    #  Parameters:
    #    msg - what went wrong
    #    log - a log file to show the end of; may be undef or empty
    #
    #  Reports the problem and a stack trace on STDERR, runs the 'onFailure'
    #  command (if set) from the 'onExitDir' directory, and exits with
    #  status 1.  Never returns.

    $trace =~ s/\n/\nCRASH: /g;

    print STDERR "\n";
    print STDERR "CRASH:\n";
    print STDERR "CRASH: $version\n";
    print STDERR "CRASH: Please panic, this is abnormal.\n";
    print STDERR "CRASH:\n";
    print STDERR "CRASH:   $msg.\n";
    print STDERR "CRASH:\n";
    print STDERR "CRASH: $trace\n";
    #print STDERR "CRASH:\n";   #  $trace has an extra CRASH: at the end

    if (defined($log) && (-e $log)) {
        print STDERR "CRASH: Last 50 lines of the relevant log file ($log):\n";
        print STDERR "CRASH:\n";

        #  Run 'tail' directly, not through a shell, so a log name with
        #  spaces or shell metacharacters is passed through untouched.

        if (open(my $tail, "-|", "tail", "-n", "50", $log)) {
            while (my $line = <$tail>) {
                print STDERR "CRASH: $line";
            }
            close($tail);
        }

        print STDERR "CRASH:\n";
    } else {
        print STDERR "CRASH: No log file supplied.\n";
        print STDERR "CRASH:\n";
    }

    my $fail = getGlobal('onFailure');
    if (defined($fail)) {
        runCommandSilently(getGlobal("onExitDir"), "$fail $asm", 0);
    }

    exit(1);
}
1748
1749
17501;
1751