1
2###############################################################################
3 #
4 #  This file is part of canu, a software program that assembles whole-genome
5 #  sequencing reads into contigs.
6 #
7 #  This software is based on:
8 #    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 #    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 #
11 #  Except as indicated otherwise, this is a 'United States Government Work',
12 #  and is released in the public domain.
13 #
14 #  File 'README.licenses' in the root directory of this distribution
15 #  contains full conditions and disclaimers.
16 ##
17
18package canu::OverlapInCore;
19
20require Exporter;
21
22@ISA    = qw(Exporter);
23@EXPORT = qw(overlapConfigure overlapCheck);
24
25use strict;
26use warnings "all";
27no  warnings "uninitialized";
28
29use File::Path 2.08 qw(make_path remove_tree);
30
31use canu::Defaults;
32use canu::Execution;
33
34use canu::Report;
35
36use canu::Grid_Cloud;
37
38
39sub overlapConfigure ($$$) {
40    my $asm     = shift @_;
41    my $tag     = shift @_;
42    my $type    = shift @_;
43
44    my $bin  = getBinDirectory();
45    my $cmd;
46
47    my $base;
48    my $path;
49
50    $base = "correction"  if ($tag eq "cor");
51    $base = "trimming"    if ($tag eq "obt");
52    $base = "unitigging"  if ($tag eq "utg");
53
54    $path = "$base/1-overlapper";
55
56    caFailure("invalid type '$type'", undef)  if (($type ne "partial") && ($type ne "normal"));
57
58    goto allDone   if (fileExists("$path/overlap.sh") && fileExists("$path/$asm.partition.ovlbat") && fileExists("$path/$asm.partition.ovljob") && fileExists("$path/$asm.partition.ovlopt"));
59    goto allDone   if (fileExists("$path/ovljob.files"));
60    goto allDone   if ((-d "$base/$asm.ovlStore") || (fileExists("$base/$asm.ovlStore.tar.gz")));
61
62    print STDERR "--\n";
63    print STDERR "-- OVERLAPPER (normal) (correction) erate=", getGlobal("corOvlErrorRate"), "\n"  if ($tag eq "cor");
64    print STDERR "-- OVERLAPPER (normal) (trimming) erate=",   getGlobal("obtOvlErrorRate"), "\n"    if ($tag eq "obt");
65    print STDERR "-- OVERLAPPER (normal) (assembly) erate=",   getGlobal("utgOvlErrorRate"), "\n"    if ($tag eq "utg");
66    print STDERR "--\n";
67
68    make_path("$path") if (! -d "$path");
69
70    #  overlapInCorePartition internally uses 'WORKING' outputs, and renames to the final
71    #  version right before it exits.  All we need to do here is check for existence of
72    #  the output, and exit if the command fails.
73
74    fetchFile("$path/$asm.partition.ovlbat");
75    fetchFile("$path/$asm.partition.ovljob");
76    fetchFile("$path/$asm.partition.ovlopt");
77    fetchFile("$path/overlap.sh");              #  Fetch early, so we can delete if ovlbat etc don't exist.
78
79    if ((! -e "$path/$asm.partition.ovlbat") ||
80        (! -e "$path/$asm.partition.ovljob") ||
81        (! -e "$path/$asm.partition.ovlopt")) {
82
83        #  These used to be runCA options, but were removed in canu.  They were used mostly for
84        #  illumina-pacbio correction, but were also used (or could have been used) during the
85        #  Salmon assembly when overlaps were computed differently depending on the libraries
86        #  involved (and was run manually).  These are left in for documentation.
87        #
88        #my $checkLibrary       = getGlobal("${tag}CheckLibrary");    #  -C
89        #my $hashLibrary        = getGlobal("${tag}HashLibrary");     #  -H $hashLibrary
90        #my $refLibrary         = getGlobal("${tag}RefLibrary");      #  -R $refLibrary
91
92        $cmd  = "$bin/overlapInCorePartition \\\n";
93        $cmd .= " -S  ../../$asm.seqStore \\\n";
94        $cmd .= " -hl " . getGlobal("${tag}OvlHashBlockLength") . " \\\n";
95        $cmd .= " -rl " . getGlobal("${tag}OvlRefBlockLength")  . " \\\n";
96        $cmd .= " -ol " . getGlobal("minOverlapLength") . " \\\n";
97        $cmd .= " -o  ./$asm.partition \\\n";
98        $cmd .= "> ./$asm.partition.err 2>&1";
99
100        if (runCommand($path, $cmd)) {
101            caExit("failed partition for overlapper", undef);
102        }
103
104        stashFile("$path/$asm.partition.ovlbat");
105        stashFile("$path/$asm.partition.ovljob");
106        stashFile("$path/$asm.partition.ovlopt");
107
108        unlink "$path/overlap.sh";
109    }
110
111    open(BAT, "< $path/$asm.partition.ovlbat") or caExit("can't open '$path/$asm.partition.ovlbat' for reading: $!", undef);
112    open(JOB, "< $path/$asm.partition.ovljob") or caExit("can't open '$path/$asm.partition.ovljob' for reading: $!", undef);
113    open(OPT, "< $path/$asm.partition.ovlopt") or caExit("can't open '$path/$asm.partition.ovlopt' for reading: $!", undef);
114
115    my @bat = <BAT>;  chomp @bat;
116    my @job = <JOB>;  chomp @job;
117    my @opt = <OPT>;  chomp @opt;
118
119    close(BAT);
120    close(JOB);
121    close(OPT);
122
123    if (! -e "$path/overlap.sh") {
124        my $merSize      = getGlobal("${tag}OvlMerSize");
125
126        #my $hashLibrary  = getGlobal("${tag}OvlHashLibrary");
127        #my $refLibrary   = getGlobal("${tag}OvlRefLibrary");
128
129        #  Create a script to run overlaps.  We make a giant job array for this -- we need to know
130        #  hashBeg, hashEnd, refBeg and refEnd -- from that we compute batchName and jobName.
131
132        my $hashBits       = getGlobal("${tag}OvlHashBits");
133        my $hashLoad       = getGlobal("${tag}OvlHashLoad");
134
135        open(F, "> $path/overlap.sh") or caExit("can't open '$path/overlap.sh' for writing: $!", undef);
136        print F "#!" . getGlobal("shell") . "\n";
137        print F "\n";
138        print F getBinDirectoryShellCode();
139        print F "\n";
140        print F setWorkDirectoryShellCode($path);
141        print F fetchSeqStoreShellCode($asm, $path, "");
142        print F "\n";
143        print F getJobIDShellCode();
144        print F "\n";
145
146        for (my $ii=1; $ii<=scalar(@bat); $ii++) {
147            print F "if [ \$jobid -eq $ii ] ; then\n";
148            print F "  bat=\"$bat[$ii-1]\"\n";            #  Needed to mkdir.
149            print F "  job=\"$bat[$ii-1]/$job[$ii-1]\"\n";  #  Needed to simplify overlapCheck() below.
150            print F "  opt=\"$opt[$ii-1]\"\n";
151            print F "fi\n";
152            print F "\n";
153        }
154
155        print F "\n";
156        print F "if [ ! -d ./\$bat ]; then\n";
157        print F "  mkdir ./\$bat\n";
158        print F "fi\n";
159        print F "\n";
160        print F fileExistsShellCode("exists", $path, "\$job.ovb");
161        print F "if [ \$exists = true ] ; then\n";
162        print F "  echo Job previously completed successfully.\n";
163        print F "  exit\n";
164        print F "fi\n";
165        print F "\n";
166        print F "#  Fetch the frequent kmers, if needed.\n";
167        print F "if [ ! -e ../0-mercounts/$asm.ms$merSize.dump ] ; then\n";
168        print F "  mkdir -p ../0-mercounts\n";
169        print F "  cd ../0-mercounts\n";
170        print F fetchFileShellCode("$base/0-mercounts", "$asm.ms$merSize.dump", "  ");
171        print F "  cd -\n";
172        print F "fi\n";
173        print F "\n";
174        print F "\n";
175        print F "\$bin/overlapInCore \\\n";
176        print F "  -partial \\\n"  if ($type eq "partial");
177        print F "  -t ", getGlobal("${tag}OvlThreads"), " \\\n";
178        print F "  -k $merSize \\\n";
179        print F "  -k ../0-mercounts/$asm.ms$merSize.dump \\\n";
180        print F "  --hashbits $hashBits \\\n";
181        print F "  --hashload $hashLoad \\\n";
182        print F "  --maxerate  ", getGlobal("corOvlErrorRate"), " \\\n"  if ($tag eq "cor");   #  Explicitly using proper name for grepability.
183        print F "  --maxerate  ", getGlobal("obtOvlErrorRate"), " \\\n"  if ($tag eq "obt");
184        print F "  --maxerate  ", getGlobal("utgOvlErrorRate"), " \\\n"  if ($tag eq "utg");
185        print F "  --minlength ", getGlobal("minOverlapLength"), " \\\n";
186        print F "  --minkmers \\\n" if (defined(getGlobal("${tag}OvlFilter")) && getGlobal("${tag}OvlFilter")==1);
187        print F "  \$opt \\\n";
188        print F "  -o ./\$job.ovb.WORKING \\\n";
189        print F "  -s ./\$job.stats \\\n";
190        #print F "  -H $hashLibrary \\\n" if ($hashLibrary ne "0");
191        #print F "  -R $refLibrary \\\n"  if ($refLibrary  ne "0");
192        print F "  ../../$asm.seqStore \\\n";
193        print F "&& \\\n";
194        print F "mv ./\$job.ovb.WORKING ./\$job.ovb\n";
195        print F "\n";
196        print F stashFileShellCode("$base/1-overlapper/", "\$job.ovb",   "");
197        print F stashFileShellCode("$base/1-overlapper/", "\$job.oc",    "");
198        print F stashFileShellCode("$base/1-overlapper/", "\$job.stats", "");
199        print F "\n";
200        print F "exit 0\n";
201        close(F);
202
203        makeExecutable("$path/overlap.sh");
204        stashFile("$path/overlap.sh");
205    }
206
207    my $jobs      = scalar(@job);
208    my $batchName = $bat[$jobs-1];  chomp $batchName;
209    my $jobName   = $job[$jobs-1];  chomp $jobName;
210
211    my $numJobs = 0;
212    open(F, "< $path/overlap.sh") or caExit("can't open '$path/overlap.sh' for reading: $!", undef);
213    while (<F>) {
214        $numJobs++  if (m/^\s+job=/);
215    }
216    close(F);
217    print STDERR "--\n";
218    print STDERR "-- Configured $numJobs overlapInCore jobs.\n";
219
220  finishStage:
221    generateReport($asm);
222    resetIteration("$tag-overlapConfigure");
223
224  allDone:
225    stopAfter("overlapConfigure");
226}
227
228
229
230sub reportSumMeanStdDev (@) {
231    my $sum    = 0;
232    my $mean   = 0;
233    my $stddev = 0;
234    my $n      = scalar(@_);
235    my $formatted;
236
237    $sum += $_  foreach (@_);
238
239    $mean = $sum / $n;
240
241    $stddev += ($_ - $mean) * ($_ - $mean)  foreach (@_);
242    $stddev  = int(1000 * sqrt($stddev / ($n-1))) / 1000  if ($n > 1);
243
244    $formatted  = substr("         $mean", -10) . " +- $stddev";
245
246    return($sum, $formatted);
247}
248
249
250sub reportOverlapStats ($$@) {
251    my $base      = shift @_;
252    my $asm       = shift @_;
253    my @statsJobs = @_;
254
255    my @hitsWithoutOlaps;
256    my @hitsWithOlaps;
257    my @multiOlaps;
258    my @totalOlaps;
259    my @containedOlaps;
260    my @dovetailOlaps;
261    my @shortReject;
262    my @longReject;
263
264    foreach my $s (@statsJobs) {
265        fetchFile("$base/$s");
266
267        open(F, "< $base/$s") or caExit("can't open '$base/$s' for reading: $!", undef);
268
269        $_ = <F>;  push @hitsWithoutOlaps, $1  if (m/^\s*Kmer\shits\swithout\solaps\s=\s(\d+)$/);
270        $_ = <F>;  push @hitsWithOlaps, $1     if (m/^\s*Kmer\shits\swith\solaps\s=\s(\d+)$/);
271        $_ = <F>;  push @multiOlaps, $1        if (m/^\s*Multiple\soverlaps\/pair\s=\s(\d+)$/);
272        $_ = <F>;  push @totalOlaps, $1        if (m/^\s*Total\soverlaps\sproduced\s=\s(\d+)$/);
273        $_ = <F>;  push @containedOlaps, $1    if (m/^\s*Contained\soverlaps\s=\s(\d+)$/);
274        $_ = <F>;  push @dovetailOlaps, $1     if (m/^\s*Dovetail\soverlaps\s=\s(\d+)$/);
275        $_ = <F>;  push @shortReject, $1       if (m/^\s*Rejected\sby\sshort\swindow\s=\s(\d+)$/);
276        $_ = <F>;  push @longReject, $1        if (m/^\s*Rejected\sby\slong\swindow\s=\s(\d+)$/);
277
278        close(F);
279    }
280
281    printf STDERR "--\n";
282    printf STDERR "-- overlapInCore compute '$base/1-overlapper':\n";
283    printf STDERR "--   kmer hits\n";
284    printf STDERR "--     with no overlap     %12d  %s\n", reportSumMeanStdDev(@hitsWithoutOlaps);
285    printf STDERR "--     with an overlap     %12d  %s\n", reportSumMeanStdDev(@hitsWithOlaps);
286    printf STDERR "--\n";
287    printf STDERR "--   overlaps              %12d  %s\n", reportSumMeanStdDev(@totalOlaps);
288    printf STDERR "--     contained           %12d  %s\n", reportSumMeanStdDev(@containedOlaps);
289    printf STDERR "--     dovetail            %12d  %s\n", reportSumMeanStdDev(@dovetailOlaps);
290    printf STDERR "--\n";
291    printf STDERR "--   overlaps rejected\n";
292    printf STDERR "--     multiple per pair   %12d  %s\n", reportSumMeanStdDev(@multiOlaps);
293    printf STDERR "--     bad short window    %12d  %s\n", reportSumMeanStdDev(@shortReject);
294    printf STDERR "--     bad long window     %12d  %s\n", reportSumMeanStdDev(@longReject);
295}
296
297
298#  Check that the overlapper jobs properly executed.  If not,
299#  complain, but don't help the user fix things.
300#
301sub overlapCheck ($$$) {
302    my $asm     = shift @_;
303    my $tag     = shift @_;
304    my $type    = shift @_;
305    my $attempt = getGlobal("canuIteration");
306
307    my $base;
308    my $path;
309
310    $base = "correction"  if ($tag eq "cor");
311    $base = "trimming"    if ($tag eq "obt");
312    $base = "unitigging"  if ($tag eq "utg");
313
314    $path = "$base/1-overlapper";
315
316    goto allDone   if (fileExists("$path/ovljob.files"));
317    goto allDone   if ((-d "$base/$asm.ovlStore") || (fileExists("$base/$asm.ovlStore.tar.gz")));
318
319    #  Figure out if all the tasks finished correctly.
320
321    my $currentJobID   = 1;
322    my @successJobs;
323    my @statsJobs;
324    my @miscJobs;
325    my @failedJobs;
326    my $failureMessage = "";
327
328    fetchFile("$path/overlap.sh");
329
330    open(F, "< $path/overlap.sh") or caExit("can't open '$path/overlap.sh' for reading: $!", undef);
331
332    while (<F>) {
333        if (m/^\s+job=\"(\d+\/\d+)\"$/) {
334            if      (fileExists("$path/$1.ovb.gz")) {
335                push @successJobs, "1-overlapper/$1.ovb.gz\n";   #  Dumped to a file, so include \n
336                push @statsJobs,   "1-overlapper/$1.stats";      #  Used here, don't include \n
337                push @miscJobs,    "1-overlapper/$1.stats\n";
338                push @miscJobs,    "1-overlapper/$1.oc\n";
339
340            } elsif (fileExists("$path/$1.ovb")) {
341                push @successJobs, "1-overlapper/$1.ovb\n";
342                push @statsJobs,   "1-overlapper/$1.stats";
343                push @miscJobs,    "1-overlapper/$1.stats\n";
344                push @miscJobs,    "1-overlapper/$1.oc\n";
345
346            } elsif (fileExists("$path/$1.ovb.bz2")) {
347                push @successJobs, "1-overlapper/$1.ovb.bz2\n";
348                push @statsJobs,   "1-overlapper/$1.stats";
349                push @miscJobs,    "1-overlapper/$1.stats\n";
350                push @miscJobs,    "1-overlapper/$1.oc\n";
351
352            } elsif (fileExists("$path/$1.ovb.xz")) {
353                push @successJobs, "1-overlapper/$1.ovb.xz\n";
354                push @statsJobs,   "1-overlapper/$1.stats";
355                push @miscJobs,    "1-overlapper/$1.stats\n";
356                push @miscJobs,    "1-overlapper/$1.oc\n";
357
358            } else {
359                $failureMessage .= "--   job $path/$1.ovb FAILED.\n";
360                push @failedJobs, $currentJobID;
361            }
362
363            $currentJobID++;
364        }
365    }
366
367    close(F);
368
369    #  Failed jobs, retry.
370
371    if (scalar(@failedJobs) > 0) {
372
373        #  If too many attempts, give up.
374
375        if ($attempt >= getGlobal("canuIterationMax")) {
376            print STDERR "--\n";
377            print STDERR "-- Overlap jobs failed, tried $attempt times, giving up.\n";
378            print STDERR $failureMessage;
379            print STDERR "--\n";
380            caExit(undef, undef);
381        }
382
383        if ($attempt > 0) {
384            print STDERR "--\n";
385            print STDERR "-- Overlap jobs failed, retry.\n";
386            print STDERR $failureMessage;
387            print STDERR "--\n";
388        }
389
390        #  Otherwise, run some jobs.
391
392        generateReport($asm);
393
394        submitOrRunParallelJob($asm, "${tag}ovl", $path, "overlap", @failedJobs);
395        return;
396    }
397
398  finishStage:
399    print STDERR "-- Found ", scalar(@successJobs), " overlapInCore output files.\n";
400
401    open(L, "> $path/ovljob.files") or caExit("can't open '$path/ovljob.files' for writing: $!", undef);
402    print L @successJobs;
403    close(L);
404
405    open(L, "> $path/ovljob.more.files") or caExit("can't open '$path/ovljob.more.files' for writing: $!", undef);
406    print L @miscJobs;
407    close(L);
408
409    stashFile("$path/ovljob.files");
410    stashFile("$path/ovljob.more.files");
411
412    reportOverlapStats($base, $asm, @statsJobs);
413
414    generateReport($asm);
415    resetIteration("$tag-overlapCheck");
416
417  allDone:
418    stopAfter("overlap");
419}
420
421