1 2############################################################################### 3 # 4 # This file is part of canu, a software program that assembles whole-genome 5 # sequencing reads into contigs. 6 # 7 # This software is based on: 8 # 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net) 9 # the 'kmer package' r1994 (http://kmer.sourceforge.net) 10 # 11 # Except as indicated otherwise, this is a 'United States Government Work', 12 # and is released in the public domain. 13 # 14 # File 'README.licenses' in the root directory of this distribution 15 # contains full conditions and disclaimers. 16 ## 17 18package canu::OverlapInCore; 19 20require Exporter; 21 22@ISA = qw(Exporter); 23@EXPORT = qw(overlapConfigure overlapCheck); 24 25use strict; 26use warnings "all"; 27no warnings "uninitialized"; 28 29use File::Path 2.08 qw(make_path remove_tree); 30 31use canu::Defaults; 32use canu::Execution; 33 34use canu::Report; 35 36use canu::Grid_Cloud; 37 38 39sub overlapConfigure ($$$) { 40 my $asm = shift @_; 41 my $tag = shift @_; 42 my $type = shift @_; 43 44 my $bin = getBinDirectory(); 45 my $cmd; 46 47 my $base; 48 my $path; 49 50 $base = "correction" if ($tag eq "cor"); 51 $base = "trimming" if ($tag eq "obt"); 52 $base = "unitigging" if ($tag eq "utg"); 53 54 $path = "$base/1-overlapper"; 55 56 caFailure("invalid type '$type'", undef) if (($type ne "partial") && ($type ne "normal")); 57 58 goto allDone if (fileExists("$path/overlap.sh") && fileExists("$path/$asm.partition.ovlbat") && fileExists("$path/$asm.partition.ovljob") && fileExists("$path/$asm.partition.ovlopt")); 59 goto allDone if (fileExists("$path/ovljob.files")); 60 goto allDone if ((-d "$base/$asm.ovlStore") || (fileExists("$base/$asm.ovlStore.tar.gz"))); 61 62 print STDERR "--\n"; 63 print STDERR "-- OVERLAPPER (normal) (correction) erate=", getGlobal("corOvlErrorRate"), "\n" if ($tag eq "cor"); 64 print STDERR "-- OVERLAPPER (normal) (trimming) erate=", getGlobal("obtOvlErrorRate"), "\n" if ($tag eq "obt"); 65 print STDERR "-- OVERLAPPER (normal) (assembly) erate=", getGlobal("utgOvlErrorRate"), "\n" if ($tag eq "utg"); 66 print STDERR "--\n"; 67 68 make_path("$path") if (! -d "$path"); 69 70 # overlapInCorePartition internally uses 'WORKING' outputs, and renames to the final 71 # version right before it exits. All we need to do here is check for existence of 72 # the output, and exit if the command fails. 73 74 fetchFile("$path/$asm.partition.ovlbat"); 75 fetchFile("$path/$asm.partition.ovljob"); 76 fetchFile("$path/$asm.partition.ovlopt"); 77 fetchFile("$path/overlap.sh"); # Fetch early, so we can delete if ovlbat etc don't exist. 78 79 if ((! -e "$path/$asm.partition.ovlbat") || 80 (! -e "$path/$asm.partition.ovljob") || 81 (! -e "$path/$asm.partition.ovlopt")) { 82 83 # These used to be runCA options, but were removed in canu. They were used mostly for 84 # illumina-pacbio correction, but were also used (or could have been used) during the 85 # Salmon assembly when overlaps were computed differently depending on the libraries 86 # involved (and was run manually). These are left in for documentation. 87 # 88 #my $checkLibrary = getGlobal("${tag}CheckLibrary"); # -C 89 #my $hashLibrary = getGlobal("${tag}HashLibrary"); # -H $hashLibrary 90 #my $refLibrary = getGlobal("${tag}RefLibrary"); # -R $refLibrary 91 92 $cmd = "$bin/overlapInCorePartition \\\n"; 93 $cmd .= " -S ../../$asm.seqStore \\\n"; 94 $cmd .= " -hl " . getGlobal("${tag}OvlHashBlockLength") . " \\\n"; 95 $cmd .= " -rl " . getGlobal("${tag}OvlRefBlockLength") . " \\\n"; 96 $cmd .= " -ol " . getGlobal("minOverlapLength") . " \\\n"; 97 $cmd .= " -o ./$asm.partition \\\n"; 98 $cmd .= "> ./$asm.partition.err 2>&1"; 99 100 if (runCommand($path, $cmd)) { 101 caExit("failed partition for overlapper", undef); 102 } 103 104 stashFile("$path/$asm.partition.ovlbat"); 105 stashFile("$path/$asm.partition.ovljob"); 106 stashFile("$path/$asm.partition.ovlopt"); 107 108 unlink "$path/overlap.sh"; 109 } 110 111 open(BAT, "< $path/$asm.partition.ovlbat") or caExit("can't open '$path/$asm.partition.ovlbat' for reading: $!", undef); 112 open(JOB, "< $path/$asm.partition.ovljob") or caExit("can't open '$path/$asm.partition.ovljob' for reading: $!", undef); 113 open(OPT, "< $path/$asm.partition.ovlopt") or caExit("can't open '$path/$asm.partition.ovlopt' for reading: $!", undef); 114 115 my @bat = <BAT>; chomp @bat; 116 my @job = <JOB>; chomp @job; 117 my @opt = <OPT>; chomp @opt; 118 119 close(BAT); 120 close(JOB); 121 close(OPT); 122 123 if (! -e "$path/overlap.sh") { 124 my $merSize = getGlobal("${tag}OvlMerSize"); 125 126 #my $hashLibrary = getGlobal("${tag}OvlHashLibrary"); 127 #my $refLibrary = getGlobal("${tag}OvlRefLibrary"); 128 129 # Create a script to run overlaps. We make a giant job array for this -- we need to know 130 # hashBeg, hashEnd, refBeg and refEnd -- from that we compute batchName and jobName. 131 132 my $hashBits = getGlobal("${tag}OvlHashBits"); 133 my $hashLoad = getGlobal("${tag}OvlHashLoad"); 134 135 open(F, "> $path/overlap.sh") or caExit("can't open '$path/overlap.sh' for writing: $!", undef); 136 print F "#!" . getGlobal("shell") . "\n"; 137 print F "\n"; 138 print F getBinDirectoryShellCode(); 139 print F "\n"; 140 print F setWorkDirectoryShellCode($path); 141 print F fetchSeqStoreShellCode($asm, $path, ""); 142 print F "\n"; 143 print F getJobIDShellCode(); 144 print F "\n"; 145 146 for (my $ii=1; $ii<=scalar(@bat); $ii++) { 147 print F "if [ \$jobid -eq $ii ] ; then\n"; 148 print F " bat=\"$bat[$ii-1]\"\n"; # Needed to mkdir. 149 print F " job=\"$bat[$ii-1]/$job[$ii-1]\"\n"; # Needed to simplify overlapCheck() below. 150 print F " opt=\"$opt[$ii-1]\"\n"; 151 print F "fi\n"; 152 print F "\n"; 153 } 154 155 print F "\n"; 156 print F "if [ ! -d ./\$bat ]; then\n"; 157 print F " mkdir ./\$bat\n"; 158 print F "fi\n"; 159 print F "\n"; 160 print F fileExistsShellCode("exists", $path, "\$job.ovb"); 161 print F "if [ \$exists = true ] ; then\n"; 162 print F " echo Job previously completed successfully.\n"; 163 print F " exit\n"; 164 print F "fi\n"; 165 print F "\n"; 166 print F "# Fetch the frequent kmers, if needed.\n"; 167 print F "if [ ! -e ../0-mercounts/$asm.ms$merSize.dump ] ; then\n"; 168 print F " mkdir -p ../0-mercounts\n"; 169 print F " cd ../0-mercounts\n"; 170 print F fetchFileShellCode("$base/0-mercounts", "$asm.ms$merSize.dump", " "); 171 print F " cd -\n"; 172 print F "fi\n"; 173 print F "\n"; 174 print F "\n"; 175 print F "\$bin/overlapInCore \\\n"; 176 print F " -partial \\\n" if ($type eq "partial"); 177 print F " -t ", getGlobal("${tag}OvlThreads"), " \\\n"; 178 print F " -k $merSize \\\n"; 179 print F " -k ../0-mercounts/$asm.ms$merSize.dump \\\n"; 180 print F " --hashbits $hashBits \\\n"; 181 print F " --hashload $hashLoad \\\n"; 182 print F " --maxerate ", getGlobal("corOvlErrorRate"), " \\\n" if ($tag eq "cor"); # Explicitly using proper name for grepability. 183 print F " --maxerate ", getGlobal("obtOvlErrorRate"), " \\\n" if ($tag eq "obt"); 184 print F " --maxerate ", getGlobal("utgOvlErrorRate"), " \\\n" if ($tag eq "utg"); 185 print F " --minlength ", getGlobal("minOverlapLength"), " \\\n"; 186 print F " --minkmers \\\n" if (defined(getGlobal("${tag}OvlFilter")) && getGlobal("${tag}OvlFilter")==1); 187 print F " \$opt \\\n"; 188 print F " -o ./\$job.ovb.WORKING \\\n"; 189 print F " -s ./\$job.stats \\\n"; 190 #print F " -H $hashLibrary \\\n" if ($hashLibrary ne "0"); 191 #print F " -R $refLibrary \\\n" if ($refLibrary ne "0"); 192 print F " ../../$asm.seqStore \\\n"; 193 print F "&& \\\n"; 194 print F "mv ./\$job.ovb.WORKING ./\$job.ovb\n"; 195 print F "\n"; 196 print F stashFileShellCode("$base/1-overlapper/", "\$job.ovb", ""); 197 print F stashFileShellCode("$base/1-overlapper/", "\$job.oc", ""); 198 print F stashFileShellCode("$base/1-overlapper/", "\$job.stats", ""); 199 print F "\n"; 200 print F "exit 0\n"; 201 close(F); 202 203 makeExecutable("$path/overlap.sh"); 204 stashFile("$path/overlap.sh"); 205 } 206 207 my $jobs = scalar(@job); 208 my $batchName = $bat[$jobs-1]; chomp $batchName; 209 my $jobName = $job[$jobs-1]; chomp $jobName; 210 211 my $numJobs = 0; 212 open(F, "< $path/overlap.sh") or caExit("can't open '$path/overlap.sh' for reading: $!", undef); 213 while (<F>) { 214 $numJobs++ if (m/^\s+job=/); 215 } 216 close(F); 217 print STDERR "--\n"; 218 print STDERR "-- Configured $numJobs overlapInCore jobs.\n"; 219 220 finishStage: 221 generateReport($asm); 222 resetIteration("$tag-overlapConfigure"); 223 224 allDone: 225 stopAfter("overlapConfigure"); 226} 227 228 229 230sub reportSumMeanStdDev (@) { 231 my $sum = 0; 232 my $mean = 0; 233 my $stddev = 0; 234 my $n = scalar(@_); 235 my $formatted; 236 237 $sum += $_ foreach (@_); 238 239 $mean = $sum / $n; 240 241 $stddev += ($_ - $mean) * ($_ - $mean) foreach (@_); 242 $stddev = int(1000 * sqrt($stddev / ($n-1))) / 1000 if ($n > 1); 243 244 $formatted = substr(" $mean", -10) . " +- $stddev"; 245 246 return($sum, $formatted); 247} 248 249 250sub reportOverlapStats ($$@) { 251 my $base = shift @_; 252 my $asm = shift @_; 253 my @statsJobs = @_; 254 255 my @hitsWithoutOlaps; 256 my @hitsWithOlaps; 257 my @multiOlaps; 258 my @totalOlaps; 259 my @containedOlaps; 260 my @dovetailOlaps; 261 my @shortReject; 262 my @longReject; 263 264 foreach my $s (@statsJobs) { 265 fetchFile("$base/$s"); 266 267 open(F, "< $base/$s") or caExit("can't open '$base/$s' for reading: $!", undef); 268 269 $_ = <F>; push @hitsWithoutOlaps, $1 if (m/^\s*Kmer\shits\swithout\solaps\s=\s(\d+)$/); 270 $_ = <F>; push @hitsWithOlaps, $1 if (m/^\s*Kmer\shits\swith\solaps\s=\s(\d+)$/); 271 $_ = <F>; push @multiOlaps, $1 if (m/^\s*Multiple\soverlaps\/pair\s=\s(\d+)$/); 272 $_ = <F>; push @totalOlaps, $1 if (m/^\s*Total\soverlaps\sproduced\s=\s(\d+)$/); 273 $_ = <F>; push @containedOlaps, $1 if (m/^\s*Contained\soverlaps\s=\s(\d+)$/); 274 $_ = <F>; push @dovetailOlaps, $1 if (m/^\s*Dovetail\soverlaps\s=\s(\d+)$/); 275 $_ = <F>; push @shortReject, $1 if (m/^\s*Rejected\sby\sshort\swindow\s=\s(\d+)$/); 276 $_ = <F>; push @longReject, $1 if (m/^\s*Rejected\sby\slong\swindow\s=\s(\d+)$/); 277 278 close(F); 279 } 280 281 printf STDERR "--\n"; 282 printf STDERR "-- overlapInCore compute '$base/1-overlapper':\n"; 283 printf STDERR "-- kmer hits\n"; 284 printf STDERR "-- with no overlap %12d %s\n", reportSumMeanStdDev(@hitsWithoutOlaps); 285 printf STDERR "-- with an overlap %12d %s\n", reportSumMeanStdDev(@hitsWithOlaps); 286 printf STDERR "--\n"; 287 printf STDERR "-- overlaps %12d %s\n", reportSumMeanStdDev(@totalOlaps); 288 printf STDERR "-- contained %12d %s\n", reportSumMeanStdDev(@containedOlaps); 289 printf STDERR "-- dovetail %12d %s\n", reportSumMeanStdDev(@dovetailOlaps); 290 printf STDERR "--\n"; 291 printf STDERR "-- overlaps rejected\n"; 292 printf STDERR "-- multiple per pair %12d %s\n", reportSumMeanStdDev(@multiOlaps); 293 printf STDERR "-- bad short window %12d %s\n", reportSumMeanStdDev(@shortReject); 294 printf STDERR "-- bad long window %12d %s\n", reportSumMeanStdDev(@longReject); 295} 296 297 298# Check that the overlapper jobs properly executed. If not, 299# complain, but don't help the user fix things. 300# 301sub overlapCheck ($$$) { 302 my $asm = shift @_; 303 my $tag = shift @_; 304 my $type = shift @_; 305 my $attempt = getGlobal("canuIteration"); 306 307 my $base; 308 my $path; 309 310 $base = "correction" if ($tag eq "cor"); 311 $base = "trimming" if ($tag eq "obt"); 312 $base = "unitigging" if ($tag eq "utg"); 313 314 $path = "$base/1-overlapper"; 315 316 goto allDone if (fileExists("$path/ovljob.files")); 317 goto allDone if ((-d "$base/$asm.ovlStore") || (fileExists("$base/$asm.ovlStore.tar.gz"))); 318 319 # Figure out if all the tasks finished correctly. 320 321 my $currentJobID = 1; 322 my @successJobs; 323 my @statsJobs; 324 my @miscJobs; 325 my @failedJobs; 326 my $failureMessage = ""; 327 328 fetchFile("$path/overlap.sh"); 329 330 open(F, "< $path/overlap.sh") or caExit("can't open '$path/overlap.sh' for reading: $!", undef); 331 332 while (<F>) { 333 if (m/^\s+job=\"(\d+\/\d+)\"$/) { 334 if (fileExists("$path/$1.ovb.gz")) { 335 push @successJobs, "1-overlapper/$1.ovb.gz\n"; # Dumped to a file, so include \n 336 push @statsJobs, "1-overlapper/$1.stats"; # Used here, don't include \n 337 push @miscJobs, "1-overlapper/$1.stats\n"; 338 push @miscJobs, "1-overlapper/$1.oc\n"; 339 340 } elsif (fileExists("$path/$1.ovb")) { 341 push @successJobs, "1-overlapper/$1.ovb\n"; 342 push @statsJobs, "1-overlapper/$1.stats"; 343 push @miscJobs, "1-overlapper/$1.stats\n"; 344 push @miscJobs, "1-overlapper/$1.oc\n"; 345 346 } elsif (fileExists("$path/$1.ovb.bz2")) { 347 push @successJobs, "1-overlapper/$1.ovb.bz2\n"; 348 push @statsJobs, "1-overlapper/$1.stats"; 349 push @miscJobs, "1-overlapper/$1.stats\n"; 350 push @miscJobs, "1-overlapper/$1.oc\n"; 351 352 } elsif (fileExists("$path/$1.ovb.xz")) { 353 push @successJobs, "1-overlapper/$1.ovb.xz\n"; 354 push @statsJobs, "1-overlapper/$1.stats"; 355 push @miscJobs, "1-overlapper/$1.stats\n"; 356 push @miscJobs, "1-overlapper/$1.oc\n"; 357 358 } else { 359 $failureMessage .= "-- job $path/$1.ovb FAILED.\n"; 360 push @failedJobs, $currentJobID; 361 } 362 363 $currentJobID++; 364 } 365 } 366 367 close(F); 368 369 # Failed jobs, retry. 370 371 if (scalar(@failedJobs) > 0) { 372 373 # If too many attempts, give up. 374 375 if ($attempt >= getGlobal("canuIterationMax")) { 376 print STDERR "--\n"; 377 print STDERR "-- Overlap jobs failed, tried $attempt times, giving up.\n"; 378 print STDERR $failureMessage; 379 print STDERR "--\n"; 380 caExit(undef, undef); 381 } 382 383 if ($attempt > 0) { 384 print STDERR "--\n"; 385 print STDERR "-- Overlap jobs failed, retry.\n"; 386 print STDERR $failureMessage; 387 print STDERR "--\n"; 388 } 389 390 # Otherwise, run some jobs. 391 392 generateReport($asm); 393 394 submitOrRunParallelJob($asm, "${tag}ovl", $path, "overlap", @failedJobs); 395 return; 396 } 397 398 finishStage: 399 print STDERR "-- Found ", scalar(@successJobs), " overlapInCore output files.\n"; 400 401 open(L, "> $path/ovljob.files") or caExit("can't open '$path/ovljob.files' for writing: $!", undef); 402 print L @successJobs; 403 close(L); 404 405 open(L, "> $path/ovljob.more.files") or caExit("can't open '$path/ovljob.more.files' for writing: $!", undef); 406 print L @miscJobs; 407 close(L); 408 409 stashFile("$path/ovljob.files"); 410 stashFile("$path/ovljob.more.files"); 411 412 reportOverlapStats($base, $asm, @statsJobs); 413 414 generateReport($asm); 415 resetIteration("$tag-overlapCheck"); 416 417 allDone: 418 stopAfter("overlap"); 419} 420 421