1 2############################################################################### 3 # 4 # This file is part of canu, a software program that assembles whole-genome 5 # sequencing reads into contigs. 6 # 7 # This software is based on: 8 # 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net) 9 # the 'kmer package' r1994 (http://kmer.sourceforge.net) 10 # 11 # Except as indicated otherwise, this is a 'United States Government Work', 12 # and is released in the public domain. 13 # 14 # File 'README.licenses' in the root directory of this distribution 15 # contains full conditions and disclaimers. 16 ## 17 18package canu::Defaults; 19 20require Exporter; 21 22@ISA = qw(Exporter); 23@EXPORT = qw(getCommandLineOptions 24 addCommandLineOption 25 removeHaplotypeOptions 26 addCommandLineError 27 writeLog 28 diskSpace 29 printOptions 30 printHelp 31 printCitation 32 setParametersFromFile 33 setParametersFromCommandLine 34 checkJava 35 checkMinimap 36 checkGnuplot 37 adjustMemoryValue 38 displayMemoryValue 39 adjustGenomeSize 40 displayGenomeSize 41 checkParameters 42 getGlobal 43 setGlobal 44 setGlobalIfUndef 45 setDefaults 46 setVersion); 47 48use strict; 49use warnings "all"; 50no warnings "uninitialized"; 51 52use Cwd qw(getcwd abs_path); 53use Carp qw(cluck); 54use Sys::Hostname; 55use Text::Wrap; 56use File::Basename; # dirname 57 58# Except for some direct access in setDefaults(), the only allowed access 59# method to these is through setGlobal() and getGlobal(). 60# 61my %global; # $global{$key} = parameter value ($key for all of 62my %synops; # $synops{$key} = parameter description these is always 63my %synnam; # $synnam{$key} = case-preserved key name lowercase) 64 65my @cLineOpts; 66my $specLog = ""; 67 68 69 70# Helper function to append a command line error to the list of command line 71# errors. 72# 73sub addCommandLineError($) { 74 $global{'errors'} .= shift @_; 75} 76 77 78# Get the value of a parameter. The parameter is case insensitive. 79# 80# We cannot use the normal caFailure here due to a cyclic 'use' structure, 81# and are forced to fail ungracefully on errors. These errors should not 82# depend on user input. 83# 84sub getGlobal ($) { 85 my $var = shift @_; 86 $var =~ tr/A-Z/a-z/; 87 88 if (!exists($global{$var})) { 89 print STDERR "================================================================================\n"; 90 print STDERR "Unknown parameter '$var' accessed. Stack trace:\n"; 91 cluck; 92 exit(1); 93 } 94 95 return($global{$var}); 96} 97 98 99sub setGlobalIfUndef ($$) { 100 my $var = shift @_; 101 my $val = shift @_; 102 103 if (!defined(getGlobal($var))) { 104 setGlobal($var, $val); 105 } 106} 107 108 109# Set the value of a parameter. The parameter is case insensitive. 110# 111# This is a bit complicated by handling of parameter alises (meta-options) 112# and handling of deprecated options. 113# 114sub setGlobal ($$); # A prototype so we can call ourself recursively. 115sub setGlobal ($$) { 116 my $VAR = shift @_; 117 my $var = $VAR; 118 my $val = shift @_; 119 120 $var =~ tr/A-Z/a-z/; 121 122 # Map undef/empty string and true/false to nicer values. 123 124 $val = undef if (($val eq "undef") || ($val eq "")); 125 $val = 0 if (($val =~ m/^false$/i) || ($val =~ m/^f$/i)); 126 $val = 1 if (($val =~ m/^true$/i) || ($val =~ m/^t$/i)); 127 128 # 129 # Handle real options first. If there is a key in %global, it's not a 130 # meta-option and we can set it and get out of here. 131 # 132 133 if (exists($global{$var})) { 134 $global{$var} = $val; 135 return; 136 } 137 138 # 139 # Handle meta-options. These options are aliases for three other 140 # options, for example: 141 # ovlMemory -> corOvlMemory and obtOvlMemory and utgOvlMemory 142 # 143 # They all follow this standard format, except gridOptions, which wants 144 # to insert the stage name in the middle of the option: 145 # gridOptionsOVL -> gridOptionsCOROVL. 146 # 147 # Note the recursive call here. 148 # 149 150 if ($var eq "gridoptionsovl") { setGlobal("gridOptionsCORovl", $val); setGlobal("gridOptionsOBTovl", $val); setGlobal("gridOptionsUTGovl", $val); return; } 151 if ($var eq "gridoptionsmhap") { setGlobal("gridOptionsCORmhap", $val); setGlobal("gridOptionsOBTmhap", $val); setGlobal("gridOptionsUTGmhap", $val); return; } 152 if ($var eq "gridoptionsmmap") { setGlobal("gridOptionsCORmmap", $val); setGlobal("gridOptionsOBTmmap", $val); setGlobal("gridOptionsUTGmmap", $val); return; } 153 154 foreach my $opt ("ovlmemory", "mhapmemory", "mmapmemory", # Execution options 155 "ovlthreads", "mhapthreads", "mmapthreads", 156 "ovlconcurrency", "mhapconcurrency", "mmapconcurrency", 157 "overlapper", # Overlap algorithm selection 158 "realign", 159 "ovlerrorrate", # Overlapper options 160 "ovlhashblocklength", 161 "ovlrefblocklength", 162 "ovlhashbits", 163 "ovlhashload", 164 "ovlmersize", 165 "ovlmerthreshold", 166 "ovlmerdistinct", 167 "ovlfrequentmers", 168 "mhapblocksize", # Mhap options 169 "mhapmersize", 170 "mhapsensitivity", 171 "mhapfilterunique", 172 "mhapfilterthreshold", 173 "mhapnotf", 174 "mhappipe", 175 "mmapblocksize", # Minimap options 176 "mmapmersize") { 177 if ($var eq "$opt") { 178 setGlobal("cor$opt", $val); 179 setGlobal("obt$opt", $val); 180 setGlobal("utg$opt", $val); 181 return; 182 } 183 } 184 185 if ($var eq "rawerrorrate") { 186 setGlobalIfUndef("corErrorRate", $val); setGlobalIfUndef("corOvlErrorRate", $val); 187 return; 188 } 189 190 if ($var eq "correctederrorrate") { 191 setGlobalIfUndef("obtErrorRate", $val); setGlobalIfUndef("obtOvlErrorRate", $val); 192 setGlobalIfUndef("utgErrorRate", $val); setGlobalIfUndef("utgOvlErrorRate", $val); 193 setGlobalIfUndef("cnsErrorRate", $val); 194 return; 195 } 196 197 # 198 # Replace obsolete options. 199 # 200 201 if ($var eq "readsamplingcoverage") { 202 print STDERR "--\n"; 203 print STDERR "-- WARNING: Deprecated option 'readSamplingCoverage' supplied.\n"; 204 print STDERR "-- WARNING: Use 'maxInputCoverage' instead.\n"; 205 print STDERR "-- WARNING: 'readSamplingCoverage' will be removed in the next release.\n"; 206 print STDERR "--\n"; 207 208 setGlobal("maxInputCoverage", $val); 209 return; 210 } 211 212 # 213 # If here, we got a parameter we don't know about. Let the usual error 214 # handling handle it since this should only occur when parsing user 215 # options (command line or spec file). 216 # 217 218 addCommandLineError("ERROR: Parameter '$VAR' is not known.\n"); 219} 220 221 222 223 224 225sub getCommandLineOptions () { 226 my $cLineOpts = join ' ', @cLineOpts; 227 228 return((wantarray) ? @cLineOpts : $cLineOpts); 229} 230 231 232 233sub addCommandLineOption ($) { 234 my $opt = shift @_; 235 236 return if ($opt =~ m/canuIteration=/); # Ignore canu resetting canuIteration 237 238 push @cLineOpts, $opt; 239 #$cLineOpts .= " " if (defined($cLineOpts) && ($cLineOpts !~ m/\s$/)); 240 #$cLineOpts .= $opt; 241} 242 243 244 245sub removeHaplotypeOptions () { 246 my @strippedOpts; 247 248 my $haveRaw = 0; 249 my $haveCorrected = 0; 250 251 my $haveTrimmed = 0; 252 253 my $setUpForPacBio = 0; 254 my $setUpForNanopore = 0; 255 my $setUpForHiFi = 0; 256 257 # A very specialized function. Remove all the sequence file options, 258 # both long reads and short reads used for haplotyping, from the list of 259 # command line options. Then return a string appropriate for assembling 260 # the haplotyped reads. 261 # 262 # Note that when this is called a second (third, etc) time, the 263 # semantics are different (no haplotype options to remove) but the end 264 # result is the same. 265 266 foreach my $opt (@cLineOpts) { 267 # for v1.* 268 if ($opt =~ m/^-pacbio-raw\s/) { $haveRaw++; $setUpForPacBio++; next; } 269 if ($opt =~ m/^-pacbio-corrected\s/) { $haveCorrected++; $setUpForPacBio++; next; } 270 if ($opt =~ m/^-nanopore-raw\s/) { $haveRaw++; $setUpForNanopore++; next; } 271 if ($opt =~ m/^-nanopore-corrected\s/) { $haveCorrected++; $setUpForNanopore++; next; } 272 273 if ($opt =~ m/^-raw\s/) { $haveRaw++; next; } 274 if ($opt =~ m/^-corrected\s/) { $haveCorrected++; next; } 275 276 if ($opt =~ m/^-trimmed\s/) { $haveTrimmed++; next; } 277 278 if ($opt =~ m/^-pacbio\s/) { $setUpForPacBio++; next; } 279 if ($opt =~ m/^-nanopore\s/) { $setUpForNanopore++; next; } 280 if ($opt =~ m/^-pacbio-hifi\s/) { $setUpForHiFi++; next; } 281 282 if ($opt =~ m/^-haplotype/) { next; } 283 if ($opt =~ m/^-d\s/) { next; } 284 if ($opt =~ m/^-p\s/) { next; } 285 286 push @strippedOpts, $opt; 287 } 288 289 @cLineOpts = @strippedOpts; 290 291 # Now figure out what to load the haplotyped reads as. 292 293 my $tech = ""; 294 295 # Either raw or corrected (or hifi). 296 297 if ($haveRaw) { $tech .= " "; } # implicit 298 elsif ($haveCorrected) { $tech .= "-corrected "; } 299 300 # And they can be trimmed or not. 301 302 if ($haveTrimmed) { $tech .= "-trimmed "; } 303 304 # Only one tech is allowed. 305 306 if ($setUpForHiFi) { $tech .= "-pacbio-hifi"; } 307 elsif ($setUpForNanopore) { $tech .= "-nanopore"; } 308 elsif ($setUpForPacBio) { $tech .= "-pacbio"; } 309 310 return($tech); 311} 312 313 314 315sub writeLog () { 316 my $time = time(); 317 my $host = hostname(); 318 my $pid = $$; 319 320 open(F, "> canu-logs/${time}_${host}_${pid}_canu"); 321 print F $specLog; 322 close(F); 323} 324 325 326 327sub diskSpace ($) { 328 my $dir = dirname($_[0]); 329 my ($total, $used, $free, $avail) = (0, 0, 0, 0); 330 331 if (-d $dir) { 332 open(DF, "df -P -k $dir |"); 333 while (<DF>) { 334 chomp; 335 336 if (m/^(.*)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+%)\s+(.*)$/) { 337 $total = int($2 / 1048.576) / 1000; 338 $used = int($3 / 1048.576) / 1000; 339 $free = int($4 / 1048.576) / 1000; 340 $avail = int($4 / 1048.576) / 1000; # Possibly limited by quota? 341 } 342 } 343 close(DF); 344 } 345 346 #print STDERR "Disk space: total $total GB, used $used GB, free $free GB, available $avail GB\n"; 347 348 return (wantarray) ? ($total, $used, $free, $avail) : $avail; 349} 350 351 352 353sub printOptions () { 354 my $pretty = 0; 355 356 # Figure out the maximum length of the option names. 357 my $optLength = 0; 358 foreach my $key (keys %synnam) { 359 $optLength = length($synnam{$key}) if ($optLength < length($synnam{$key})); 360 } 361 362 # Emit a nicely formatted list of options and a description of each. 363 foreach my $key (sort keys %synnam) { 364 my $optName = $synnam{$key}; 365 my $synopsis = $synops{$key}; 366 367 next if ($optName eq ""); 368 next if ($synopsis eq ""); 369 370 371 if ($pretty == 0) { 372 $optName .= " " x ($optLength - length($optName)); 373 374 print "$optName$synopsis\n"; 375 } 376 else { 377 $Text::Wrap::columns = 77; 378 379 $synopsis = wrap(" ", " ", $synopsis); 380 381 print "\n"; 382 print "$optName\n"; 383 print "$synopsis\n"; 384 } 385 } 386} 387 388 389sub printHelp (@) { 390 my $force = shift @_; 391 my $errors = getGlobal("errors"); 392 393 return if (!defined($force) && !defined($errors)); 394 395 print "\n"; 396 print "usage: canu [-version] [-citation] \\\n"; 397 print " [-haplotype | -correct | -trim | -assemble | -trim-assemble] \\\n"; 398 print " [-s <assembly-specifications-file>] \\\n"; 399 print " -p <assembly-prefix> \\\n"; 400 print " -d <assembly-directory> \\\n"; 401 print " genomeSize=<number>[g|m|k] \\\n"; 402 print " [other-options] \\\n"; 403 print " [-haplotype{NAME} illumina.fastq.gz] \\\n"; 404 print " [-corrected] \\\n"; 405 print " [-trimmed] \\\n"; 406 print " [-pacbio |\n"; 407 print " -nanopore |\n"; 408 print " -pacbio-hifi] file1 file2 ...\n"; 409 print "\n"; 410 print "example: canu -d run1 -p godzilla genomeSize=1g -nanopore-raw reads/*.fasta.gz \n"; 411 print "\n"; 412 print "\n"; 413 print " To restrict canu to only a specific stage, use:\n"; 414 print " -haplotype - generate haplotype-specific reads\n"; 415 print " -correct - generate corrected reads\n"; 416 print " -trim - generate trimmed reads\n"; 417 print " -assemble - generate an assembly\n"; 418 print " -trim-assemble - generate trimmed reads and then assemble them\n"; 419 print "\n"; 420 print " The assembly is computed in the -d <assembly-directory>, with output files named\n"; 421 print " using the -p <assembly-prefix>. This directory is created if needed. It is not\n"; 422 print " possible to run multiple assemblies in the same directory.\n"; 423 print "\n"; 424 print " The genome size should be your best guess of the haploid genome size of what is being\n"; 425 print " assembled. It is used primarily to estimate coverage in reads, NOT as the desired\n"; 426 print " assembly size. Fractional values are allowed: '4.7m' equals '4700k' equals '4700000'\n"; 427 print "\n"; 428 print " Some common options:\n"; 429 print " useGrid=string\n"; 430 print " - Run under grid control (true), locally (false), or set up for grid control\n"; 431 print " but don't submit any jobs (remote)\n"; 432 print " rawErrorRate=fraction-error\n"; 433 print " - The allowed difference in an overlap between two raw uncorrected reads. For lower\n"; 434 print " quality reads, use a higher number. The defaults are 0.300 for PacBio reads and\n"; 435 print " 0.500 for Nanopore reads.\n"; 436 print " correctedErrorRate=fraction-error\n"; 437 print " - The allowed difference in an overlap between two corrected reads. Assemblies of\n"; 438 print " low coverage or data with biological differences will benefit from a slight increase\n"; 439 print " in this. Defaults are 0.045 for PacBio reads and 0.144 for Nanopore reads.\n"; 440 print " gridOptions=string\n"; 441 print " - Pass string to the command used to submit jobs to the grid. Can be used to set\n"; 442 print " maximum run time limits. Should NOT be used to set memory limits; Canu will do\n"; 443 print " that for you.\n"; 444 print " minReadLength=number\n"; 445 print " - Ignore reads shorter than 'number' bases long. Default: 1000.\n"; 446 print " minOverlapLength=number\n"; 447 print " - Ignore read-to-read overlaps shorter than 'number' bases long. Default: 500.\n"; 448 print " A full list of options can be printed with '-options'. All options can be supplied in\n"; 449 print " an optional sepc file with the -s option.\n"; 450 print "\n"; 451 print " For TrioCanu, haplotypes are specified with the -haplotype{NAME} option, with any\n"; 452 print " number of haplotype-specific Illumina read files after. The {NAME} of each haplotype\n"; 453 print " is free text (but only letters and numbers, please). For example:\n"; 454 print " -haplotypeNANNY nanny/*gz\n"; 455 print " -haplotypeBILLY billy1.fasta.gz billy2.fasta.gz\n"; 456 print "\n"; 457 print " Reads can be either FASTA or FASTQ format, uncompressed, or compressed with gz, bz2 or xz.\n"; 458 print "\n"; 459 print " Reads are specified by the technology they were generated with, and any processing performed.\n"; 460 print "\n"; 461 print " [processing]\n"; 462 print " -corrected\n"; 463 print " -trimmed\n"; 464 print "\n"; 465 print " [technology]\n"; 466 print " -pacbio <files>\n"; 467 print " -nanopore <files>\n"; 468 print " -pacbio-hifi <files>\n"; 469 print "\n"; 470 print "Complete documentation at http://canu.readthedocs.org/en/latest/\n"; 471 print "\n"; 472 473 if ($errors) { 474 print "$errors"; 475 print "\n"; 476 exit(1); 477 } else { 478 exit(0); 479 } 480} 481 482 483sub printCitation ($$) { 484 my $prefix = shift @_; 485 my $mode = shift @_; 486 487 if (($mode eq "canu") || 488 ($mode eq "all")) { 489 print STDERR "${prefix}For 'standard' assemblies of PacBio or Nanopore reads:\n"; 490 print STDERR "${prefix} Koren S, Walenz BP, Berlin K, Miller JR, Phillippy AM.\n"; 491 print STDERR "${prefix} Canu: scalable and accurate long-read assembly via adaptive k-mer weighting and repeat separation.\n"; 492 print STDERR "${prefix} Genome Res. 2017 May;27(5):722-736.\n"; 493 print STDERR "${prefix} http://doi.org/10.1101/gr.215087.116\n"; 494 print STDERR "${prefix}\n"; 495 } 496 497 if (($mode eq "trio") || 498 ($mode eq "all")) { 499 print STDERR "${prefix}For 'trio-binned' assemblies of PacBio or Nanopore reads:\n"; 500 print STDERR " ${prefix}Koren S, Rhie A, Walenz BP, Dilthey AT, Bickhart DM, Kingan SB, Hiendleder S, Williams JL, Smith TPL, Phillippy AM.\n"; 501 print STDERR " ${prefix}De novo assembly of haplotype-resolved genomes with trio binning.\n"; 502 print STDERR " ${prefix}Nat Biotechnol. 2018\n"; 503 print STDERR " ${prefix}https//doi.org/10.1038/nbt.4277\n"; 504 print STDERR "${prefix}\n"; 505 } 506 507 if (($mode eq "hicanu") || 508 ($mode eq "all")) { 509 print STDERR "${prefix}For assemblies of PacBio HiFi reads:\n"; 510 print STDERR "${prefix} Nurk S, Walenz BP, Rhiea A, Vollger MR, Logsdon GA, Grothe R, Miga KH, Eichler EE, Phillippy AM, Koren S.\n"; 511 print STDERR "${prefix} HiCanu: accurate assembly of segmental duplications, satellites, and allelic variants from high-fidelity long reads.\n"; 512 print STDERR "${prefix} biorXiv. 2020.\n"; 513 print STDERR "${prefix} https://doi.org/10.1101/2020.03.14.992248\n"; 514 print STDERR "${prefix}\n"; 515 } 516 517 print STDERR "${prefix}Read and contig alignments during correction and consensus use:\n"; 518 print STDERR "${prefix} Šošic M, Šikic M.\n"; 519 print STDERR "${prefix} Edlib: a C/C ++ library for fast, exact sequence alignment using edit distance.\n"; 520 print STDERR "${prefix} Bioinformatics. 2017 May 1;33(9):1394-1395.\n"; 521 print STDERR "${prefix} http://doi.org/10.1093/bioinformatics/btw753\n"; 522 print STDERR "${prefix}\n"; 523 524 print STDERR "${prefix}Overlaps are generated using:\n"; 525 526 if ((getGlobal("corOverlapper") eq "mhap") || 527 (getGlobal("obtOverlapper") eq "mhap") || 528 (getGlobal("utgOverlapper") eq "mhap")) { 529 print STDERR "${prefix} Berlin K, et al.\n"; 530 print STDERR "${prefix} Assembling large genomes with single-molecule sequencing and locality-sensitive hashing.\n"; 531 print STDERR "${prefix} Nat Biotechnol. 2015 Jun;33(6):623-30.\n"; 532 print STDERR "${prefix} http://doi.org/10.1038/nbt.3238\n"; 533 print STDERR "${prefix}\n"; 534 } 535 536 if ((getGlobal("corOverlapper") eq "ovl") || 537 (getGlobal("obtOverlapper") eq "ovl") || 538 (getGlobal("utgOverlapper") eq "ovl")) { 539 print STDERR "${prefix} Myers EW, et al.\n"; 540 print STDERR "${prefix} A Whole-Genome Assembly of Drosophila.\n"; 541 print STDERR "${prefix} Science. 2000 Mar 24;287(5461):2196-204.\n"; 542 print STDERR "${prefix} http://doi.org/10.1126/science.287.5461.2196\n"; 543 print STDERR "${prefix}\n"; 544 } 545 546 if ((getGlobal("corOverlapper") eq "minimap") || 547 (getGlobal("obtOverlapper") eq "minimap") || 548 (getGlobal("utgOverlapper") eq "minimap")) { 549 print STDERR "${prefix} Li H.\n"; 550 print STDERR "${prefix} Minimap2: pairwise alignment for nucleotide sequences.\n"; 551 print STDERR "${prefix} arXiv.org. 2017 Aug 4.\n"; 552 print STDERR "${prefix} https://arxiv.org/abs/1708.01492\n"; 553 print STDERR "${prefix}\n"; 554 } 555 556 if ($mode ne "hicanu") { 557 print STDERR "${prefix}Corrected read consensus sequences are generated using an algorithm derived from FALCON-sense:\n"; 558 print STDERR "${prefix} Chin CS, et al.\n"; 559 print STDERR "${prefix} Phased diploid genome assembly with single-molecule real-time sequencing.\n"; 560 print STDERR "${prefix} Nat Methods. 2016 Dec;13(12):1050-1054.\n"; 561 print STDERR "${prefix} http://doi.org/10.1038/nmeth.4035\n"; 562 print STDERR "${prefix}\n"; 563 } 564 565 print STDERR "${prefix}Contig consensus sequences are generated using an algorithm derived from pbdagcon:\n"; 566 print STDERR "${prefix} Chin CS, et al.\n"; 567 print STDERR "${prefix} Nonhybrid, finished microbial genome assemblies from long-read SMRT sequencing data.\n"; 568 print STDERR "${prefix} Nat Methods. 2013 Jun;10(6):563-9\n"; 569 print STDERR "${prefix} http://doi.org/10.1038/nmeth.2474\n"; 570 print STDERR "${prefix}\n"; 571} 572 573 574 575sub makeAbsolute ($) { 576 my $var = shift @_; 577 my $val = getGlobal($var); 578 my $abs = abs_path($val); 579 580 if (defined($val) && ($val ne $abs)) { 581 setGlobal($var, $abs); 582 $val =~ s/\\\"/\"/g; 583 $val =~ s/\"/\\\"/g; 584 $val =~ s/\\\$/\$/g; 585 $val =~ s/\$/\\\$/g; 586 587 addCommandLineOption("'$var=$val'"); 588 } 589} 590 591 592 593sub fixCase ($@) { 594 my $var = shift @_; 595 my $VAL = getGlobal($var); 596 my $val = $VAL; 597 my $upp = shift @_; 598 599 if (defined($upp)) { 600 $val =~ tr/a-z/A-Z/; 601 } else { 602 $val =~ tr/A-Z/a-z/; 603 } 604 605 setGlobal($var, $val) if ($VAL ne $val) 606} 607 608 609 610sub setParametersFromFile ($) { 611 my $specFile = shift @_; 612 613 # Client should be ensuring that the file exists before calling this function. 614 die "specFile '$specFile' not found.\n" if (! -e "$specFile"); 615 616 $specLog .= "\n"; 617 $specLog .= "###\n"; 618 $specLog .= "### Reading options from '$specFile'\n"; 619 $specLog .= "###\n"; 620 $specLog .= "\n"; 621 622 # We lost the use of caExit() here (moved to Execution.pm) and so can't call it. 623 # Just die. 624 625 open(F, "< $specFile") or die("can't open '$specFile' for reading: $!\n"); 626 627 while (<F>) { 628 $specLog .= $_; 629 630 s/^\s+//; 631 s/\s+$//; 632 633 next if (m/^#/); 634 next if (length($_) eq 0); 635 636 # First, figure out the two words. 637 # 638 # Word two won't match a #, but will gobble up spaces at the end. 639 # Then, we can match a #, and any amount of comment, minimally. If 640 # word two is made non-greedy, it will shrink to nothing, as the 641 # last bit will gobble up everything, since we're allowed to match 642 # zero #'s in between. 643 644 my $one; 645 my $two; 646 647 if (m/^(\w*)\s*=\s*([^#]*)\s*#*.*?$/) { 648 $one = $1; 649 $two = $2; 650 } else { 651 addCommandLineError("ERROR: File not found or unknown specFile option line '$_' in file '$specFile'.\n"); 652 } 653 654 # Now, clean up the second word to handle quotes. 655 656 $two =~ s/^\s+//; # There can be spaces from the greedy match. 657 $two =~ s/\s+$//; 658 659 $two = $1 if ($two =~ m/^'(.+)'$/); # Remove single quotes | But don't allowed mixed quotes; users 660 $two = $1 if ($two =~ m/^"(.+)"$/); # Remove double quotes | should certainly know better 661 662 # And do something. 663 664 $two =~ s/^\s+//; # Remove spaces again. They'll just confuse our option processing. 665 $two =~ s/\s+$//; 666 667 setGlobal($one, $two); 668 } 669 close(F); 670} 671 672 673 674sub setParametersFromCommandLine(@) { 675 my @specOpts = @_; 676 677 if (scalar(@specOpts) > 0) { 678 $specLog .= "\n"; 679 $specLog .= "###\n"; 680 $specLog .= "### Reading options from the command line.\n"; 681 $specLog .= "###\n"; 682 $specLog .= "\n"; 683 } 684 685 foreach my $s (@specOpts) { 686 $specLog .= "$s\n"; 687 688 if ($s =~ m/\s*(\w*)\s*=(.*)/) { 689 my ($var, $val) = ($1, $2); 690 $var =~ s/^\s+//; $var =~ s/\s+$//; 691 $val =~ s/^\s+//; $val =~ s/\s+$//; 692 setGlobal($var, $val); 693 } else { 694 addCommandLineError("ERROR: Misformed command line option '$s'.\n"); 695 } 696 } 697} 698 699 700 701# Helper function to add a variable/value pair to the list of global options. 702# Both getGlobal() and setGlobal() will complain if an unknown variable is accessed. 703# 704sub setDefault ($$$) { 705 my $VAR = shift @_; 706 my $var = $VAR; 707 my $value = shift @_; 708 my $description = shift @_; 709 710 $var =~ tr/A-Z/a-z/; 711 712 $global{$var} = $value; # Internal lookups always use all lowercase. 713 $synops{$var} = $description; # But these operate on the case-sensitive input string? 714 $synnam{$var} = $VAR; # Remember the stylized version ($VAR) of the option $var. 715} 716 717 718# Helper to add variable/value pairs for all the options that affect 719# exectuion. 720# 721sub setExecDefaults ($$) { 722 my $tag = shift @_; 723 my $name = shift @_; 724 725 setDefault("useGrid${tag}", 1, "If 'true', run module $name under grid control; if 'false' run locally."); 726 setDefault("gridOptions${tag}", undef, "Grid engine options applied to $name jobs"); 727 setDefault("${tag}Memory", undef, "Amount of memory, in gigabytes, to use for $name jobs"); 728 setDefault("${tag}Threads", undef, "Number of threads to use for $name jobs"); 729 setDefault("${tag}StageSpace", undef, "Amount of local disk space needed to stage data for $name jobs"); 730 setDefault("${tag}Concurrency", undef, "If grid not enabled, number of $name jobs to run at the same time; default is n_proc / n_threads"); 731} 732 733 734# Helper to add variable/value pairs for all the options that affect the 735# overlap computation. 736# 737sub setOverlapDefaults ($$$) { 738 my $tag = shift @_; # If 'cor', some parameters are loosened for raw pacbio reads 739 my $name = shift @_; 740 my $default = shift @_; # Sets ${tag}Overlapper 741 742 # Which overlapper to use. 743 744 setDefault("${tag}Overlapper", $default, "Which overlap algorithm to use for $name"); 745 setDefault("${tag}ReAlign", 0, "Refine overlaps by computing the actual alignment: 'true' or 'false'. Not useful for overlapper=ovl. Uses ${tag}OvlErrorRate"); 746 747 # OverlapInCore parameters. 748 749 setDefault("${tag}OvlHashBlockLength", undef, "Amount of sequence (bp) to load into the overlap hash table"); 750 setDefault("${tag}OvlRefBlockLength", undef, "Amount of sequence (bp) to search against the hash table per batch"); 751 setDefault("${tag}OvlHashBits", undef, "Width of the kmer hash. Width 22=1gb, 23=2gb, 24=4gb, 25=8gb. Plus 10b per ${tag}OvlHashBlockLength"); 752 setDefault("${tag}OvlHashLoad", 0.80, "Maximum hash table load. If set too high, table lookups are inefficent; if too low, search overhead dominates run time; default 0.75"); 753 setDefault("${tag}OvlMerSize", ($tag eq "cor") ? 19 : 22, "K-mer size for seeds in overlaps"); 754 setDefault("${tag}OvlMerThreshold", undef, "K-mer frequency threshold; mers more frequent than this count are ignored"); 755 setDefault("${tag}OvlMerDistinct", undef, "K-mer frequency threshold; the least frequent fraction of distinct mers can seed overlaps"); 756 setDefault("${tag}OvlFrequentMers", undef, "Do not seed overlaps with these kmers"); 757 setDefault("${tag}OvlFilter", undef, "Filter overlaps based on expected kmers vs observed kmers"); 758 759 # Mhap parameters. FilterThreshold MUST be a string, otherwise it gets printed in scientific notation (5e-06) which java doesn't understand. 760 761 setDefault("${tag}MhapVersion", "2.1.3", "Version of the MHAP jar file to use"); 762 setDefault("${tag}MhapFilterThreshold", "0.0000001", "Value between 0 and 1. kmers which comprise more than this percentage of the input are downweighted"); 763 setDefault("${tag}MhapFilterUnique", undef, "Expert option: True or false, supress the low-frequency k-mer distribution based on them being likely noise and not true overlaps. Threshold auto-computed based on error rate and coverage."); 764 setDefault("${tag}MhapNoTf", undef, "Expert option: True or false, do not use tf weighting, only idf of tf-idf."); 765 setDefault("${tag}MhapOptions", undef, "Expert option: free-form parameters to pass to MHAP."); 766 setDefault("${tag}MhapPipe", 1, "Report results to a pipe instead of *large* files."); 767 setDefault("${tag}MhapBlockSize", 3000, "Number of reads per GB of memory allowed (mhapMemory)"); 768 setDefault("${tag}MhapMerSize", ($tag eq "cor") ? 16 : 16, "K-mer size for seeds in mhap"); 769 setDefault("${tag}MhapOrderedMerSize", ($tag eq "cor") ? 12 : 18, "K-mer size for second-stage filter in mhap"); 770 setDefault("${tag}MhapSensitivity", undef, "Coarse sensitivity level: 'low', 'normal' or 'high'. Set automatically based on coverage; 'high' <= 30x < 'normal' < 60x <= 'low'"); 771 772 # MiniMap parameters. 773 774 setDefault("${tag}MMapBlockSize", 6000, "Number of reads per 1GB; memory * blockSize = the size of block loaded into memory per job"); 775 setDefault("${tag}MMapMerSize", ($tag eq "cor") ? 15 : 21, "K-mer size for seeds in minmap"); 776} 777 778 779 780sub setDefaults () { 781 782 ##### Internal stuff - no synopsis pre pretty-name for these. 783 784 $global{"errors"} = undef; # Command line errors. 785 $global{"version"} = undef; # Set in setVersion() once we know where binaries are. 786 $global{"availablehosts"} = undef; # Internal list of cpus-memory-nodes describing the grid. 787 788 $global{"localmemory"} = 0; # Amount of memory on the local host, set in Grid_Local.pm 789 $global{"localthreads"} = 0; 790 791 $global{"canuiteration"} = 0; 792 $global{"canuiterationmax"} = 2; 793 794 $global{"onexitdir"} = undef; # Copy of $wrk, for caExit() and caFailure() ONLY. 795 $global{"onexitnam"} = undef; # Copy of $asm, for caExit() and caFailure() ONLY. 796 797 ##### Meta options - no $global for these, only synopsis and pretty-format name, 798 799 $synops{"rawerrorrate"} = "Expected fraction error in an alignment of two uncorrected reads"; 800 $synnam{"rawerrorrate"} = "rawErrorRate"; 801 802 $synops{"correctederrorrate"} = "Expected fraction error in an alignment of two corrected reads"; 803 $synnam{"correctederrorrate"} = "correctedErrorRate"; 804 805 ##### General Configuration Options (aka miscellany) 806 807 my $java = (exists $ENV{"JAVA_HOME"} && -e "$ENV{'JAVA_HOME'}/bin/java") ? "$ENV{'JAVA_HOME'}/bin/java" : "java"; 808 809 setDefault("showNext", undef, "Don't run any commands, just report what would run"); 810 setDefault("shell", "/bin/sh", "Command interpreter to use; sh-compatible (e.g., bash), NOT C-shell (csh or tcsh); default '/bin/sh'"); 811 812 setDefault("minimap", "minimap2", "Path to minimap2; default 'minimap2'"); 813 814 setDefault("java", $java, "Java interpreter to use; at least version 1.8; default 'java'"); 815 setDefault("javaUse64Bit", undef, "Java interpreter supports the -d64 or -d32 flags; default auto"); 816 817 setDefault("gnuplot", "gnuplot", "Path to the gnuplot executable"); 818 setDefault("gnuplotImageFormat", undef, "Image format that gnuplot will generate. Default: based on gnuplot, 'png', 'svg' or 'gif'"); 819 820 setDefault("stageDirectory", undef, "If set, copy heavily used data to this node-local location"); 821 setDefault("preExec", undef, "A command line to run at the start of Canu execution scripts"); 822 823 ##### Cleanup and Termination options 824 825 setDefault("saveOverlaps", 0, "Do not remove the overlap stores. Default: false = remove overlap stores when they're no longer needed"); 826 setDefault("purgeOverlaps", "normal", "When to delete intermediate overlap files: never, normal (default), aggressive, dangerous"); 827 setDefault("saveReadCorrections", 0, "Save intermediate read correction files, almost never a good idea"); 828 setDefault("saveReadHaplotypes", 0, "Save intermediate read haplotype files, almost never a good idea"); 829 setDefault("saveMerCounts", 0, "Save full mer counting results, sometimes useful"); 830 setDefault("saveReads", 1, "Save intermediate corrected and trimmed reads to asm.correctedReads.fasta.gz and asm.trimmedReads.fasta.gz"); 831 setDefault("onSuccess", undef, "Full path to command to run on successful completion"); 832 setDefault("onFailure", undef, "Full path to command to run on failure"); 833 834 ##### Error Rates 835 836 setDefault("corOvlErrorRate", undef, "Overlaps above this error rate are not computed"); 837 setDefault("obtOvlErrorRate", undef, "Overlaps at or below this error rate are used to trim reads"); 838 setDefault("utgOvlErrorRate", undef, "Overlaps at or below this error rate are used to trim reads"); 839 setDefault("utgErrorRate", undef, "Overlaps at or below this error rate are used to construct contigs"); 840 setDefault("utgGraphDeviation", undef, "Overlaps this much above median will not be used for initial graph construction"); 841 setDefault("utgBubbleDeviation", 1, "Overlaps this much above mean of contig will be used to identify bubbles"); 842 setDefault("utgRepeatDeviation", undef, "Overlaps this much above mean unitig error rate will not be used for repeat splitting"); 843 setDefault("utgRepeatConfusedBP", 2500, "Repeats where the next best edge is at least this many bp shorter will not be split"); 844 setDefault("utgRepeatConfusedPC", 15, "Repeats where the next best edge is at least this many percent shorter will not be split"); 845 setDefault("utgChimeraType", "deadend", "When to filter reads for contig construction: none, chimera (missing middle), uncovered (missing middle or ends), deadend (missing middle or end or no neighbor) (default)"); 846 setDefault("corErrorRate", undef, "Only use raw alignments below this error rate to construct corrected reads"); 847 setDefault("cnsErrorRate", undef, "Consensus expects alignments at about this error rate"); 848 849 ##### Minimums and maximums 850 851 setDefault("minReadLength", 1000, "Reads shorter than this length are not loaded into the assembler; default 1000"); 852 setDefault("minOverlapLength", 500, "Overlaps shorter than this length are not computed; default 500"); 853 854 setDefault("readSamplingBias", 0.0, "Score reads as 'random * length^bias', keep the highest scoring reads"); 855 856 setDefault("minMemory", 0, "Minimum amount of memory needed to compute the assembly (do not set unless prompted!)"); 857 setDefault("maxMemory", undef, "Maximum memory to use by any component of the assembler"); 858 859 setDefault("minThreads", 0, "Minimum number of compute threads suggested to compute the assembly"); 860 setDefault("maxThreads", undef, "Maximum number of compute threads to use by any component of the assembler"); 861 862 setDefault("minInputCoverage", 10, "Stop if input coverage is too low; default 10"); 863 setDefault("maxInputCoverage", undef, "If input coverage is high, downsample to something reasonable; default 200"); 864 865 ##### Stopping conditions 866 867 setDefault("stopOnLowCoverage", 10, "Stop if raw, corrected or trimmed read coverage is low"); 868 setDefault("stopAfter", undef, "Stop after a specific algorithm step is completed"); 869 870 ##### Grid Engine configuration, internal parameters. These are filled out in canu.pl, right after this function returns. 871 872 setDefault("gridEngine", undef, "Grid engine configuration, not documented"); 873 setDefault("gridEngineSubmitCommand", undef, "Grid engine configuration, not documented"); 874 setDefault("gridEngineNameOption", undef, "Grid engine configuration, not documented"); 875 setDefault("gridEngineArrayOption", undef, "Grid engine configuration, not documented"); 876 setDefault("gridEngineArrayName", undef, "Grid engine configuration, not documented"); 877 setDefault("gridEngineArrayMaxJobs", undef, "Grid engine configuration, not documented"); 878 setDefault("gridEngineOutputOption", undef, "Grid engine configuration, not documented"); 879 setDefault("gridEngineThreadsOption", undef, "Grid engine configuration, not documented"); 880 setDefault("gridEngineMemoryOption", undef, "Grid engine configuration, not documented"); 881 setDefault("gridEngineResourceOption", undef, "Grid engine configuration, not documented"); 882 setDefault("gridEngineMemoryUnits", undef, "Grid engine configuration, not documented"); 883 setDefault("gridEngineMemoryPerJob", undef, "Grid engine configuration, not documented"); 884 setDefault("gridEngineNameToJobIDCommand", undef, "Grid engine configuration, not documented"); 885 setDefault("gridEngineNameToJobIDCommandNoArray", undef, "Grid engine configuration, not documented"); 886 setDefault("gridEngineStageOption", undef, "Grid engine configuration, not documented"); 887 setDefault("gridEngineTaskID", undef, "Grid engine configuration, not documented"); 888 setDefault("gridEngineArraySubmitID", undef, "Grid engine configuration, not documented"); 889 setDefault("gridEngineJobID", undef, "Grid engine configuration, not documented"); 890 891 ##### Slurm-specific parameters for controlling the number of 892 ##### cores / tasks dispatched per step or globally (WIP) 893 894 setDefault( 'slurmCormhapCoreLimit', undef, 'Maximum number of cores allocated for MHAP pre-computing and alignment within the correction phase' ); 895 setDefault( 'slurmOvbCoreLimit', undef, 'Maximum number of single-core tasks dispatched for the ovlStore bucketizing step within the trimming phase' ); 896 setDefault( 'slurmOvsCoreLimit', undef, 'Maximum number of single-core tasks dispatched for the ovlStore sorting step within the trimming phase' ); 897 setDefault( 'slurmRedCoreLimit', undef, 'Maximum number of cores allocated for read error detection within the unitigging phase' ); 898 setDefault( 'slurmArrayTaskLimit', undef, 'Maximum number of tasks permitted for each step throughout assembly' ); 899 setDefault( 'slurmArrayCoreLimit', undef, 'Maximum number of cores allocated for each step throughout assembly' ); 900 901 ##### Grid Engine Pipeline 902 903 setDefault("useGrid", 1, "If 'true', enable grid-based execution; if 'false', run all jobs on the local machine; if 'remote', create jobs for grid execution but do not submit; default 'true'"); 904 905 ##### Grid Engine configuration, for each step of the pipeline 906 907 setDefault("gridOptions", undef, "Grid engine options applied to all jobs"); 908 setDefault("gridOptionsExecutive", undef, "Grid engine options applied to the canu executive script"); 909 setDefault("gridOptionsJobName", undef, "Grid jobs job-name suffix"); 910 911 ##### Grid Engine configuration and parameters, for each step of the pipeline (memory, threads) 912 913 setExecDefaults("meryl", "mer counting"); 914 setExecDefaults("hap", "haplotype assignment"); 915 setExecDefaults("cor", "read correction"); 916 917 setExecDefaults("corovl", "overlaps for correction"); 918 setExecDefaults("obtovl", "overlaps for trimming"); 919 setExecDefaults("utgovl", "overlaps for unitig construction"); 920 921 setExecDefaults("cormhap", "mhap overlaps for correction"); 922 setExecDefaults("obtmhap", "mhap overlaps for trimming"); 923 setExecDefaults("utgmhap", "mhap overlaps for unitig construction"); 924 925 setExecDefaults("cormmap", "mmap overlaps for correction"); 926 setExecDefaults("obtmmap", "mmap overlaps for trimming"); 927 setExecDefaults("utgmmap", "mmap overlaps for unitig construction"); 928 929 setExecDefaults("ovb", "overlap store bucketizing"); 930 setExecDefaults("ovs", "overlap store sorting"); 931 932 setExecDefaults("red", "read error detection"); 933 setExecDefaults("oea", "overlap error adjustment"); 934 935 setExecDefaults("bat", "unitig construction"); 936 setExecDefaults("cns", "unitig consensus"); 937 938 ##### Object Storage 939 940 setDefault("objectStore", undef, "Type of object storage used; not ready for production yet"); 941 setDefault("objectStoreClient", undef, "Path to the command line client used to access the object storage"); 942 setDefault("objectStoreClientUA", undef, "Path to the command line client used to upload files to object storage"); 943 setDefault("objectStoreClientDA", undef, "Path to the command line client used to download files from object storage"); 944 setDefault("objectStoreNameSpace", undef, "Object store parameters; specific to the type of objectStore used"); 945 setDefault("objectStoreProject", undef, "Object store project; specific to the type of objectStore used"); 946 947 ##### Overlapper 948 949 setOverlapDefaults("cor", "correction", "mhap"); # Overlaps computed for correction 950 setOverlapDefaults("obt", "overlap based trimming", "ovl"); # Overlaps computed for trimming 951 setOverlapDefaults("utg", "unitig construction", "ovl"); # Overlaps computed for unitigging 952 953 setGlobal("corReAlign", "false"); # To guard against someone only setting 954 setGlobal("obtReAlign", "false"); # utgOverlapper=mhap, we default to realign 955 setGlobal("utgReAlign", "true"); # enabled. 956 957 ##### Overlap Store 958 959 # ovbMemory and ovsMemory are set above. 960 961 ##### Executive 962 963 setDefault("executiveMemory", 4, "Amount of memory, in GB, to reserve for the Canu exective process"); 964 setDefault("executiveThreads", 1, "Number of threads to reserve for the Canu exective process"); 965 966 ##### Mers 967 968 setDefault("merylMemory", undef, "Amount of memory, in gigabytes, to use for mer counting"); 969 setDefault("merylThreads", undef, "Number of threads to use for mer counting"); 970 setDefault("merylConcurrency", undef, "Unused, there is only one process"); 971 972 ##### Haplotyping 973 974 setDefault("hapUnknownFraction", 0.05, "Fraction of allowed unknown bases before they are included in the assembly, between 0-1; default 0.05"); 975 setDefault("hapMemory", undef, "Amount of memory, in gigabytes, to use for haplotype assignment"); 976 setDefault("hapThreads", undef, "Number of threads to use for haplotype assignment"); 977 setDefault("hapConcurrency", undef, "Unused, there is only one process"); 978 979 ##### Overlap Based Trimming 980 981 setDefault("obtErrorRate", undef, "Stringency of overlaps to use for trimming"); 982 setDefault("trimReadsOverlap", 500, "Minimum overlap between evidence to make contiguous trim; default '500'"); 983 setDefault("trimReadsCoverage", 2, "Minimum depth of evidence to retain bases; default '2"); 984 985 ##### Fragment/Overlap Error Correction 986 987 setDefault("enableOEA", 1, "Do overlap error adjustment - comprises two steps: read error detection (RED) and overlap error adjustment (OEA); default 'true'"); 988 989 setDefault("oeaHaploConfirm", undef, "This many or more reads will confirm a true haplotype difference; default 5"); 990 setDefault("oeaMaskTrivial", undef, "Mask trivial DNA in Overlap Error Adjustment; default off; on for HiFi reads"); 991 setDefault("oeaErrorRate", undef, "Only use overlaps with at most this much fraction error to find errors in reads; default utgOvlErrorRate, 0.003 for HiFi reads"); 992 993 setDefault("redBatchSize", undef, "Number of reads per fragment error detection batch"); 994 setDefault("redBatchLength", undef, "Number of bases per fragment error detection batch"); 995 996 setDefault("oeaBatchSize", undef, "Number of reads per overlap error correction batch"); 997 setDefault("oeaBatchLength", undef, "Number of bases per overlap error correction batch"); 998 999 ##### Unitigger & BOG & bogart Options 1000 1001 setDefault("unitigger", "bogart", "Which unitig algorithm to use; only 'bogart' supported; default 'bogart'"); 1002 setDefault("genomeSize", undef, "An estimate of the size of the genome"); 1003 setDefault("batOptions", undef, "Advanced options to bogart"); 1004 setDefault("batMemory", undef, "Approximate maximum memory usage, in gigabytes, default is the maxMemory limit"); 1005 setDefault("batThreads", undef, "Number of threads to use; default is the maxThreads limit"); 1006 setDefault("batConcurrency", undef, "Unused, only one process supported"); 1007 1008 setDefault("contigFilter", "2 0 1.0 0.5 3", "Parameters to filter out 'unassembled' unitigs. Five values: minReads minLength singleReadSpan lowCovFraction lowCovDepth"); 1009 1010 ##### Consensus Options 1011 1012 setDefault("cnsMaxCoverage", 40, "Limit unitig consensus to at most this coverage; default '40' = unlimited"); 1013 setDefault("cnsConsensus", "pbdagcon", "Which consensus algorithm to use; 'pbdagcon' (fast, reliable); 'utgcns' (multialignment output); 'quick' (single read mosaic); default 'pbdagcon'"); 1014 setDefault("cnsPartitions", 0, "Attempt to create this many consensus jobs; default '0' = based on the largest tig"); 1015 1016 ##### Correction Options 1017 1018 setDefault("corPartitions", undef, "Partition read correction into N jobs"); 1019 setDefault("corPartitionMin", undef, "Don't make a read correction partition with fewer than N reads"); 1020 setDefault("corMinEvidenceLength", undef, "Limit read correction to only overlaps longer than this; default: unlimited"); 1021 setDefault("corMaxEvidenceErate", undef, "Limit read correction to only overlaps at or below this fraction error; default: unlimited"); 1022 setDefault("corMaxEvidenceCoverageGlobal", "1.0x", "Limit reads used for correction to supporting at most this coverage; default: '1.0x' = 1.0 * estimated coverage"); 1023 setDefault("corMaxEvidenceCoverageLocal", "2.0x", "Limit reads being corrected to at most this much evidence coverage; default: '2.0x' = 2.0 * estimated coverage"); 1024 setDefault("corOutCoverage", 40, "Only correct the longest reads up to this coverage; default 40"); 1025 setDefault("corMinCoverage", undef, "Minimum number of bases supporting each corrected base, if less than this sequences are split; default based on input read coverage: 0 <= 30x < 4 < 60x <= 4"); 1026 setDefault("corFilter", "expensive", "Method to filter short reads from correction; 'quick' or 'expensive'; default 'expensive'"); 1027 setDefault("corConsensus", "falcon", "Which consensus algorithm to use; only 'falcon' is supported; default 'falcon'"); 1028 1029 setDefault("homoPolyCompress", undef, "Compute everything but consensus sequences using homopolymer compressed reads"); 1030 1031 ##### Sanity check that all keys are lowercase. 1032 1033 foreach my $key (keys %global) { 1034 my $keylc = $key; 1035 $keylc =~ tr/A-Z/a-z/; 1036 1037 if ($key ne $keylc) { 1038 die "key '$key' is not all lowercase.\n"; 1039 } 1040 } 1041} 1042 1043 1044# Get the version information. 1045 1046sub setVersion ($) { 1047 my $bin = shift @_; 1048 my $version; 1049 1050 open(F, "$bin/sqStoreCreate --version 2>&1 |"); 1051 while (<F>) { 1052 $version = $_; chomp $version; 1053 } 1054 close(F); 1055 1056 setGlobal("version", $version); 1057} 1058 1059 1060sub checkJava () { 1061 return if ((getGlobal("corOverlapper") ne "mhap") && 1062 (getGlobal("obtOverlapper") ne "mhap") && 1063 (getGlobal("utgOverlapper") ne "mhap")); 1064 1065 my $java = getGlobal("java"); 1066 my $javaUse64Bit = getGlobal("javaUse64Bit"); 1067 my $versionStr = "unknown"; 1068 my $version = 0; 1069 my @javaVersionStrings; 1070 1071 if ($java =~ m/^\./) { 1072 addCommandLineError("ERROR: path to java '$java' must not be a relative path.\n"); 1073 } 1074 1075 # We've seen errors running just this tiny java if too many copies are ran at the same time. 1076 # So, run it twice, if needed, with a little random delay between. 1077 1078 for (my $iter=0; $iter<2; $iter++) { 1079 open(F, "$java -Xmx1g -showversion 2>&1 |"); 1080 @javaVersionStrings = <F>; 1081 chomp @javaVersionStrings; 1082 close(F); 1083 1084 foreach (@javaVersionStrings) { 1085 # First word is either "java" or "openjdk" or ... 1086 if (m/^.*\s+version\s+\"(\d+\.*\d*)(.*)\".*$/) { 1087 $versionStr = "$1$2"; 1088 $version = $1; 1089 } 1090 if (m/-d64/) { 1091 setGlobal("javaUse64Bit", 1); 1092 } 1093 } 1094 close(F); 1095 1096 last if ($version >= 1.8); 1097 1098 print STDERR "-- Failed Java version check.\n"; 1099 print STDERR "-- '$javaVersionStrings[0]'\n" if (length($javaVersionStrings[0]) > 0); 1100 print STDERR "-- '$javaVersionStrings[1]'\n" if (length($javaVersionStrings[1]) > 0); 1101 print STDERR "-- '$javaVersionStrings[2]'\n" if (length($javaVersionStrings[2]) > 0); 1102 print STDERR "-- '$javaVersionStrings[3]'\n" if (length($javaVersionStrings[3]) > 0); 1103 print STDERR "--\n"; 1104 print STDERR "-- Trying again.\n"; 1105 print STDERR "--\n"; 1106 1107 sleep(int(rand(3)+1)); 1108 } 1109 setGlobal("javaUse64Bit", 0) if (!defined(getGlobal("javaUse64Bit"))); 1110 1111 if ($version < 1.8) { 1112 addCommandLineError("ERROR: mhap overlapper requires java version at least 1.8.0; you have $versionStr (from '$java').\n"); 1113 addCommandLineError("ERROR: '$java -Xmx1g -showversion' reports:\n"); 1114 1115 for (my $ii=0; (($ii<20) && ($ii < scalar(@javaVersionStrings))); $ii++) { 1116 addCommandLineError("ERROR: '$javaVersionStrings[$ii]'\n"); 1117 } 1118 1119 } else { 1120 print STDERR "-- Detected Java(TM) Runtime Environment '$versionStr' (from '$java')"; 1121 print STDERR (defined(getGlobal("javaUse64Bit")) && getGlobal("javaUse64Bit") == 1) ? " with " : " without "; 1122 print STDERR "-d64 support.\n"; 1123 } 1124} 1125 1126 1127 1128sub checkMinimap ($) { 1129 my $minimap = getGlobal("minimap"); 1130 my $version = undef; 1131 1132 return if ((getGlobal("corOverlapper") ne "minimap") && 1133 (getGlobal("obtOverlapper") ne "minimap") && 1134 (getGlobal("utgOverlapper") ne "minimap")); 1135 1136 if ($minimap =~ m/^\./) { 1137 addCommandLineError("ERROR: path to minimap2 '$minimap' must not be a relative path.\n"); 1138 goto cleanupMinimap; 1139 } 1140 1141 system("cd /tmp && $minimap --version > /tmp/minimap2-$$.err 2>&1"); 1142 1143 open(F, "< /tmp/minimap2-$$.err"); 1144 while (<F>) { 1145 $version = $1 if ($_ =~ m/^(2.*$)/); 1146 } 1147 close(F); 1148 1149 if (!defined($version)) { 1150 addCommandLineError("ERROR: failed to run minimap2 using command '$minimap'.\n"); 1151 goto cleanupMinimap; 1152 } 1153 1154 print STDERR "-- Detected minimap2 version '$version' (from '$minimap').\n"; 1155 1156 cleanupMinimap: 1157 unlink "/tmp/minimap2-$$.err"; 1158} 1159 1160 1161 1162sub checkGnuplot () { 1163 my $gnuplot = getGlobal("gnuplot"); 1164 my $format = getGlobal("gnuplotImageFormat"); 1165 my $version = undef; 1166 1167 if (($gnuplot eq undef) || 1168 ($gnuplot eq "")) { 1169 print STDERR "-- No path to gnuplot executable. Plots disabled.\n"; 1170 goto cleanupGnuplot; 1171 } 1172 1173 if ($gnuplot =~ m/^\./) { 1174 addCommandLineError("ERROR: path to gnuplot '$gnuplot' must not be a relative path.\n"); 1175 goto cleanupGnuplot; 1176 } 1177 1178 # Explicitly set pager to avoid having output corrupted by "Press enter..." 1179 1180 $ENV{"PAGER"} = "cat"; 1181 1182 # Check for existence of gnuplot. 1183 1184 open(F, "> /tmp/gnuplot-$$-test.gp"); 1185 print F "show version long\n"; 1186 print F "set terminal\n"; 1187 close(F); 1188 1189 system("cd /tmp && $gnuplot < /dev/null /tmp/gnuplot-$$-test.gp > /tmp/gnuplot-$$-test.err 2>&1"); 1190 1191 open(F, "< /tmp/gnuplot-$$-test.err"); 1192 while (<F>) { 1193 $version = $1 if ($_ =~ m/^\s*[vV]ersion\s+(.*)/); 1194 $version = $1 if ($_ =~ m/^\s*[vV]ersion\s+(.*)\s+last/); 1195 } 1196 close(F); 1197 1198 if (!defined($version)) { 1199 print STDERR "--\n"; 1200 print STDERR "-- WARNING:\n"; 1201 print STDERR "-- WARNING: Failed to run gnuplot using command '$gnuplot'.\n"; 1202 print STDERR "-- WARNING: Plots will be disabled.\n"; 1203 print STDERR "-- WARNING:\n"; 1204 print STDERR "--\n"; 1205 1206 goto cleanupGnuplot; 1207 } 1208 1209 # Check for existence of a decent output format. Need to redirect in /dev/null to make gnuplot 1210 # not use it's builtin pager. 1211 1212 if (!defined($format)) { 1213 my $havePNG = 0; 1214 my $haveSVG = 0; 1215 my $haveGIF = 0; 1216 1217 open(F, "< /tmp/gnuplot-$$-test.err"); 1218 while (<F>) { 1219 s/^\s+//; 1220 s/\s+$//; 1221 1222 my @t = split '\s+', $_; 1223 1224 $havePNG = 1 if ($t[0] eq 'png'); 1225 $haveSVG = 1 if ($t[0] eq 'svg'); 1226 $haveGIF = 1 if ($t[0] eq 'gif'); 1227 } 1228 close(F); 1229 1230 $format = "gif" if ($haveGIF); 1231 $format = "svg" if ($haveSVG); 1232 $format = "png" if ($havePNG); 1233 1234 setGlobal("gnuplotImageFormat", $format); 1235 } 1236 1237 if (!defined($format)) { 1238 print STDERR "--\n"; 1239 print STDERR "-- WARNING:\n"; 1240 print STDERR "-- WARNING: Failed to detect a suitable output format for gnuplot. Looked for png, svg\n"; 1241 print STDERR "-- WARNING: and gif; found none of them. Specify a format with gnuplotImageFormat=<type>,\n"; 1242 print STDERR "-- WARNING: or set 'gnuplot=undef' to disable gnuplot entirely. Plots will be disabled.\n"; 1243 print STDERR "-- WARNING:\n"; 1244 print STDERR "--\n"; 1245 1246 goto cleanupGnuplot; 1247 } 1248 1249 # Test if we can actually make images. 1250 1251 open(F, "> /tmp/gnuplot-$$-test.gp"); 1252 print F "set title 'gnuplot test'\n"; 1253 print F "set xlabel 'X'\n"; 1254 print F "set xlabel 'Y'\n"; 1255 print F "\n"; 1256 print F "set terminal $format size 1024,1024\n"; 1257 print F "set output '/tmp/gnuplot-$$-test.1.$format'\n"; 1258 print F "\n"; 1259 print F "plot [-30:20] sin(x*20) * atan(x)\n\n"; 1260 print F "\n"; 1261 print F "set terminal $format size 256,256\n"; 1262 print F "set output '/tmp/gnuplot-$$-test.2.$format'\n"; 1263 print F "\n"; 1264 print F "bogus line\n"; 1265 close(F); 1266 1267 system("cd /tmp && $gnuplot < /dev/null /tmp/gnuplot-$$-test.gp > /tmp/gnuplot-$$-test.err 2>&1"); 1268 1269 if ((! -e "/tmp/gnuplot-$$-test.1.$format") || 1270 (! -e "/tmp/gnuplot-$$-test.2.$format")) { 1271 1272 print STDERR "--\n"; 1273 print STDERR "-- WARNING:\n"; 1274 print STDERR "-- WARNING: gnuplot failed to generate images. Specify a format with gnuplotImageFormat=<type>,\n"; 1275 print STDERR "-- WARNING: or set 'gnuplot=undef' to disable gnuplot entirely. Plots will be disabled.\n"; 1276 print STDERR "-- WARNING:\n"; 1277 print STDERR "-- WARNING: gnuplot reports:\n"; 1278 1279 open(F, "< /tmp/gnuplot-$$-test.err"); 1280 while (<F>) { 1281 chomp; 1282 print STDERR "-- WARNING: $_\n"; 1283 } 1284 close(F); 1285 1286 print STDERR "--\n"; 1287 1288 goto cleanupGnuplot; 1289 } 1290 1291 # Yay, gnuplot works! 1292 1293 print STDERR "-- Detected gnuplot version '$version' (from '$gnuplot') and image format '$format'.\n"; 1294 1295 cleanupGnuplot: 1296 unlink "/tmp/gnuplot-$$-test.gp"; 1297 unlink "/tmp/gnuplot-$$-test.err"; 1298 unlink "/tmp/gnuplot-$$-test.1.$format"; 1299 unlink "/tmp/gnuplot-$$-test.2.$format"; 1300} 1301 1302 1303 1304# Converts number with units to gigabytes. If no units, gigabytes is assumed. 1305sub adjustMemoryValue ($) { 1306 my $val = shift @_; 1307 1308 return(undef) if (!defined($val)); 1309 1310 return($1) if ($val =~ m/^(\d+\.{0,1}\d*)$/); 1311 return($1 / 1024 / 1024) if ($val =~ m/^(\d+\.{0,1}\d*)[kK]$/); 1312 return($1 / 1024) if ($val =~ m/^(\d+\.{0,1}\d*)[mM]$/); 1313 return($1) if ($val =~ m/^(\d+\.{0,1}\d*)[gG]$/); 1314 return($1 * 1024) if ($val =~ m/^(\d+\.{0,1}\d*)[tT]$/); 1315 return($1 * 1024 * 1024) if ($val =~ m/^(\d+\.{0,1}\d*)[pP]$/); 1316 1317 die "Invalid memory value '$val'\n"; 1318} 1319 1320 1321# Converts gigabytes to number with units. 1322sub displayMemoryValue ($) { 1323 my $val = shift @_; 1324 1325 return(($val * 1024 * 1024) . "k") if ($val < adjustMemoryValue("1m")); 1326 return(($val * 1024) . "m") if ($val < adjustMemoryValue("1g")); 1327 return(($val) . "g") if ($val < adjustMemoryValue("1t")); 1328 return(($val / 1024) . "t"); 1329} 1330 1331 1332# Converts number with units to bases. 1333sub adjustGenomeSize ($) { 1334 my $val = shift @_; 1335 1336 return(undef) if (!defined($val)); 1337 1338 return($1) if ($val =~ m/^(\d+\.{0,1}\d*)$/i); 1339 return($1 * 1000) if ($val =~ m/^(\d+\.{0,1}\d*)[kK]$/i); 1340 return($1 * 1000000) if ($val =~ m/^(\d+\.{0,1}\d*)[mM]$/i); 1341 return($1 * 1000000000) if ($val =~ m/^(\d+\.{0,1}\d*)[gG]$/i); 1342 return($1 * 1000000000000) if ($val =~ m/^(\d+\.{0,1}\d*)[tT]$/i); 1343 1344 die "Invalid genome size '$val'\n"; 1345} 1346 1347 1348# Converts bases to number with units. 1349sub displayGenomeSize ($) { 1350 my $val = shift @_; 1351 1352 return(($val)) if ($val < adjustGenomeSize("1k")); 1353 return(($val / 1000) . "k") if ($val < adjustGenomeSize("1m")); 1354 return(($val / 1000000) . "m") if ($val < adjustGenomeSize("1g")); 1355 return(($val / 1000000000) . "g") if ($val < adjustGenomeSize("1t")); 1356 return(($val / 1000000000000) . "t"); 1357} 1358 1359 1360 1361sub checkParameters () { 1362 1363 # 1364 # Fiddle with filenames to make them absolute paths. 1365 # 1366 1367 makeAbsolute("corOvlFrequentMers"); 1368 makeAbsolute("obtOvlFrequentMers"); 1369 makeAbsolute("utgOvlFrequentMers"); 1370 1371 # 1372 # Adjust case on some of them 1373 # 1374 1375 fixCase("corOverlapper"); 1376 fixCase("obtOverlapper"); 1377 fixCase("utgOverlapper"); 1378 1379 fixCase("corConsensus"); 1380 fixCase("cnsConsensus"); 1381 1382 fixCase("corFilter"); 1383 1384 fixCase("unitigger"); 1385 fixCase("stopAfter"); 1386 1387 fixCase("gridEngine", "upper"); # These want to be uppercase, grrrr! 1388 fixCase("objectStore", "upper"); 1389 1390 # 1391 # Convert things with units to just values. The other memory sizes are 1392 # 'adjusted' in Configure.sh. 1393 # 1394 1395 setGlobal("genomeSize", adjustGenomeSize(getGlobal("genomeSize"))); 1396 1397 setGlobal("minMemory", adjustMemoryValue(getGlobal("minMemory"))); 1398 setGlobal("maxMemory", adjustMemoryValue(getGlobal("maxMemory"))); 1399 1400 setGlobal("executiveMemory", adjustMemoryValue(getGlobal("executiveMemory"))); 1401 1402 if (getGlobal("executiveMemory") < getGlobal("minMemory")) { # Silently bump up execMemory to minMemory 1403 setGlobal("executiveMemory", getGlobal("minMemory")); # if needed. 1404 } 1405 1406 # 1407 # Check for inconsistent parameters 1408 # 1409 1410 my $gs = getGlobal("genomeSize"); 1411 1412 addCommandLineError("ERROR: Required parameter 'genomeSize' not set.\n") if (!defined($gs)); 1413 addCommandLineError("ERROR: Implausibly small genome size $gs. Check units!\n") if ($gs < 1000); 1414 1415 # 1416 # If we're running as a job array, unset the ID of the job array. This screws 1417 # up our scheduling, as our jobs think they're running in a task array. 1418 # 1419 # Silly SGE sets this to 'undefined' for normal jobs. 1420 # 1421 1422 if (exists($ENV{getGlobal("gridEngineTaskID")})) { 1423 my $ja = $ENV{getGlobal("gridEngineTaskID")}; 1424 1425 if (($ja ne "undefined") && 1426 ($ja ne "0")) { 1427 print STDERR "--\n"; 1428 print STDERR "-- I appear to be task $ja in a job array, unsetting ", getGlobal("gridEngineTaskID"), ".\n"; 1429 } 1430 1431 undef $ENV{getGlobal("gridEngineTaskID")}; 1432 } 1433 1434 # Undefined error rate are OK; we'll set them to defaults later. 1435 # Non-numeric or negative (or too positive) rates are definitely bad. 1436 1437 foreach my $var ("corOvlErrorRate", "obtOvlErrorRate", "utgOvlErrorRate", "corErrorRate", "obtErrorRate", "utgErrorRate", "cnsErrorRate") { 1438 if (!defined(getGlobal($var))) { 1439 } 1440 elsif (getGlobal($var) !~ m/^[.-0123456789]+$/) { 1441 addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be numeric\n"); 1442 } 1443 elsif ((getGlobal($var) < 0.0) || (getGlobal($var) > 1.0)) { 1444 addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be at least 0.0 and no more than 1.0\n"); 1445 } 1446 } 1447 1448 foreach my $var ("corOvlMerDistinct", "obtOvlMerDistinct", "utgOvlMerDistinct") { 1449 if (!defined(getGlobal($var))) { 1450 } 1451 elsif (getGlobal($var) !~ m/^[.-0123456789]+$/) { 1452 addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be numeric\n"); 1453 } 1454 elsif ((getGlobal($var) < 0.0) || (getGlobal($var) > 1.0)) { 1455 addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be at least 0.0 and no more than 1.0\n"); 1456 } 1457 } 1458 1459 foreach my $var ("corOvlMerThreshold", "obtOvlMerThreshold", "utgOvlMerThreshold") { 1460 if (!defined(getGlobal($var))) { 1461 } 1462 elsif (getGlobal($var) !~ m/^[0123456789]+$/) { 1463 addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be an integer\n"); 1464 } 1465 elsif (getGlobal($var) < 2) { 1466 addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be at least 2\n"); 1467 } 1468 } 1469 1470 if (getGlobal("minReadLength") < getGlobal("minOverlapLength")) { 1471 my $mr = getGlobal("minReadLength"); 1472 my $mo = getGlobal("minOverlapLength"); 1473 1474 addCommandLineError("ERROR: minReadLength=$mr must be at least minOverlapLength=$mo.\n"); 1475 } 1476 1477 foreach my $var ("corOutCoverage") { 1478 if (!defined(getGlobal($var))) { 1479 addCommandLineError("ERROR: Invalid 'corOutCoverage' specified; must be at least 1.0\n"); 1480 } 1481 elsif (getGlobal($var) =~ m/all/i) { 1482 setGlobal($var, 9999); 1483 } 1484 elsif (getGlobal($var) !~ m/^[.-0123456789]+$/) { 1485 addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be numeric\n"); 1486 } 1487 elsif (getGlobal($var) < 1.0) { 1488 addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be at least 1.0\n"); 1489 } 1490 } 1491 1492 foreach my $var ("corMaxEvidenceCoverageGlobal", "corMaxEvidenceCoverageLocal") { 1493 if (!defined(getGlobal($var))) { 1494 # If undef, defaults to corOutCoverage in CorrectReads.pm 1495 } 1496 elsif (getGlobal($var) =~ m/^(\d*\.*\d*)(x*)$/) { 1497 if (($1 < 1.0) && ($2 ne "x")) { 1498 addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be at least 1.0\n"); 1499 } 1500 } 1501 else { 1502 addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be numeric\n"); 1503 } 1504 } 1505 1506 foreach my $var ("utgGraphDeviation", "utgRepeatDeviation", "utgRepeatConfusedBP", "minReadLength", "minOverlapLength") { 1507 setGlobal("utgGraphDeviation", 12) if $var eq "utgGraphDeviation" && !defined(getGlobal($var)); 1508 setGlobal("utgRepeatDeviation", 1) if $var eq "utgRepeatDeviation" && !defined(getGlobal($var)); 1509 1510 if (!defined(getGlobal($var))) { 1511 addCommandLineError("ERROR: Invalid '$var' specified; must be set\n"); 1512 } 1513 elsif (getGlobal($var) !~ m/^[.-0123456789]+$/) { 1514 addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be numeric\n"); 1515 } 1516 elsif (getGlobal($var) < 0.0) { 1517 addCommandLineError("ERROR: Invalid '$var' specified (" . getGlobal("$var") . "); must be at least 0.0\n"); 1518 } 1519 } 1520 1521 if ((getGlobal("utgChimeraType") ne "none") && 1522 (getGlobal("utgChimeraType") ne "chimer") && 1523 (getGlobal("utgChimeraType") ne "uncovered") && 1524 (getGlobal("utgChimeraType") ne "deadend")) { 1525 addCommandLineError("ERROR: Invalid 'utgChimeraType' specified (" . getGlobal("utgChimeraType") . "); must be 'none', 'chimer', 'uncovered' or 'deadend'\n"); 1526 } 1527 1528 # 1529 # Check for invalid usage 1530 # 1531 1532 foreach my $tag ("cor", "obt", "utg") { 1533 if ((getGlobal("${tag}Overlapper") ne "mhap") && 1534 (getGlobal("${tag}Overlapper") ne "ovl") && 1535 (getGlobal("${tag}Overlapper") ne "minimap")) { 1536 addCommandLineError("ERROR: Invalid '${tag}Overlapper' specified (" . getGlobal("${tag}Overlapper") . "); must be 'mhap', 'ovl', or 'minimap'\n"); 1537 } 1538 } 1539 1540 foreach my $tag ("cor", "obt", "utg") { 1541 if (getGlobal("${tag}MhapSensitivity") eq "fast") { 1542 print STDERR "WARNING: deprecated ${tag}NhapSensitivity=fast replaced with ${tag}MhapSensitivity=low\n"; 1543 } 1544 1545 if ((getGlobal("${tag}MhapSensitivity") ne undef) && 1546 (getGlobal("${tag}MhapSensitivity") ne "low") && 1547 (getGlobal("${tag}MhapSensitivity") ne "normal") && 1548 (getGlobal("${tag}MhapSensitivity") ne "high")) { 1549 addCommandLineError("ERROR: Invalid '${tag}MhapSensitivity' specified (" . getGlobal("${tag}MhapSensitivity") . "); must be 'fast' or 'normal' or 'high'\n"); 1550 } 1551 } 1552 1553 if (getGlobal("unitigger") ne "bogart") { 1554 addCommandLineError("ERROR: Invalid 'unitigger' specified (" . getGlobal("unitigger") . "); must be 'bogart'\n"); 1555 } 1556 1557 if ((getGlobal("corConsensus") ne "falcon")) { 1558 addCommandLineError("ERROR: Invalid 'corConsensus' specified (" . getGlobal("corConsensus") . "); must be 'utgcns' or 'falcon' or 'falconpipe'\n"); 1559 } 1560 1561 if ((getGlobal("cnsConsensus") ne "quick") && 1562 (getGlobal("cnsConsensus") ne "pbdagcon") && 1563 (getGlobal("cnsConsensus") ne "utgcns")) { 1564 addCommandLineError("ERROR: Invalid 'cnsConsensus' specified (" . getGlobal("cnsConsensus") . "); must be 'quick', 'pbdagcon', or 'utgcns'\n"); 1565 } 1566 1567 if (!defined(getGlobal("maxInputCoverage"))) { 1568 setGlobal("maxInputCoverage", 200); 1569 } 1570 if (getGlobal("maxInputCoverage") eq "all") { 1571 setGlobal("maxInputCoverage", 0); 1572 } 1573 if ((getGlobal("maxInputCoverage") > 0) && (getGlobal("maxInputCoverage") < getGlobal("minInputCoverage"))) { 1574 my $minc = getGlobal("minInputCoverage"); 1575 my $maxc = getGlobal("maxInputCoverage"); 1576 1577 addCommandLineError("ERROR: minInputCoverage ($minc) must be less than maxInputCoverage ($maxc).\n"); 1578 } 1579 if ((getGlobal("maxInputCoverage") > 0) && (getGlobal("maxInputCoverage") < getGlobal("stopOnLowCoverage"))) { 1580 my $minc = getGlobal("stopOnLowCoverage"); 1581 my $maxc = getGlobal("maxInputCoverage"); 1582 1583 addCommandLineError("ERROR: stopOnLowCoverage ($minc) must be less than maxInputCoverage ($maxc).\n"); 1584 } 1585 1586 if ((getGlobal("saveOverlaps") ne "0") && 1587 (getGlobal("saveOverlaps") ne "1")) { 1588 addCommandLineError("ERROR: Invalid 'saveOverlaps' specified (" . getGlobal("saveOverlaps") . "); must be 'true' or 'false'\n"); 1589 } 1590 1591 if ((getGlobal("purgeOverlaps") eq "0") || 1592 (getGlobal("purgeOverlaps") eq "no")) { 1593 setGlobal("purgeOverlaps", "never"); 1594 } 1595 1596 if ((getGlobal("purgeOverlaps") eq "1") || 1597 (getGlobal("purgeOverlaps") eq "yes")) { 1598 setGlobal("purgeOverlaps", "normal"); 1599 } 1600 1601 if ((getGlobal("purgeOverlaps") ne "never") && 1602 (getGlobal("purgeOverlaps") ne "normal") && 1603 (getGlobal("purgeOverlaps") ne "aggressive") && 1604 (getGlobal("purgeOverlaps") ne "dangerous")) { 1605 addCommandLineError("ERROR: Invalid 'purgeOverlaps' specified (" . getGlobal("purgeOverlaps") . "); must be 'never', 'normal', 'aggressive' or 'dangerous'\n"); 1606 } 1607 1608 if ((getGlobal("corFilter") ne "quick") && 1609 (getGlobal("corFilter") ne "expensive") && 1610 (getGlobal("corFilter") ne "none")) { 1611 addCommandLineError("ERROR: Invalid 'corFilter' specified (" . getGlobal("corFilter") . "); must be 'none' or 'quick' or 'expensive'\n"); 1612 } 1613 1614 1615 if ((getGlobal("useGrid") ne "0") && 1616 (getGlobal("useGrid") ne "1") && 1617 (getGlobal("useGrid") ne "remote")) { 1618 addCommandLineError("ERROR: Invalid 'useGrid' specified (" . getGlobal("useGrid") . "); must be 'true', 'false' or 'remote'\n"); 1619 } 1620 1621 1622 if (defined(getGlobal("stopAfter"))) { 1623 my $ok = 0; 1624 my $st = getGlobal("stopAfter"); 1625 $st =~ tr/A-Z/a-z/; 1626 1627 $st = "correction" if ($st eq "readcorrection"); # Update the string to allow deprecated usage. 1628 $st = "trimming" if ($st eq "readtrimming"); 1629 1630 my $failureString = "ERROR: Invalid stopAfter specified (" . getGlobal("stopAfter") . "); must be one of:\n"; 1631 1632 my @stopAfter = ("sequenceStore", 1633 "meryl-configure", 1634 "meryl-count", 1635 "meryl-merge", 1636 "meryl-process", 1637 "meryl-subtract", 1638 "meryl", 1639 "haplotype-configure", 1640 "haplotype", 1641 "overlapConfigure", 1642 "overlap", 1643 "overlapStoreConfigure", 1644 "overlapStore", 1645 "correctionConfigure", 1646 "correction", 1647 "trimming", 1648 "unitig", 1649 "consensusConfigure", 1650 "consensus"); 1651 1652 foreach my $sa (@stopAfter) { 1653 $failureString .= "ERROR: '$sa'\n"; 1654 1655 $sa =~ tr/A-Z/a-z/; 1656 1657 if ($st eq $sa) { 1658 $ok++; 1659 setGlobal('stopAfter', $st); 1660 } 1661 } 1662 1663 addCommandLineError($failureString) if ($ok == 0); 1664 } 1665 1666 { 1667 my @v = split '\s+', getGlobal("contigFilter"); 1668 1669 if (scalar(@v) != 5) { 1670 addCommandLineError("contigFilter must have five values: minReads minLength singleReadSpan lowCovFraction lowCovDepth\n"); 1671 } 1672 1673 addCommandLineError("contigFilter 'minReads' must be a positive integer, currently $v[0]\n") if (($v[0] < 0) || ($v[0] !~ m/^[0-9]+$/)); 1674 addCommandLineError("contigFilter 'minLength' must be a positive integer, currently $v[1]\n") if (($v[1] < 0) || ($v[1] !~ m/^[0-9]+$/)); 1675 addCommandLineError("contigFilter 'singleReadSpan' must be between 0.0 and 1.0, currently $v[2]\n") if (($v[2] < 0) || (1 < $v[2]) || ($v[2] !~ m/^[0-9]*\.{0,1}[0-9]*$/)); 1676 addCommandLineError("contigFilter 'lowCovFraction' must be between 0.0 and 1.0, currently $v[3]\n") if (($v[3] < 0) || (1 < $v[3]) || ($v[3] !~ m/^[0-9]*\.{0,1}[0-9]*$/)); 1677 addCommandLineError("contigFilter 'lowCovDepth' must be a positive integer, currently $v[4]\n") if (($v[4] < 0) || ($v[4] !~ m/^[0-9]+$/)); 1678 } 1679} 1680 1681 16821; 1683