1
2###############################################################################
3 #
4 #  This file is part of canu, a software program that assembles whole-genome
5 #  sequencing reads into contigs.
6 #
7 #  This software is based on:
8 #    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 #    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 #
11 #  Except as indicated otherwise, this is a 'United States Government Work',
12 #  and is released in the public domain.
13 #
14 #  File 'README.licenses' in the root directory of this distribution
15 #  contains full conditions and disclaimers.
16 ##
17
18package canu::Defaults;
19
20require Exporter;
21
22@ISA    = qw(Exporter);
23@EXPORT = qw(getCommandLineOptions
24             addCommandLineOption
25             removeHaplotypeOptions
26             addCommandLineError
27             writeLog
28             diskSpace
29             printOptions
30             printHelp
31             printCitation
32             setParametersFromFile
33             setParametersFromCommandLine
34             checkJava
35             checkMinimap
36             checkGnuplot
37             adjustMemoryValue
38             displayMemoryValue
39             adjustGenomeSize
40             displayGenomeSize
41             checkParameters
42             getGlobal
43             setGlobal
44             setGlobalIfUndef
45             setDefaults
46             setVersion);
47
48use strict;
49use warnings "all";
50no  warnings "uninitialized";
51
52use Cwd qw(getcwd abs_path);
53use Carp qw(cluck);
54use Sys::Hostname;
55use Text::Wrap;
56use File::Basename;   #  dirname
57
58#  Except for some direct access in setDefaults(), the only allowed access
59#  method to these is through setGlobal() and getGlobal().
60#
61my %global;    #  $global{$key} = parameter value           ($key for all of
62my %synops;    #  $synops{$key} = parameter description      these is always
63my %synnam;    #  $synnam{$key} = case-preserved key name    lowercase)
64
65my @cLineOpts;
66my $specLog   = "";
67
68
69
70#  Helper function to append a command line error to the list of command line
71#  errors.
72#
73sub addCommandLineError($) {
74    $global{'errors'} .= shift @_;
75}
76
77
78#  Get the value of a parameter.  The parameter is case insensitive.
79#
80#  We cannot use the normal caFailure here due to a cyclic 'use' structure,
81#  and are forced to fail ungracefully on errors.  These errors should not
82#  depend on user input.
83#
84sub getGlobal ($) {
85    my $var = shift @_;
86    $var =~ tr/A-Z/a-z/;
87
88    if (!exists($global{$var})) {
89        print STDERR "================================================================================\n";
90        print STDERR "Unknown parameter '$var' accessed.  Stack trace:\n";
91        cluck;
92        exit(1);
93    }
94
95    return($global{$var});
96}
97
98
99sub setGlobalIfUndef ($$) {
100    my $var = shift @_;
101    my $val = shift @_;
102
103    if (!defined(getGlobal($var))) {
104        setGlobal($var, $val);
105    }
106}
107
108
109#  Set the value of a parameter.  The parameter is case insensitive.
110#
111#  This is a bit complicated by handling of parameter alises (meta-options)
112#  and handling of deprecated options.
113#
114sub setGlobal ($$);   #  A prototype so we can call ourself recursively.
115sub setGlobal ($$) {
116    my $VAR = shift @_;
117    my $var = $VAR;
118    my $val = shift @_;
119
120    $var =~ tr/A-Z/a-z/;
121
122    #  Map undef/empty string and true/false to nicer values.
123
124    $val = undef  if (($val eq "undef")     || ($val eq ""));
125    $val = 0      if (($val =~ m/^false$/i) || ($val =~ m/^f$/i));
126    $val = 1      if (($val =~ m/^true$/i)  || ($val =~ m/^t$/i));
127
128    #
129    #  Handle real options first.  If there is a key in %global, it's not a
130    #  meta-option and we can set it and get out of here.
131    #
132
133    if (exists($global{$var})) {
134        $global{$var} = $val;
135        return;
136    }
137
138    #
139    #  Handle meta-options.  These options are aliases for three other
140    #  options, for example:
141    #    ovlMemory -> corOvlMemory and obtOvlMemory and utgOvlMemory
142    #
143    #  They all follow this standard format, except gridOptions, which wants
144    #  to insert the stage name in the middle of the option:
145    #    gridOptionsOVL -> gridOptionsCOROVL.
146    #
147    #  Note the recursive call here.
148    #
149
150    if ($var eq "gridoptionsovl")  { setGlobal("gridOptionsCORovl",  $val);  setGlobal("gridOptionsOBTovl",  $val);  setGlobal("gridOptionsUTGovl",  $val);  return; }
151    if ($var eq "gridoptionsmhap") { setGlobal("gridOptionsCORmhap", $val);  setGlobal("gridOptionsOBTmhap", $val);  setGlobal("gridOptionsUTGmhap", $val);  return; }
152    if ($var eq "gridoptionsmmap") { setGlobal("gridOptionsCORmmap", $val);  setGlobal("gridOptionsOBTmmap", $val);  setGlobal("gridOptionsUTGmmap", $val);  return; }
153
154    foreach my $opt ("ovlmemory",      "mhapmemory",      "mmapmemory",      #  Execution options
155                     "ovlthreads",     "mhapthreads",     "mmapthreads",
156                     "ovlconcurrency", "mhapconcurrency", "mmapconcurrency",
157                     "overlapper",                                           #  Overlap algorithm selection
158                     "realign",
159                     "ovlerrorrate",                                         #  Overlapper options
160                     "ovlhashblocklength",
161                     "ovlrefblocklength",
162                     "ovlhashbits",
163                     "ovlhashload",
164                     "ovlmersize",
165                     "ovlmerthreshold",
166                     "ovlmerdistinct",
167                     "ovlfrequentmers",
168                     "mhapblocksize",                                        #  Mhap options
169                     "mhapmersize",
170                     "mhapsensitivity",
171                     "mhapfilterunique",
172                     "mhapfilterthreshold",
173                     "mhapnotf",
174                     "mhappipe",
175                     "mmapblocksize",                                        #  Minimap options
176                     "mmapmersize") {
177        if ($var eq "$opt") {
178            setGlobal("cor$opt", $val);
179            setGlobal("obt$opt", $val);
180            setGlobal("utg$opt", $val);
181            return;
182        }
183    }
184
185    if ($var eq "rawerrorrate") {
186        setGlobalIfUndef("corErrorRate",    $val);   setGlobalIfUndef("corOvlErrorRate", $val);
187        return;
188    }
189
190    if ($var eq "correctederrorrate") {
191        setGlobalIfUndef("obtErrorRate",    $val);   setGlobalIfUndef("obtOvlErrorRate", $val);
192        setGlobalIfUndef("utgErrorRate",    $val);   setGlobalIfUndef("utgOvlErrorRate", $val);
193        setGlobalIfUndef("cnsErrorRate",    $val);
194        return;
195    }
196
197    #
198    #  Replace obsolete options.
199    #
200
201    if ($var eq "readsamplingcoverage") {
202        print STDERR "--\n";
203        print STDERR "--  WARNING:  Deprecated option 'readSamplingCoverage' supplied.\n";
204        print STDERR "--  WARNING:  Use 'maxInputCoverage' instead.\n";
205        print STDERR "--  WARNING:  'readSamplingCoverage' will be removed in the next release.\n";
206        print STDERR "--\n";
207
208        setGlobal("maxInputCoverage", $val);
209        return;
210    }
211
212    #
213    #  If here, we got a parameter we don't know about.  Let the usual error
214    #  handling handle it since this should only occur when parsing user
215    #  options (command line or spec file).
216    #
217
218    addCommandLineError("ERROR:  Parameter '$VAR' is not known.\n");
219}
220
221
222
223
224
225sub getCommandLineOptions () {
226    my $cLineOpts = join ' ', @cLineOpts;
227
228    return((wantarray) ? @cLineOpts : $cLineOpts);
229}
230
231
232
233sub addCommandLineOption ($) {
234    my $opt = shift @_;
235
236    return   if ($opt =~ m/canuIteration=/);   #  Ignore canu resetting canuIteration
237
238    push @cLineOpts, $opt;
239    #$cLineOpts .= " "   if (defined($cLineOpts) && ($cLineOpts !~ m/\s$/));
240    #$cLineOpts .= $opt;
241}
242
243
244
245sub removeHaplotypeOptions () {
246    my @strippedOpts;
247
248    my $haveRaw           = 0;
249    my $haveCorrected     = 0;
250
251    my $haveTrimmed       = 0;
252
253    my $setUpForPacBio    = 0;
254    my $setUpForNanopore  = 0;
255    my $setUpForHiFi      = 0;
256
257    #  A very specialized function.  Remove all the sequence file options,
258    #  both long reads and short reads used for haplotyping, from the list of
259    #  command line options.  Then return a string appropriate for assembling
260    #  the haplotyped reads.
261    #
262    #  Note that when this is called a second (third, etc) time, the
263    #  semantics are different (no haplotype options to remove) but the end
264    #  result is the same.
265
266    foreach my $opt (@cLineOpts) {
267        # for v1.*
268        if ($opt =~ m/^-pacbio-raw\s/)          { $haveRaw++;          $setUpForPacBio++;     next; }
269        if ($opt =~ m/^-pacbio-corrected\s/)    { $haveCorrected++;    $setUpForPacBio++;     next; }
270        if ($opt =~ m/^-nanopore-raw\s/)        { $haveRaw++;          $setUpForNanopore++;   next; }
271        if ($opt =~ m/^-nanopore-corrected\s/)  { $haveCorrected++;    $setUpForNanopore++;   next; }
272
273        if ($opt =~ m/^-raw\s/)                 { $haveRaw++;          next; }
274        if ($opt =~ m/^-corrected\s/)           { $haveCorrected++;    next; }
275
276        if ($opt =~ m/^-trimmed\s/)             { $haveTrimmed++;      next; }
277
278        if ($opt =~ m/^-pacbio\s/)              { $setUpForPacBio++;   next; }
279        if ($opt =~ m/^-nanopore\s/)            { $setUpForNanopore++; next; }
280        if ($opt =~ m/^-pacbio-hifi\s/)         { $setUpForHiFi++;     next; }
281
282        if ($opt =~ m/^-haplotype/)             {                      next; }
283        if ($opt =~ m/^-d\s/)                   {                      next; }
284        if ($opt =~ m/^-p\s/)                   {                      next; }
285
286        push @strippedOpts, $opt;
287    }
288
289    @cLineOpts = @strippedOpts;
290
291    #  Now figure out what to load the haplotyped reads as.
292
293    my $tech = "";
294
295    #  Either raw or corrected (or hifi).
296
297    if    ($haveRaw)           {  $tech .= " ";             }    # implicit
298    elsif ($haveCorrected)     {  $tech .= "-corrected ";   }
299
300    #  And they can be trimmed or not.
301
302    if    ($haveTrimmed)       {  $tech .= "-trimmed ";     }
303
304    #  Only one tech is allowed.
305
306    if    ($setUpForHiFi)      {  $tech .= "-pacbio-hifi";  }
307    elsif ($setUpForNanopore)  {  $tech .= "-nanopore";     }
308    elsif ($setUpForPacBio)    {  $tech .= "-pacbio";       }
309
310    return($tech);
311}
312
313
314
315sub writeLog () {
316    my $time = time();
317    my $host = hostname();
318    my $pid  = $$;
319
320    open(F, "> canu-logs/${time}_${host}_${pid}_canu");
321    print F $specLog;
322    close(F);
323}
324
325
326
327sub diskSpace ($) {
328    my  $dir                          = dirname($_[0]);
329    my ($total, $used, $free, $avail) = (0, 0, 0, 0);
330
331    if (-d $dir) {
332        open(DF, "df -P -k $dir |");
333        while (<DF>) {
334            chomp;
335
336            if (m/^(.*)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+%)\s+(.*)$/) {
337                $total = int($2 / 1048.576) / 1000;
338                $used  = int($3 / 1048.576) / 1000;
339                $free  = int($4 / 1048.576) / 1000;
340                $avail = int($4 / 1048.576) / 1000;  #  Possibly limited by quota?
341            }
342        }
343        close(DF);
344    }
345
346    #print STDERR "Disk space: total $total GB, used $used GB, free $free GB, available $avail GB\n";
347
348    return (wantarray) ? ($total, $used, $free, $avail) : $avail;
349}
350
351
352
353sub printOptions () {
354    my $pretty = 0;
355
356    #  Figure out the maximum length of the option names.
357    my $optLength = 0;
358    foreach my $key (keys %synnam) {
359        $optLength = length($synnam{$key})    if ($optLength < length($synnam{$key}));
360    }
361
362    #  Emit a nicely formatted list of options and a description of each.
363    foreach my $key (sort keys %synnam) {
364        my $optName  = $synnam{$key};
365        my $synopsis = $synops{$key};
366
367        next   if ($optName  eq "");
368        next   if ($synopsis eq "");
369
370
371        if ($pretty == 0) {
372            $optName .= " " x ($optLength - length($optName));
373
374            print "$optName$synopsis\n";
375        }
376        else {
377            $Text::Wrap::columns = 77;
378
379            $synopsis = wrap("        ", "        ", $synopsis);
380
381            print "\n";
382            print "$optName\n";
383            print "$synopsis\n";
384        }
385    }
386}
387
388
389sub printHelp (@) {
390    my $force  = shift @_;
391    my $errors = getGlobal("errors");
392
393    return   if (!defined($force) && !defined($errors));
394
395    print "\n";
396    print "usage:   canu [-version] [-citation] \\\n";
397    print "              [-haplotype | -correct | -trim | -assemble | -trim-assemble] \\\n";
398    print "              [-s <assembly-specifications-file>] \\\n";
399    print "               -p <assembly-prefix> \\\n";
400    print "               -d <assembly-directory> \\\n";
401    print "               genomeSize=<number>[g|m|k] \\\n";
402    print "              [other-options] \\\n";
403    print "              [-haplotype{NAME} illumina.fastq.gz] \\\n";
404    print "              [-corrected] \\\n";
405    print "              [-trimmed] \\\n";
406    print "              [-pacbio |\n";
407    print "               -nanopore |\n";
408    print "               -pacbio-hifi] file1 file2 ...\n";
409    print "\n";
410    print "example: canu -d run1 -p godzilla genomeSize=1g -nanopore-raw reads/*.fasta.gz \n";
411    print "\n";
412    print "\n";
413    print "  To restrict canu to only a specific stage, use:\n";
414    print "    -haplotype     - generate haplotype-specific reads\n";
415    print "    -correct       - generate corrected reads\n";
416    print "    -trim          - generate trimmed reads\n";
417    print "    -assemble      - generate an assembly\n";
418    print "    -trim-assemble - generate trimmed reads and then assemble them\n";
419    print "\n";
420    print "  The assembly is computed in the -d <assembly-directory>, with output files named\n";
421    print "  using the -p <assembly-prefix>.  This directory is created if needed.  It is not\n";
422    print "  possible to run multiple assemblies in the same directory.\n";
423    print "\n";
424    print "  The genome size should be your best guess of the haploid genome size of what is being\n";
425    print "  assembled.  It is used primarily to estimate coverage in reads, NOT as the desired\n";
426    print "  assembly size.  Fractional values are allowed: '4.7m' equals '4700k' equals '4700000'\n";
427    print "\n";
428    print "  Some common options:\n";
429    print "    useGrid=string\n";
430    print "      - Run under grid control (true), locally (false), or set up for grid control\n";
431    print "        but don't submit any jobs (remote)\n";
432    print "    rawErrorRate=fraction-error\n";
433    print "      - The allowed difference in an overlap between two raw uncorrected reads.  For lower\n";
434    print "        quality reads, use a higher number.  The defaults are 0.300 for PacBio reads and\n";
435    print "        0.500 for Nanopore reads.\n";
436    print "    correctedErrorRate=fraction-error\n";
437    print "      - The allowed difference in an overlap between two corrected reads.  Assemblies of\n";
438    print "        low coverage or data with biological differences will benefit from a slight increase\n";
439    print "        in this.  Defaults are 0.045 for PacBio reads and 0.144 for Nanopore reads.\n";
440    print "    gridOptions=string\n";
441    print "      - Pass string to the command used to submit jobs to the grid.  Can be used to set\n";
442    print "        maximum run time limits.  Should NOT be used to set memory limits; Canu will do\n";
443    print "        that for you.\n";
444    print "    minReadLength=number\n";
445    print "      - Ignore reads shorter than 'number' bases long.  Default: 1000.\n";
446    print "    minOverlapLength=number\n";
447    print "      - Ignore read-to-read overlaps shorter than 'number' bases long.  Default: 500.\n";
448    print "  A full list of options can be printed with '-options'.  All options can be supplied in\n";
449    print "  an optional sepc file with the -s option.\n";
450    print "\n";
451    print "  For TrioCanu, haplotypes are specified with the -haplotype{NAME} option, with any\n";
452    print "  number of haplotype-specific Illumina read files after.  The {NAME} of each haplotype\n";
453    print "  is free text (but only letters and numbers, please).  For example:\n";
454    print "    -haplotypeNANNY nanny/*gz\n";
455    print "    -haplotypeBILLY billy1.fasta.gz billy2.fasta.gz\n";
456    print "\n";
457    print "  Reads can be either FASTA or FASTQ format, uncompressed, or compressed with gz, bz2 or xz.\n";
458    print "\n";
459    print "  Reads are specified by the technology they were generated with, and any processing performed.\n";
460    print "\n";
461    print "  [processing]\n";
462    print "    -corrected\n";
463    print "    -trimmed\n";
464    print "\n";
465    print "  [technology]\n";
466    print "    -pacbio      <files>\n";
467    print "    -nanopore    <files>\n";
468    print "    -pacbio-hifi <files>\n";
469    print "\n";
470    print "Complete documentation at http://canu.readthedocs.org/en/latest/\n";
471    print "\n";
472
473    if ($errors) {
474        print "$errors";
475        print "\n";
476        exit(1);
477    } else {
478       exit(0);
479   }
480}
481
482
483sub printCitation ($$) {
484    my $prefix = shift @_;
485    my $mode   = shift @_;
486
487    if (($mode eq "canu") ||
488        ($mode eq "all")) {
489        print STDERR "${prefix}For 'standard' assemblies of PacBio or Nanopore reads:\n";
490        print STDERR "${prefix}  Koren S, Walenz BP, Berlin K, Miller JR, Phillippy AM.\n";
491        print STDERR "${prefix}  Canu: scalable and accurate long-read assembly via adaptive k-mer weighting and repeat separation.\n";
492        print STDERR "${prefix}  Genome Res. 2017 May;27(5):722-736.\n";
493        print STDERR "${prefix}  http://doi.org/10.1101/gr.215087.116\n";
494        print STDERR "${prefix}\n";
495    }
496
497    if (($mode eq "trio") ||
498        ($mode eq "all")) {
499        print STDERR "${prefix}For 'trio-binned' assemblies of PacBio or Nanopore reads:\n";
500        print STDERR "  ${prefix}Koren S, Rhie A, Walenz BP, Dilthey AT, Bickhart DM, Kingan SB, Hiendleder S, Williams JL, Smith TPL, Phillippy AM.\n";
501        print STDERR "  ${prefix}De novo assembly of haplotype-resolved genomes with trio binning.\n";
502        print STDERR "  ${prefix}Nat Biotechnol. 2018\n";
503        print STDERR "  ${prefix}https//doi.org/10.1038/nbt.4277\n";
504        print STDERR "${prefix}\n";
505    }
506
507    if (($mode eq "hicanu") ||
508        ($mode eq "all")) {
509        print STDERR "${prefix}For assemblies of PacBio HiFi reads:\n";
510        print STDERR "${prefix}  Nurk S, Walenz BP, Rhiea A, Vollger MR, Logsdon GA, Grothe R, Miga KH, Eichler EE, Phillippy AM, Koren S.\n";
511        print STDERR "${prefix}  HiCanu: accurate assembly of segmental duplications, satellites, and allelic variants from high-fidelity long reads.\n";
512        print STDERR "${prefix}  biorXiv. 2020.\n";
513        print STDERR "${prefix}  https://doi.org/10.1101/2020.03.14.992248\n";
514        print STDERR "${prefix}\n";
515    }
516
517    print STDERR "${prefix}Read and contig alignments during correction and consensus use:\n";
518    print STDERR "${prefix}  Šošic M, Šikic M.\n";
519    print STDERR "${prefix}  Edlib: a C/C ++ library for fast, exact sequence alignment using edit distance.\n";
520    print STDERR "${prefix}  Bioinformatics. 2017 May 1;33(9):1394-1395.\n";
521    print STDERR "${prefix}  http://doi.org/10.1093/bioinformatics/btw753\n";
522    print STDERR "${prefix}\n";
523
524    print STDERR "${prefix}Overlaps are generated using:\n";
525
526    if ((getGlobal("corOverlapper") eq "mhap") ||
527        (getGlobal("obtOverlapper") eq "mhap") ||
528        (getGlobal("utgOverlapper") eq "mhap")) {
529       print STDERR "${prefix}  Berlin K, et al.\n";
530       print STDERR "${prefix}  Assembling large genomes with single-molecule sequencing and locality-sensitive hashing.\n";
531       print STDERR "${prefix}  Nat Biotechnol. 2015 Jun;33(6):623-30.\n";
532       print STDERR "${prefix}  http://doi.org/10.1038/nbt.3238\n";
533       print STDERR "${prefix}\n";
534    }
535
536    if ((getGlobal("corOverlapper") eq "ovl") ||
537        (getGlobal("obtOverlapper") eq "ovl") ||
538        (getGlobal("utgOverlapper") eq "ovl")) {
539       print STDERR "${prefix}  Myers EW, et al.\n";
540       print STDERR "${prefix}  A Whole-Genome Assembly of Drosophila.\n";
541       print STDERR "${prefix}  Science. 2000 Mar 24;287(5461):2196-204.\n";
542       print STDERR "${prefix}  http://doi.org/10.1126/science.287.5461.2196\n";
543       print STDERR "${prefix}\n";
544    }
545
546    if ((getGlobal("corOverlapper") eq "minimap") ||
547        (getGlobal("obtOverlapper") eq "minimap") ||
548        (getGlobal("utgOverlapper") eq "minimap")) {
549       print STDERR "${prefix}  Li H.\n";
550       print STDERR "${prefix}  Minimap2: pairwise alignment for nucleotide sequences.\n";
551       print STDERR "${prefix}  arXiv.org. 2017 Aug 4.\n";
552       print STDERR "${prefix}  https://arxiv.org/abs/1708.01492\n";
553       print STDERR "${prefix}\n";
554    }
555
556    if ($mode ne "hicanu") {
557        print STDERR "${prefix}Corrected read consensus sequences are generated using an algorithm derived from FALCON-sense:\n";
558        print STDERR "${prefix}  Chin CS, et al.\n";
559        print STDERR "${prefix}  Phased diploid genome assembly with single-molecule real-time sequencing.\n";
560        print STDERR "${prefix}  Nat Methods. 2016 Dec;13(12):1050-1054.\n";
561        print STDERR "${prefix}  http://doi.org/10.1038/nmeth.4035\n";
562        print STDERR "${prefix}\n";
563    }
564
565    print STDERR "${prefix}Contig consensus sequences are generated using an algorithm derived from pbdagcon:\n";
566    print STDERR "${prefix}  Chin CS, et al.\n";
567    print STDERR "${prefix}  Nonhybrid, finished microbial genome assemblies from long-read SMRT sequencing data.\n";
568    print STDERR "${prefix}  Nat Methods. 2013 Jun;10(6):563-9\n";
569    print STDERR "${prefix}  http://doi.org/10.1038/nmeth.2474\n";
570    print STDERR "${prefix}\n";
571}
572
573
574
575sub makeAbsolute ($) {
576    my $var = shift @_;
577    my $val = getGlobal($var);
578    my $abs = abs_path($val);
579
580    if (defined($val) && ($val ne $abs)) {
581        setGlobal($var, $abs);
582        $val =~ s/\\\"/\"/g;
583        $val =~ s/\"/\\\"/g;
584        $val =~ s/\\\$/\$/g;
585        $val =~ s/\$/\\\$/g;
586
587        addCommandLineOption("'$var=$val'");
588    }
589}
590
591
592
593sub fixCase ($@) {
594    my $var = shift @_;
595    my $VAL = getGlobal($var);
596    my $val = $VAL;
597    my $upp = shift @_;
598
599    if (defined($upp)) {
600        $val =~ tr/a-z/A-Z/;
601    } else {
602        $val =~ tr/A-Z/a-z/;
603    }
604
605    setGlobal($var, $val)   if ($VAL ne $val)
606}
607
608
609
610sub setParametersFromFile ($) {
611    my $specFile  = shift @_;
612
613    #  Client should be ensuring that the file exists before calling this function.
614    die "specFile '$specFile' not found.\n"  if (! -e "$specFile");
615
616    $specLog .= "\n";
617    $specLog .= "###\n";
618    $specLog .= "###  Reading options from '$specFile'\n";
619    $specLog .= "###\n";
620    $specLog .= "\n";
621
622    #  We lost the use of caExit() here (moved to Execution.pm) and so can't call it.
623    #  Just die.
624
625    open(F, "< $specFile") or die("can't open '$specFile' for reading: $!\n");
626
627    while (<F>) {
628        $specLog .= $_;
629
630        s/^\s+//;
631        s/\s+$//;
632
633        next if (m/^#/);
634        next if (length($_) eq 0);
635
636        #  First, figure out the two words.
637        #
638        #  Word two won't match a #, but will gobble up spaces at the end.
639        #  Then, we can match a #, and any amount of comment, minimally.  If
640        #  word two is made non-greedy, it will shrink to nothing, as the
641        #  last bit will gobble up everything, since we're allowed to match
642        #  zero #'s in between.
643
644        my $one;
645        my $two;
646
647        if (m/^(\w*)\s*=\s*([^#]*)\s*#*.*?$/) {
648            $one = $1;
649            $two = $2;
650        } else {
651            addCommandLineError("ERROR:  File not found or unknown specFile option line '$_' in file '$specFile'.\n");
652        }
653
654        #  Now, clean up the second word to handle quotes.
655
656        $two =~ s/^\s+//;   #  There can be spaces from the greedy match.
657        $two =~ s/\s+$//;
658
659        $two = $1   if ($two =~ m/^'(.+)'$/);    #  Remove single quotes   |   But don't allowed mixed quotes; users
660        $two = $1   if ($two =~ m/^"(.+)"$/);    #  Remove double quotes   |   should certainly know better
661
662        #  And do something.
663
664        $two =~ s/^\s+//;  #  Remove spaces again.  They'll just confuse our option processing.
665        $two =~ s/\s+$//;
666
667        setGlobal($one, $two);
668    }
669    close(F);
670}
671
672
673
674sub setParametersFromCommandLine(@) {
675    my @specOpts = @_;
676
677    if (scalar(@specOpts) > 0) {
678        $specLog .= "\n";
679        $specLog .= "###\n";
680        $specLog .= "###  Reading options from the command line.\n";
681        $specLog .= "###\n";
682        $specLog .= "\n";
683    }
684
685    foreach my $s (@specOpts) {
686        $specLog .= "$s\n";
687
688        if ($s =~ m/\s*(\w*)\s*=(.*)/) {
689            my ($var, $val) = ($1, $2);
690            $var =~ s/^\s+//; $var =~ s/\s+$//;
691            $val =~ s/^\s+//; $val =~ s/\s+$//;
692            setGlobal($var, $val);
693        } else {
694            addCommandLineError("ERROR:  Misformed command line option '$s'.\n");
695        }
696    }
697}
698
699
700
701#  Helper function to add a variable/value pair to the list of global options.
702#  Both getGlobal() and setGlobal() will complain if an unknown variable is accessed.
703#
704sub setDefault ($$$) {
705    my $VAR         = shift @_;
706    my $var         = $VAR;
707    my $value       = shift @_;
708    my $description = shift @_;
709
710    $var =~ tr/A-Z/a-z/;
711
712    $global{$var} = $value;         #  Internal lookups always use all lowercase.
713    $synops{$var} = $description;   #  But these operate on the case-sensitive input string?
714    $synnam{$var} = $VAR;           #  Remember the stylized version ($VAR) of the option $var.
715}
716
717
718#  Helper to add variable/value pairs for all the options that affect
719#  exectuion.
720#
721sub setExecDefaults ($$) {
722    my $tag         = shift @_;
723    my $name        = shift @_;
724
725    setDefault("useGrid${tag}",     1,     "If 'true', run module $name under grid control; if 'false' run locally.");
726    setDefault("gridOptions${tag}", undef, "Grid engine options applied to $name jobs");
727    setDefault("${tag}Memory",      undef, "Amount of memory, in gigabytes, to use for $name jobs");
728    setDefault("${tag}Threads",     undef, "Number of threads to use for $name jobs");
729    setDefault("${tag}StageSpace",  undef, "Amount of local disk space needed to stage data for $name jobs");
730    setDefault("${tag}Concurrency", undef, "If grid not enabled, number of $name jobs to run at the same time; default is n_proc / n_threads");
731}
732
733
734#  Helper to add variable/value pairs for all the options that affect the
735#  overlap computation.
736#
737sub setOverlapDefaults ($$$) {
738    my $tag     = shift @_;  #  If 'cor', some parameters are loosened for raw pacbio reads
739    my $name    = shift @_;
740    my $default = shift @_;  #  Sets ${tag}Overlapper
741
742    #  Which overlapper to use.
743
744    setDefault("${tag}Overlapper",          $default,                  "Which overlap algorithm to use for $name");
745    setDefault("${tag}ReAlign",             0,                         "Refine overlaps by computing the actual alignment: 'true' or 'false'.  Not useful for overlapper=ovl.  Uses ${tag}OvlErrorRate");
746
747    #  OverlapInCore parameters.
748
749    setDefault("${tag}OvlHashBlockLength",  undef,                     "Amount of sequence (bp) to load into the overlap hash table");
750    setDefault("${tag}OvlRefBlockLength",   undef,                     "Amount of sequence (bp) to search against the hash table per batch");
751    setDefault("${tag}OvlHashBits",         undef,                     "Width of the kmer hash.  Width 22=1gb, 23=2gb, 24=4gb, 25=8gb.  Plus 10b per ${tag}OvlHashBlockLength");
752    setDefault("${tag}OvlHashLoad",         0.80,                      "Maximum hash table load.  If set too high, table lookups are inefficent; if too low, search overhead dominates run time; default 0.75");
753    setDefault("${tag}OvlMerSize",          ($tag eq "cor") ? 19 : 22, "K-mer size for seeds in overlaps");
754    setDefault("${tag}OvlMerThreshold",     undef,                     "K-mer frequency threshold; mers more frequent than this count are ignored");
755    setDefault("${tag}OvlMerDistinct",      undef,                     "K-mer frequency threshold; the least frequent fraction of distinct mers can seed overlaps");
756    setDefault("${tag}OvlFrequentMers",     undef,                     "Do not seed overlaps with these kmers");
757    setDefault("${tag}OvlFilter",           undef,                     "Filter overlaps based on expected kmers vs observed kmers");
758
759    #  Mhap parameters.  FilterThreshold MUST be a string, otherwise it gets printed in scientific notation (5e-06) which java doesn't understand.
760
761    setDefault("${tag}MhapVersion",         "2.1.3",                   "Version of the MHAP jar file to use");
762    setDefault("${tag}MhapFilterThreshold", "0.0000001",               "Value between 0 and 1. kmers which comprise more than this percentage of the input are downweighted");
763    setDefault("${tag}MhapFilterUnique",    undef,                     "Expert option: True or false, supress the low-frequency k-mer distribution based on them being likely noise and not true overlaps. Threshold auto-computed based on error rate and coverage.");
764    setDefault("${tag}MhapNoTf",            undef,                     "Expert option: True or false, do not use tf weighting, only idf of tf-idf.");
765    setDefault("${tag}MhapOptions",         undef,                     "Expert option: free-form parameters to pass to MHAP.");
766    setDefault("${tag}MhapPipe",            1,                         "Report results to a pipe instead of *large* files.");
767    setDefault("${tag}MhapBlockSize",       3000,                      "Number of reads per GB of memory allowed (mhapMemory)");
768    setDefault("${tag}MhapMerSize",         ($tag eq "cor") ? 16 : 16, "K-mer size for seeds in mhap");
769    setDefault("${tag}MhapOrderedMerSize",  ($tag eq "cor") ? 12 : 18, "K-mer size for second-stage filter in mhap");
770    setDefault("${tag}MhapSensitivity",     undef,                     "Coarse sensitivity level: 'low', 'normal' or 'high'.  Set automatically based on coverage; 'high' <= 30x < 'normal' < 60x <= 'low'");
771
772    #  MiniMap parameters.
773
774    setDefault("${tag}MMapBlockSize",       6000,                      "Number of reads per 1GB; memory * blockSize = the size of  block loaded into memory per job");
775    setDefault("${tag}MMapMerSize",         ($tag eq "cor") ? 15 : 21, "K-mer size for seeds in minmap");
776}
777
778
779
780sub setDefaults () {
781
782    #####  Internal stuff - no synopsis pre pretty-name for these.
783
784    $global{"errors"}                      = undef;   #  Command line errors.
785    $global{"version"}                     = undef;   #  Set in setVersion() once we know where binaries are.
786    $global{"availablehosts"}              = undef;   #  Internal list of cpus-memory-nodes describing the grid.
787
788    $global{"localmemory"}                 = 0;       #  Amount of memory on the local host, set in Grid_Local.pm
789    $global{"localthreads"}                = 0;
790
791    $global{"canuiteration"}               = 0;
792    $global{"canuiterationmax"}            = 2;
793
794    $global{"onexitdir"}                   = undef;   #  Copy of $wrk, for caExit() and caFailure() ONLY.
795    $global{"onexitnam"}                   = undef;   #  Copy of $asm, for caExit() and caFailure() ONLY.
796
797    #####  Meta options - no $global for these, only synopsis and pretty-format name,
798
799    $synops{"rawerrorrate"}                = "Expected fraction error in an alignment of two uncorrected reads";
800    $synnam{"rawerrorrate"}                = "rawErrorRate";
801
802    $synops{"correctederrorrate"}          = "Expected fraction error in an alignment of two corrected reads";
803    $synnam{"correctederrorrate"}          = "correctedErrorRate";
804
805    #####  General Configuration Options (aka miscellany)
806
807    my $java = (exists $ENV{"JAVA_HOME"} && -e "$ENV{'JAVA_HOME'}/bin/java") ? "$ENV{'JAVA_HOME'}/bin/java" : "java";
808
809    setDefault("showNext",            undef,      "Don't run any commands, just report what would run");
810    setDefault("shell",               "/bin/sh",  "Command interpreter to use; sh-compatible (e.g., bash), NOT C-shell (csh or tcsh); default '/bin/sh'");
811
812    setDefault("minimap",             "minimap2", "Path to minimap2; default 'minimap2'");
813
814    setDefault("java",                $java,      "Java interpreter to use; at least version 1.8; default 'java'");
815    setDefault("javaUse64Bit",        undef,      "Java interpreter supports the -d64 or -d32 flags; default auto");
816
817    setDefault("gnuplot",             "gnuplot",  "Path to the gnuplot executable");
818    setDefault("gnuplotImageFormat",  undef,      "Image format that gnuplot will generate.  Default: based on gnuplot, 'png', 'svg' or 'gif'");
819
820    setDefault("stageDirectory",      undef,      "If set, copy heavily used data to this node-local location");
821    setDefault("preExec",             undef,      "A command line to run at the start of Canu execution scripts");
822
823    #####  Cleanup and Termination options
824
825    setDefault("saveOverlaps",        0,         "Do not remove the overlap stores.  Default: false = remove overlap stores when they're no longer needed");
826    setDefault("purgeOverlaps",       "normal",  "When to delete intermediate overlap files: never, normal (default), aggressive, dangerous");
827    setDefault("saveReadCorrections", 0,         "Save intermediate read correction files, almost never a good idea");
828    setDefault("saveReadHaplotypes",  0,         "Save intermediate read haplotype files, almost never a good idea");
829    setDefault("saveMerCounts",       0,         "Save full mer counting results, sometimes useful");
830    setDefault("saveReads",           1,         "Save intermediate corrected and trimmed reads to asm.correctedReads.fasta.gz and asm.trimmedReads.fasta.gz");
831    setDefault("onSuccess",           undef,     "Full path to command to run on successful completion");
832    setDefault("onFailure",           undef,     "Full path to command to run on failure");
833
834    #####  Error Rates
835
836    setDefault("corOvlErrorRate",     undef,     "Overlaps above this error rate are not computed");
837    setDefault("obtOvlErrorRate",     undef,     "Overlaps at or below this error rate are used to trim reads");
838    setDefault("utgOvlErrorRate",     undef,     "Overlaps at or below this error rate are used to trim reads");
839    setDefault("utgErrorRate",        undef,     "Overlaps at or below this error rate are used to construct contigs");
840    setDefault("utgGraphDeviation",   undef,     "Overlaps this much above median will not be used for initial graph construction");
841    setDefault("utgBubbleDeviation",  1,         "Overlaps this much above mean of contig will be used to identify bubbles");
842    setDefault("utgRepeatDeviation",  undef,     "Overlaps this much above mean unitig error rate will not be used for repeat splitting");
843    setDefault("utgRepeatConfusedBP", 2500,      "Repeats where the next best edge is at least this many bp shorter will not be split");
844    setDefault("utgRepeatConfusedPC", 15,        "Repeats where the next best edge is at least this many percent shorter will not be split");
845    setDefault("utgChimeraType",      "deadend", "When to filter reads for contig construction: none, chimera (missing middle), uncovered (missing middle or ends), deadend (missing middle or end or no neighbor) (default)");
846    setDefault("corErrorRate",        undef,     "Only use raw alignments below this error rate to construct corrected reads");
847    setDefault("cnsErrorRate",        undef,     "Consensus expects alignments at about this error rate");
848
849    #####  Minimums and maximums
850
851    setDefault("minReadLength",        1000,      "Reads shorter than this length are not loaded into the assembler; default 1000");
852    setDefault("minOverlapLength",     500,       "Overlaps shorter than this length are not computed; default 500");
853
854    setDefault("readSamplingBias",     0.0,       "Score reads as 'random * length^bias', keep the highest scoring reads");
855
856    setDefault("minMemory",            0,         "Minimum amount of memory needed to compute the assembly (do not set unless prompted!)");
857    setDefault("maxMemory",            undef,     "Maximum memory to use by any component of the assembler");
858
859    setDefault("minThreads",           0,         "Minimum number of compute threads suggested to compute the assembly");
860    setDefault("maxThreads",           undef,     "Maximum number of compute threads to use by any component of the assembler");
861
862    setDefault("minInputCoverage",     10,        "Stop if input coverage is too low; default 10");
863    setDefault("maxInputCoverage",     undef,     "If input coverage is high, downsample to something reasonable; default 200");
864
865    #####  Stopping conditions
866
867    setDefault("stopOnLowCoverage",    10,        "Stop if raw, corrected or trimmed read coverage is low");
868    setDefault("stopAfter",            undef,     "Stop after a specific algorithm step is completed");
869
870    #####  Grid Engine configuration, internal parameters.  These are filled out in canu.pl, right after this function returns.
871
872    setDefault("gridEngine",                          undef, "Grid engine configuration, not documented");
873    setDefault("gridEngineSubmitCommand",             undef, "Grid engine configuration, not documented");
874    setDefault("gridEngineNameOption",                undef, "Grid engine configuration, not documented");
875    setDefault("gridEngineArrayOption",               undef, "Grid engine configuration, not documented");
876    setDefault("gridEngineArrayName",                 undef, "Grid engine configuration, not documented");
877    setDefault("gridEngineArrayMaxJobs",              undef, "Grid engine configuration, not documented");
878    setDefault("gridEngineOutputOption",              undef, "Grid engine configuration, not documented");
879    setDefault("gridEngineThreadsOption",             undef, "Grid engine configuration, not documented");
880    setDefault("gridEngineMemoryOption",              undef, "Grid engine configuration, not documented");
881    setDefault("gridEngineResourceOption",            undef, "Grid engine configuration, not documented");
882    setDefault("gridEngineMemoryUnits",               undef, "Grid engine configuration, not documented");
883    setDefault("gridEngineMemoryPerJob",              undef, "Grid engine configuration, not documented");
884    setDefault("gridEngineNameToJobIDCommand",        undef, "Grid engine configuration, not documented");
885    setDefault("gridEngineNameToJobIDCommandNoArray", undef, "Grid engine configuration, not documented");
886    setDefault("gridEngineStageOption",               undef, "Grid engine configuration, not documented");
887    setDefault("gridEngineTaskID",                    undef, "Grid engine configuration, not documented");
888    setDefault("gridEngineArraySubmitID",             undef, "Grid engine configuration, not documented");
889    setDefault("gridEngineJobID",                     undef, "Grid engine configuration, not documented");
890
891    #####  Slurm-specific parameters for controlling the number of
892    #####  cores / tasks dispatched per step or globally (WIP)
893
894    setDefault( 'slurmCormhapCoreLimit', undef, 'Maximum number of cores allocated for MHAP pre-computing and alignment within the correction phase' );
895    setDefault( 'slurmOvbCoreLimit', undef, 'Maximum number of single-core tasks dispatched for the ovlStore bucketizing step within the trimming phase' );
896    setDefault( 'slurmOvsCoreLimit', undef, 'Maximum number of single-core tasks dispatched for the ovlStore sorting step within the trimming phase' );
897    setDefault( 'slurmRedCoreLimit', undef, 'Maximum number of cores allocated for read error detection within the unitigging phase' );
898    setDefault( 'slurmArrayTaskLimit', undef, 'Maximum number of tasks permitted for each step throughout assembly' );
899    setDefault( 'slurmArrayCoreLimit', undef, 'Maximum number of cores allocated for each step throughout assembly' );
900
901    #####  Grid Engine Pipeline
902
903    setDefault("useGrid", 1, "If 'true', enable grid-based execution; if 'false', run all jobs on the local machine; if 'remote', create jobs for grid execution but do not submit; default 'true'");
904
905    #####  Grid Engine configuration, for each step of the pipeline
906
907    setDefault("gridOptions",           undef,  "Grid engine options applied to all jobs");
908    setDefault("gridOptionsExecutive",  undef,  "Grid engine options applied to the canu executive script");
909    setDefault("gridOptionsJobName",    undef,  "Grid jobs job-name suffix");
910
911    #####  Grid Engine configuration and parameters, for each step of the pipeline (memory, threads)
912
913    setExecDefaults("meryl",     "mer counting");
914    setExecDefaults("hap",       "haplotype assignment");
915    setExecDefaults("cor",       "read correction");
916
917    setExecDefaults("corovl",    "overlaps for correction");
918    setExecDefaults("obtovl",    "overlaps for trimming");
919    setExecDefaults("utgovl",    "overlaps for unitig construction");
920
921    setExecDefaults("cormhap",   "mhap overlaps for correction");
922    setExecDefaults("obtmhap",   "mhap overlaps for trimming");
923    setExecDefaults("utgmhap",   "mhap overlaps for unitig construction");
924
925    setExecDefaults("cormmap",   "mmap overlaps for correction");
926    setExecDefaults("obtmmap",   "mmap overlaps for trimming");
927    setExecDefaults("utgmmap",   "mmap overlaps for unitig construction");
928
929    setExecDefaults("ovb",       "overlap store bucketizing");
930    setExecDefaults("ovs",       "overlap store sorting");
931
932    setExecDefaults("red",       "read error detection");
933    setExecDefaults("oea",       "overlap error adjustment");
934
935    setExecDefaults("bat",       "unitig construction");
936    setExecDefaults("cns",       "unitig consensus");
937
938    #####  Object Storage
939
940    setDefault("objectStore",          undef,  "Type of object storage used; not ready for production yet");
941    setDefault("objectStoreClient",    undef,  "Path to the command line client used to access the object storage");
942    setDefault("objectStoreClientUA",  undef,  "Path to the command line client used to upload files to object storage");
943    setDefault("objectStoreClientDA",  undef,  "Path to the command line client used to download files from object storage");
944    setDefault("objectStoreNameSpace", undef,  "Object store parameters; specific to the type of objectStore used");
945    setDefault("objectStoreProject",   undef,  "Object store project; specific to the type of objectStore used");
946
947    #####  Overlapper
948
949    setOverlapDefaults("cor", "correction",             "mhap");  #  Overlaps computed for correction
950    setOverlapDefaults("obt", "overlap based trimming", "ovl");   #  Overlaps computed for trimming
951    setOverlapDefaults("utg", "unitig construction",    "ovl");   #  Overlaps computed for unitigging
952
953    setGlobal("corReAlign", "false");   #  To guard against someone only setting
954    setGlobal("obtReAlign", "false");   #  utgOverlapper=mhap, we default to realign
955    setGlobal("utgReAlign", "true");    #  enabled.
956
957    ##### Overlap Store
958
959    #  ovbMemory and ovsMemory are set above.
960
961    #####  Executive
962
963    setDefault("executiveMemory",   4,   "Amount of memory, in GB, to reserve for the Canu exective process");
964    setDefault("executiveThreads",  1,   "Number of threads to reserve for the Canu exective process");
965
966    #####  Mers
967
968    setDefault("merylMemory",      undef,  "Amount of memory, in gigabytes, to use for mer counting");
969    setDefault("merylThreads",     undef,  "Number of threads to use for mer counting");
970    setDefault("merylConcurrency", undef,  "Unused, there is only one process");
971
972    #####  Haplotyping
973
974    setDefault("hapUnknownFraction", 0.05,   "Fraction of allowed unknown bases before they are included in the assembly, between 0-1; default 0.05");
975    setDefault("hapMemory",          undef,  "Amount of memory, in gigabytes, to use for haplotype assignment");
976    setDefault("hapThreads",         undef,  "Number of threads to use for haplotype assignment");
977    setDefault("hapConcurrency",     undef,  "Unused, there is only one process");
978
979    #####  Overlap Based Trimming
980
981    setDefault("obtErrorRate",       undef, "Stringency of overlaps to use for trimming");
982    setDefault("trimReadsOverlap",   500,   "Minimum overlap between evidence to make contiguous trim; default '500'");
983    setDefault("trimReadsCoverage",  2,     "Minimum depth of evidence to retain bases; default '2");
984
985    #####  Fragment/Overlap Error Correction
986
987    setDefault("enableOEA",       1,     "Do overlap error adjustment - comprises two steps: read error detection (RED) and overlap error adjustment (OEA); default 'true'");
988
989    setDefault("oeaHaploConfirm", undef, "This many or more reads will confirm a true haplotype difference; default 5");
990    setDefault("oeaMaskTrivial",  undef, "Mask trivial DNA in Overlap Error Adjustment; default off; on for HiFi reads");
991    setDefault("oeaErrorRate",    undef, "Only use overlaps with at most this much fraction error to find errors in reads; default utgOvlErrorRate, 0.003 for HiFi reads");
992
993    setDefault("redBatchSize",    undef, "Number of reads per fragment error detection batch");
994    setDefault("redBatchLength",  undef, "Number of bases per fragment error detection batch");
995
996    setDefault("oeaBatchSize",    undef, "Number of reads per overlap error correction batch");
997    setDefault("oeaBatchLength",  undef, "Number of bases per overlap error correction batch");
998
999    #####  Unitigger & BOG & bogart Options
1000
1001    setDefault("unitigger",      "bogart", "Which unitig algorithm to use; only 'bogart' supported; default 'bogart'");
1002    setDefault("genomeSize",     undef, "An estimate of the size of the genome");
1003    setDefault("batOptions",     undef, "Advanced options to bogart");
1004    setDefault("batMemory",      undef, "Approximate maximum memory usage, in gigabytes, default is the maxMemory limit");
1005    setDefault("batThreads",     undef, "Number of threads to use; default is the maxThreads limit");
1006    setDefault("batConcurrency", undef, "Unused, only one process supported");
1007
1008    setDefault("contigFilter",   "2 0 1.0 0.5 3",   "Parameters to filter out 'unassembled' unitigs.  Five values: minReads minLength singleReadSpan lowCovFraction lowCovDepth");
1009
1010    #####  Consensus Options
1011
1012    setDefault("cnsMaxCoverage",  40,          "Limit unitig consensus to at most this coverage; default '40' = unlimited");
1013    setDefault("cnsConsensus",    "pbdagcon",  "Which consensus algorithm to use; 'pbdagcon' (fast, reliable); 'utgcns' (multialignment output); 'quick' (single read mosaic); default 'pbdagcon'");
1014    setDefault("cnsPartitions",   0,           "Attempt to create this many consensus jobs; default '0' = based on the largest tig");
1015
1016    #####  Correction Options
1017
1018    setDefault("corPartitions",                undef,        "Partition read correction into N jobs");
1019    setDefault("corPartitionMin",              undef,        "Don't make a read correction partition with fewer than N reads");
1020    setDefault("corMinEvidenceLength",         undef,        "Limit read correction to only overlaps longer than this; default: unlimited");
1021    setDefault("corMaxEvidenceErate",          undef,        "Limit read correction to only overlaps at or below this fraction error; default: unlimited");
1022    setDefault("corMaxEvidenceCoverageGlobal", "1.0x",       "Limit reads used for correction to supporting at most this coverage; default: '1.0x' = 1.0 * estimated coverage");
1023    setDefault("corMaxEvidenceCoverageLocal",  "2.0x",       "Limit reads being corrected to at most this much evidence coverage; default: '2.0x' = 2.0 * estimated coverage");
1024    setDefault("corOutCoverage",               40,           "Only correct the longest reads up to this coverage; default 40");
1025    setDefault("corMinCoverage",               undef,        "Minimum number of bases supporting each corrected base, if less than this sequences are split; default based on input read coverage: 0 <= 30x < 4 < 60x <= 4");
1026    setDefault("corFilter",                    "expensive",  "Method to filter short reads from correction; 'quick' or 'expensive'; default 'expensive'");
1027    setDefault("corConsensus",                 "falcon",     "Which consensus algorithm to use; only 'falcon' is supported; default 'falcon'");
1028
1029    setDefault("homoPolyCompress",             undef,        "Compute everything but consensus sequences using homopolymer compressed reads");
1030
1031    #####  Sanity check that all keys are lowercase.
1032
1033    foreach my $key (keys %global) {
1034        my $keylc = $key;
1035        $keylc =~ tr/A-Z/a-z/;
1036
1037        if ($key ne $keylc) {
1038            die "key '$key' is not all lowercase.\n";
1039        }
1040    }
1041}
1042
1043
1044#  Get the version information.
1045
1046sub setVersion ($) {
1047    my $bin    = shift @_;
1048    my $version;
1049
1050    open(F, "$bin/sqStoreCreate --version 2>&1 |");
1051    while (<F>) {
1052        $version = $_;  chomp $version;
1053    }
1054    close(F);
1055
1056    setGlobal("version", $version);
1057}
1058
1059
1060sub checkJava () {
1061    return  if ((getGlobal("corOverlapper") ne "mhap") &&
1062                (getGlobal("obtOverlapper") ne "mhap") &&
1063                (getGlobal("utgOverlapper") ne "mhap"));
1064
1065    my $java         = getGlobal("java");
1066    my $javaUse64Bit = getGlobal("javaUse64Bit");
1067    my $versionStr   = "unknown";
1068    my $version      = 0;
1069    my @javaVersionStrings;
1070
1071    if ($java =~ m/^\./) {
1072        addCommandLineError("ERROR:  path to java '$java' must not be a relative path.\n");
1073    }
1074
1075    #  We've seen errors running just this tiny java if too many copies are ran at the same time.
1076    #  So, run it twice, if needed, with a little random delay between.
1077
1078    for (my $iter=0; $iter<2; $iter++) {
1079        open(F, "$java -Xmx1g -showversion 2>&1 |");
1080        @javaVersionStrings = <F>;
1081        chomp @javaVersionStrings;
1082        close(F);
1083
1084        foreach (@javaVersionStrings) {
1085            #  First word is either "java" or "openjdk" or ...
1086            if (m/^.*\s+version\s+\"(\d+\.*\d*)(.*)\".*$/) {
1087                $versionStr = "$1$2";
1088                $version    =  $1;
1089            }
1090            if (m/-d64/) {
1091               setGlobal("javaUse64Bit", 1);
1092            }
1093        }
1094        close(F);
1095
1096        last  if ($version >= 1.8);
1097
1098        print STDERR "-- Failed Java version check.\n";
1099        print STDERR "--   '$javaVersionStrings[0]'\n"  if (length($javaVersionStrings[0]) > 0);
1100        print STDERR "--   '$javaVersionStrings[1]'\n"  if (length($javaVersionStrings[1]) > 0);
1101        print STDERR "--   '$javaVersionStrings[2]'\n"  if (length($javaVersionStrings[2]) > 0);
1102        print STDERR "--   '$javaVersionStrings[3]'\n"  if (length($javaVersionStrings[3]) > 0);
1103        print STDERR "--\n";
1104        print STDERR "-- Trying again.\n";
1105        print STDERR "--\n";
1106
1107        sleep(int(rand(3)+1));
1108    }
1109    setGlobal("javaUse64Bit",  0) if (!defined(getGlobal("javaUse64Bit")));
1110
1111    if ($version < 1.8) {
1112        addCommandLineError("ERROR:  mhap overlapper requires java version at least 1.8.0; you have $versionStr (from '$java').\n");
1113        addCommandLineError("ERROR:  '$java -Xmx1g -showversion' reports:\n");
1114
1115        for (my $ii=0; (($ii<20) && ($ii < scalar(@javaVersionStrings))); $ii++) {
1116            addCommandLineError("ERROR:    '$javaVersionStrings[$ii]'\n");
1117        }
1118
1119    } else {
1120        print STDERR "-- Detected Java(TM) Runtime Environment '$versionStr' (from '$java')";
1121        print STDERR (defined(getGlobal("javaUse64Bit")) && getGlobal("javaUse64Bit") == 1) ? " with " : " without ";
1122        print STDERR "-d64 support.\n";
1123    }
1124}
1125
1126
1127
1128sub checkMinimap ($) {
1129    my $minimap = getGlobal("minimap");
1130    my $version = undef;
1131
1132    return  if ((getGlobal("corOverlapper") ne "minimap") &&
1133                (getGlobal("obtOverlapper") ne "minimap") &&
1134                (getGlobal("utgOverlapper") ne "minimap"));
1135
1136    if ($minimap =~ m/^\./) {
1137        addCommandLineError("ERROR:  path to minimap2 '$minimap' must not be a relative path.\n");
1138        goto cleanupMinimap;
1139    }
1140
1141    system("cd /tmp && $minimap --version > /tmp/minimap2-$$.err 2>&1");
1142
1143    open(F, "< /tmp/minimap2-$$.err");
1144    while (<F>) {
1145        $version = $1  if ($_ =~ m/^(2.*$)/);
1146    }
1147    close(F);
1148
1149    if (!defined($version)) {
1150        addCommandLineError("ERROR:  failed to run minimap2 using command '$minimap'.\n");
1151        goto cleanupMinimap;
1152    }
1153
1154    print STDERR "-- Detected minimap2 version '$version' (from '$minimap').\n";
1155
1156 cleanupMinimap:
1157    unlink "/tmp/minimap2-$$.err";
1158}
1159
1160
1161
1162sub checkGnuplot () {
1163    my $gnuplot = getGlobal("gnuplot");
1164    my $format  = getGlobal("gnuplotImageFormat");
1165    my $version = undef;
1166
1167    if (($gnuplot eq undef) ||
1168        ($gnuplot eq "")) {
1169        print STDERR "-- No path to gnuplot executable.  Plots disabled.\n";
1170        goto cleanupGnuplot;
1171    }
1172
1173    if ($gnuplot =~ m/^\./) {
1174        addCommandLineError("ERROR:  path to gnuplot '$gnuplot' must not be a relative path.\n");
1175        goto cleanupGnuplot;
1176    }
1177
1178    #  Explicitly set pager to avoid having output corrupted by "Press enter..."
1179
1180    $ENV{"PAGER"} = "cat";
1181
1182    #  Check for existence of gnuplot.
1183
1184    open(F, "> /tmp/gnuplot-$$-test.gp");
1185    print F "show version long\n";
1186    print F "set terminal\n";
1187    close(F);
1188
1189    system("cd /tmp && $gnuplot < /dev/null /tmp/gnuplot-$$-test.gp > /tmp/gnuplot-$$-test.err 2>&1");
1190
1191    open(F, "< /tmp/gnuplot-$$-test.err");
1192    while (<F>) {
1193        $version = $1  if ($_ =~ m/^\s*[vV]ersion\s+(.*)/);
1194        $version = $1  if ($_ =~ m/^\s*[vV]ersion\s+(.*)\s+last/);
1195    }
1196    close(F);
1197
1198    if (!defined($version)) {
1199        print STDERR "--\n";
1200        print STDERR "-- WARNING:\n";
1201        print STDERR "-- WARNING:  Failed to run gnuplot using command '$gnuplot'.\n";
1202        print STDERR "-- WARNING:  Plots will be disabled.\n";
1203        print STDERR "-- WARNING:\n";
1204        print STDERR "--\n";
1205
1206        goto cleanupGnuplot;
1207    }
1208
1209    #  Check for existence of a decent output format.  Need to redirect in /dev/null to make gnuplot
1210    #  not use it's builtin pager.
1211
1212    if (!defined($format)) {
1213        my $havePNG = 0;
1214        my $haveSVG = 0;
1215        my $haveGIF = 0;
1216
1217        open(F, "< /tmp/gnuplot-$$-test.err");
1218        while (<F>) {
1219            s/^\s+//;
1220            s/\s+$//;
1221
1222            my @t = split '\s+', $_;
1223
1224            $havePNG = 1  if ($t[0] eq 'png');
1225            $haveSVG = 1  if ($t[0] eq 'svg');
1226            $haveGIF = 1  if ($t[0] eq 'gif');
1227        }
1228        close(F);
1229
1230        $format = "gif"   if ($haveGIF);
1231        $format = "svg"   if ($haveSVG);
1232        $format = "png"   if ($havePNG);
1233
1234        setGlobal("gnuplotImageFormat", $format);
1235    }
1236
1237    if (!defined($format)) {
1238        print STDERR "--\n";
1239        print STDERR "-- WARNING:\n";
1240        print STDERR "-- WARNING:  Failed to detect a suitable output format for gnuplot.  Looked for png, svg\n";
1241        print STDERR "-- WARNING:  and gif; found none of them.  Specify a format with gnuplotImageFormat=<type>,\n";
1242        print STDERR "-- WARNING:  or set 'gnuplot=undef' to disable gnuplot entirely.  Plots will be disabled.\n";
1243        print STDERR "-- WARNING:\n";
1244        print STDERR "--\n";
1245
1246        goto cleanupGnuplot;
1247    }
1248
1249    #  Test if we can actually make images.
1250
1251    open(F, "> /tmp/gnuplot-$$-test.gp");
1252    print F "set title 'gnuplot test'\n";
1253    print F "set xlabel 'X'\n";
1254    print F "set xlabel 'Y'\n";
1255    print F "\n";
1256    print F "set terminal $format size 1024,1024\n";
1257    print F "set output '/tmp/gnuplot-$$-test.1.$format'\n";
1258    print F "\n";
1259    print F "plot [-30:20] sin(x*20) * atan(x)\n\n";
1260    print F "\n";
1261    print F "set terminal $format size 256,256\n";
1262    print F "set output '/tmp/gnuplot-$$-test.2.$format'\n";
1263    print F "\n";
1264    print F "bogus line\n";
1265    close(F);
1266
1267    system("cd /tmp && $gnuplot < /dev/null /tmp/gnuplot-$$-test.gp > /tmp/gnuplot-$$-test.err 2>&1");
1268
1269    if ((! -e "/tmp/gnuplot-$$-test.1.$format") ||
1270        (! -e "/tmp/gnuplot-$$-test.2.$format")) {
1271
1272        print STDERR "--\n";
1273        print STDERR "-- WARNING:\n";
1274        print STDERR "-- WARNING:  gnuplot failed to generate images.  Specify a format with gnuplotImageFormat=<type>,\n";
1275        print STDERR "-- WARNING:  or set 'gnuplot=undef' to disable gnuplot entirely.  Plots will be disabled.\n";
1276        print STDERR "-- WARNING:\n";
1277        print STDERR "-- WARNING:  gnuplot reports:\n";
1278
1279        open(F, "< /tmp/gnuplot-$$-test.err");
1280        while (<F>) {
1281            chomp;
1282            print STDERR "-- WARNING:      $_\n";
1283        }
1284        close(F);
1285
1286        print STDERR "--\n";
1287
1288        goto cleanupGnuplot;
1289    }
1290
1291    #  Yay, gnuplot works!
1292
1293    print STDERR "-- Detected gnuplot version '$version' (from '$gnuplot') and image format '$format'.\n";
1294
1295 cleanupGnuplot:
1296    unlink "/tmp/gnuplot-$$-test.gp";
1297    unlink "/tmp/gnuplot-$$-test.err";
1298    unlink "/tmp/gnuplot-$$-test.1.$format";
1299    unlink "/tmp/gnuplot-$$-test.2.$format";
1300}
1301
1302
1303
1304#  Converts number with units to gigabytes.  If no units, gigabytes is assumed.
1305sub adjustMemoryValue ($) {
1306    my $val = shift @_;
1307
1308    return(undef)                     if (!defined($val));
1309
1310    return($1)                        if ($val =~ m/^(\d+\.{0,1}\d*)$/);
1311    return($1 / 1024 / 1024)          if ($val =~ m/^(\d+\.{0,1}\d*)[kK]$/);
1312    return($1 / 1024)                 if ($val =~ m/^(\d+\.{0,1}\d*)[mM]$/);
1313    return($1)                        if ($val =~ m/^(\d+\.{0,1}\d*)[gG]$/);
1314    return($1 * 1024)                 if ($val =~ m/^(\d+\.{0,1}\d*)[tT]$/);
1315    return($1 * 1024 * 1024)          if ($val =~ m/^(\d+\.{0,1}\d*)[pP]$/);
1316
1317    die "Invalid memory value '$val'\n";
1318}
1319
1320
1321#  Converts gigabytes to number with units.
1322sub displayMemoryValue ($) {
1323    my $val = shift @_;
1324
1325    return(($val * 1024 * 1024)        . "k")   if ($val < adjustMemoryValue("1m"));
1326    return(($val * 1024)               . "m")   if ($val < adjustMemoryValue("1g"));
1327    return(($val)                      . "g")   if ($val < adjustMemoryValue("1t"));
1328    return(($val / 1024)               . "t");
1329}
1330
1331
1332#  Converts number with units to bases.
1333sub adjustGenomeSize ($) {
1334    my $val = shift @_;
1335
1336    return(undef)               if (!defined($val));
1337
1338    return($1)                  if ($val =~ m/^(\d+\.{0,1}\d*)$/i);
1339    return($1 * 1000)           if ($val =~ m/^(\d+\.{0,1}\d*)[kK]$/i);
1340    return($1 * 1000000)        if ($val =~ m/^(\d+\.{0,1}\d*)[mM]$/i);
1341    return($1 * 1000000000)     if ($val =~ m/^(\d+\.{0,1}\d*)[gG]$/i);
1342    return($1 * 1000000000000)  if ($val =~ m/^(\d+\.{0,1}\d*)[tT]$/i);
1343
1344    die "Invalid genome size '$val'\n";
1345}
1346
1347
1348#  Converts bases to number with units.
1349sub displayGenomeSize ($) {
1350    my $val = shift @_;
1351
1352    return(($val))                        if ($val < adjustGenomeSize("1k"));
1353    return(($val / 1000)          . "k")  if ($val < adjustGenomeSize("1m"));
1354    return(($val / 1000000)       . "m")  if ($val < adjustGenomeSize("1g"));
1355    return(($val / 1000000000)    . "g")  if ($val < adjustGenomeSize("1t"));
1356    return(($val / 1000000000000) . "t");
1357}
1358
1359
1360
1361sub checkParameters () {
1362
1363    #
1364    #  Fiddle with filenames to make them absolute paths.
1365    #
1366
1367    makeAbsolute("corOvlFrequentMers");
1368    makeAbsolute("obtOvlFrequentMers");
1369    makeAbsolute("utgOvlFrequentMers");
1370
1371    #
1372    #  Adjust case on some of them
1373    #
1374
1375    fixCase("corOverlapper");
1376    fixCase("obtOverlapper");
1377    fixCase("utgOverlapper");
1378
1379    fixCase("corConsensus");
1380    fixCase("cnsConsensus");
1381
1382    fixCase("corFilter");
1383
1384    fixCase("unitigger");
1385    fixCase("stopAfter");
1386
1387    fixCase("gridEngine",  "upper");   #  These want to be uppercase, grrrr!
1388    fixCase("objectStore", "upper");
1389
1390    #
1391    #  Convert things with units to just values.  The other memory sizes are
1392    #  'adjusted' in Configure.sh.
1393    #
1394
1395    setGlobal("genomeSize", adjustGenomeSize(getGlobal("genomeSize")));
1396
1397    setGlobal("minMemory",  adjustMemoryValue(getGlobal("minMemory")));
1398    setGlobal("maxMemory",  adjustMemoryValue(getGlobal("maxMemory")));
1399
1400    setGlobal("executiveMemory", adjustMemoryValue(getGlobal("executiveMemory")));
1401
1402    if (getGlobal("executiveMemory") < getGlobal("minMemory")) {   #  Silently bump up execMemory to minMemory
1403        setGlobal("executiveMemory", getGlobal("minMemory"));      #  if needed.
1404    }
1405
1406    #
1407    #  Check for inconsistent parameters
1408    #
1409
1410    my $gs = getGlobal("genomeSize");
1411
1412    addCommandLineError("ERROR:  Required parameter 'genomeSize' not set.\n")           if (!defined($gs));
1413    addCommandLineError("ERROR:  Implausibly small genome size $gs.  Check units!\n")   if ($gs < 1000);
1414
1415    #
1416    #  If we're running as a job array, unset the ID of the job array.  This screws
1417    #  up our scheduling, as our jobs think they're running in a task array.
1418    #
1419    #  Silly SGE sets this to 'undefined' for normal jobs.
1420    #
1421
1422    if (exists($ENV{getGlobal("gridEngineTaskID")})) {
1423        my $ja = $ENV{getGlobal("gridEngineTaskID")};
1424
1425        if (($ja ne "undefined") &&
1426            ($ja ne "0")) {
1427            print STDERR "--\n";
1428            print STDERR "-- I appear to be task $ja in a job array, unsetting ", getGlobal("gridEngineTaskID"), ".\n";
1429        }
1430
1431        undef $ENV{getGlobal("gridEngineTaskID")};
1432    }
1433
1434    #  Undefined error rate are OK; we'll set them to defaults later.
1435    #  Non-numeric or negative (or too positive) rates are definitely bad.
1436
1437    foreach my $var ("corOvlErrorRate", "obtOvlErrorRate", "utgOvlErrorRate", "corErrorRate", "obtErrorRate", "utgErrorRate", "cnsErrorRate") {
1438        if    (!defined(getGlobal($var))) {
1439        }
1440        elsif (getGlobal($var) !~ m/^[.-0123456789]+$/) {
1441            addCommandLineError("ERROR:  Invalid '$var' specified (" . getGlobal("$var") . "); must be numeric\n");
1442        }
1443        elsif ((getGlobal($var) < 0.0) || (getGlobal($var) > 1.0)) {
1444            addCommandLineError("ERROR:  Invalid '$var' specified (" . getGlobal("$var") . "); must be at least 0.0 and no more than 1.0\n");
1445        }
1446    }
1447
1448    foreach my $var ("corOvlMerDistinct", "obtOvlMerDistinct", "utgOvlMerDistinct") {
1449        if    (!defined(getGlobal($var))) {
1450        }
1451        elsif (getGlobal($var) !~ m/^[.-0123456789]+$/) {
1452            addCommandLineError("ERROR:  Invalid '$var' specified (" . getGlobal("$var") . "); must be numeric\n");
1453        }
1454        elsif ((getGlobal($var) < 0.0) || (getGlobal($var) > 1.0)) {
1455            addCommandLineError("ERROR:  Invalid '$var' specified (" . getGlobal("$var") . "); must be at least 0.0 and no more than 1.0\n");
1456        }
1457    }
1458
1459    foreach my $var ("corOvlMerThreshold", "obtOvlMerThreshold", "utgOvlMerThreshold") {
1460        if    (!defined(getGlobal($var))) {
1461        }
1462        elsif (getGlobal($var) !~ m/^[0123456789]+$/) {
1463            addCommandLineError("ERROR:  Invalid '$var' specified (" . getGlobal("$var") . "); must be an integer\n");
1464        }
1465        elsif (getGlobal($var) < 2) {
1466            addCommandLineError("ERROR:  Invalid '$var' specified (" . getGlobal("$var") . "); must be at least 2\n");
1467        }
1468    }
1469
1470    if (getGlobal("minReadLength") < getGlobal("minOverlapLength")) {
1471        my $mr = getGlobal("minReadLength");
1472        my $mo = getGlobal("minOverlapLength");
1473
1474        addCommandLineError("ERROR:  minReadLength=$mr must be at least minOverlapLength=$mo.\n");
1475    }
1476
1477    foreach my $var ("corOutCoverage") {
1478        if (!defined(getGlobal($var))) {
1479            addCommandLineError("ERROR:  Invalid 'corOutCoverage' specified; must be at least 1.0\n");
1480        }
1481        elsif (getGlobal($var) =~ m/all/i) {
1482            setGlobal($var, 9999);
1483        }
1484        elsif (getGlobal($var) !~ m/^[.-0123456789]+$/) {
1485            addCommandLineError("ERROR:  Invalid '$var' specified (" . getGlobal("$var") . "); must be numeric\n");
1486        }
1487        elsif (getGlobal($var) < 1.0) {
1488            addCommandLineError("ERROR:  Invalid '$var' specified (" . getGlobal("$var") . "); must be at least 1.0\n");
1489        }
1490    }
1491
1492    foreach my $var ("corMaxEvidenceCoverageGlobal", "corMaxEvidenceCoverageLocal") {
1493        if (!defined(getGlobal($var))) {
1494            #  If undef, defaults to corOutCoverage in CorrectReads.pm
1495        }
1496        elsif (getGlobal($var) =~ m/^(\d*\.*\d*)(x*)$/) {
1497            if (($1 < 1.0) && ($2 ne "x")) {
1498                addCommandLineError("ERROR:  Invalid '$var' specified (" . getGlobal("$var") . "); must be at least 1.0\n");
1499            }
1500        }
1501        else {
1502            addCommandLineError("ERROR:  Invalid '$var' specified (" . getGlobal("$var") . "); must be numeric\n");
1503        }
1504    }
1505
1506    foreach my $var ("utgGraphDeviation", "utgRepeatDeviation", "utgRepeatConfusedBP", "minReadLength", "minOverlapLength") {
1507        setGlobal("utgGraphDeviation", 12) if $var eq "utgGraphDeviation" && !defined(getGlobal($var));
1508        setGlobal("utgRepeatDeviation", 1) if $var eq "utgRepeatDeviation" && !defined(getGlobal($var));
1509
1510        if (!defined(getGlobal($var))) {
1511            addCommandLineError("ERROR:  Invalid '$var' specified; must be set\n");
1512        }
1513        elsif (getGlobal($var) !~ m/^[.-0123456789]+$/) {
1514            addCommandLineError("ERROR:  Invalid '$var' specified (" . getGlobal("$var") . "); must be numeric\n");
1515        }
1516        elsif (getGlobal($var) < 0.0) {
1517            addCommandLineError("ERROR:  Invalid '$var' specified (" . getGlobal("$var") . "); must be at least 0.0\n");
1518        }
1519    }
1520
1521    if ((getGlobal("utgChimeraType") ne "none") &&
1522        (getGlobal("utgChimeraType") ne "chimer") &&
1523        (getGlobal("utgChimeraType") ne "uncovered") &&
1524        (getGlobal("utgChimeraType") ne "deadend")) {
1525        addCommandLineError("ERROR:  Invalid 'utgChimeraType' specified (" . getGlobal("utgChimeraType") . "); must be 'none', 'chimer', 'uncovered' or 'deadend'\n");
1526    }
1527
1528    #
1529    #  Check for invalid usage
1530    #
1531
1532    foreach my $tag ("cor", "obt", "utg") {
1533        if ((getGlobal("${tag}Overlapper") ne "mhap") &&
1534            (getGlobal("${tag}Overlapper") ne "ovl")  &&
1535            (getGlobal("${tag}Overlapper") ne "minimap")) {
1536            addCommandLineError("ERROR:  Invalid '${tag}Overlapper' specified (" . getGlobal("${tag}Overlapper") . "); must be 'mhap', 'ovl', or 'minimap'\n");
1537        }
1538    }
1539
1540    foreach my $tag ("cor", "obt", "utg") {
1541        if (getGlobal("${tag}MhapSensitivity") eq "fast") {
1542            print STDERR "WARNING: deprecated ${tag}NhapSensitivity=fast replaced with ${tag}MhapSensitivity=low\n";
1543        }
1544
1545        if ((getGlobal("${tag}MhapSensitivity") ne undef)    &&
1546            (getGlobal("${tag}MhapSensitivity") ne "low")    &&
1547            (getGlobal("${tag}MhapSensitivity") ne "normal") &&
1548            (getGlobal("${tag}MhapSensitivity") ne "high")) {
1549            addCommandLineError("ERROR:  Invalid '${tag}MhapSensitivity' specified (" . getGlobal("${tag}MhapSensitivity") . "); must be 'fast' or 'normal' or 'high'\n");
1550        }
1551    }
1552
1553    if (getGlobal("unitigger") ne "bogart") {
1554        addCommandLineError("ERROR:  Invalid 'unitigger' specified (" . getGlobal("unitigger") . "); must be 'bogart'\n");
1555    }
1556
1557    if ((getGlobal("corConsensus") ne "falcon")) {
1558        addCommandLineError("ERROR:  Invalid 'corConsensus' specified (" . getGlobal("corConsensus") . "); must be 'utgcns' or 'falcon' or 'falconpipe'\n");
1559    }
1560
1561    if ((getGlobal("cnsConsensus") ne "quick") &&
1562        (getGlobal("cnsConsensus") ne "pbdagcon") &&
1563        (getGlobal("cnsConsensus") ne "utgcns")) {
1564        addCommandLineError("ERROR:  Invalid 'cnsConsensus' specified (" . getGlobal("cnsConsensus") . "); must be 'quick', 'pbdagcon', or 'utgcns'\n");
1565    }
1566
1567    if (!defined(getGlobal("maxInputCoverage"))) {
1568       setGlobal("maxInputCoverage", 200);
1569    }
1570    if (getGlobal("maxInputCoverage") eq "all") {
1571        setGlobal("maxInputCoverage", 0);
1572    }
1573    if ((getGlobal("maxInputCoverage") > 0) && (getGlobal("maxInputCoverage") < getGlobal("minInputCoverage"))) {
1574        my $minc = getGlobal("minInputCoverage");
1575        my $maxc = getGlobal("maxInputCoverage");
1576
1577        addCommandLineError("ERROR:  minInputCoverage ($minc) must be less than maxInputCoverage ($maxc).\n");
1578    }
1579    if ((getGlobal("maxInputCoverage") > 0) && (getGlobal("maxInputCoverage") < getGlobal("stopOnLowCoverage"))) {
1580        my $minc = getGlobal("stopOnLowCoverage");
1581        my $maxc = getGlobal("maxInputCoverage");
1582
1583        addCommandLineError("ERROR:  stopOnLowCoverage ($minc) must be less than maxInputCoverage ($maxc).\n");
1584    }
1585
1586    if ((getGlobal("saveOverlaps") ne "0") &&
1587        (getGlobal("saveOverlaps") ne "1")) {
1588        addCommandLineError("ERROR:  Invalid 'saveOverlaps' specified (" . getGlobal("saveOverlaps") . "); must be 'true' or 'false'\n");
1589    }
1590
1591    if ((getGlobal("purgeOverlaps") eq "0") ||
1592        (getGlobal("purgeOverlaps") eq "no")) {
1593        setGlobal("purgeOverlaps", "never");
1594    }
1595
1596    if ((getGlobal("purgeOverlaps") eq "1") ||
1597        (getGlobal("purgeOverlaps") eq "yes")) {
1598        setGlobal("purgeOverlaps", "normal");
1599    }
1600
1601    if ((getGlobal("purgeOverlaps") ne "never") &&
1602        (getGlobal("purgeOverlaps") ne "normal") &&
1603        (getGlobal("purgeOverlaps") ne "aggressive") &&
1604        (getGlobal("purgeOverlaps") ne "dangerous")) {
1605        addCommandLineError("ERROR:  Invalid 'purgeOverlaps' specified (" . getGlobal("purgeOverlaps") . "); must be 'never', 'normal', 'aggressive' or 'dangerous'\n");
1606    }
1607
1608    if ((getGlobal("corFilter") ne "quick") &&
1609        (getGlobal("corFilter") ne "expensive") &&
1610        (getGlobal("corFilter") ne "none")) {
1611        addCommandLineError("ERROR:  Invalid 'corFilter' specified (" . getGlobal("corFilter") . "); must be 'none' or 'quick' or 'expensive'\n");
1612    }
1613
1614
1615    if ((getGlobal("useGrid") ne "0") &&
1616        (getGlobal("useGrid") ne "1") &&
1617        (getGlobal("useGrid") ne "remote")) {
1618        addCommandLineError("ERROR:  Invalid 'useGrid' specified (" . getGlobal("useGrid") . "); must be 'true', 'false' or 'remote'\n");
1619    }
1620
1621
1622    if (defined(getGlobal("stopAfter"))) {
1623        my $ok = 0;
1624        my $st = getGlobal("stopAfter");
1625        $st =~ tr/A-Z/a-z/;
1626
1627        $st = "correction"   if ($st eq "readcorrection");    #  Update the string to allow deprecated usage.
1628        $st = "trimming"     if ($st eq "readtrimming");
1629
1630        my $failureString = "ERROR:  Invalid stopAfter specified (" . getGlobal("stopAfter") . "); must be one of:\n";
1631
1632        my @stopAfter = ("sequenceStore",
1633                         "meryl-configure",
1634                         "meryl-count",
1635                         "meryl-merge",
1636                         "meryl-process",
1637                         "meryl-subtract",
1638                         "meryl",
1639                         "haplotype-configure",
1640                         "haplotype",
1641                         "overlapConfigure",
1642                         "overlap",
1643                         "overlapStoreConfigure",
1644                         "overlapStore",
1645                         "correctionConfigure",
1646                         "correction",
1647                         "trimming",
1648                         "unitig",
1649                         "consensusConfigure",
1650                         "consensus");
1651
1652        foreach my $sa (@stopAfter) {
1653            $failureString .= "ERROR:      '$sa'\n";
1654
1655            $sa =~ tr/A-Z/a-z/;
1656
1657            if ($st eq $sa) {
1658                $ok++;
1659                setGlobal('stopAfter', $st);
1660            }
1661        }
1662
1663        addCommandLineError($failureString)   if ($ok == 0);
1664    }
1665
1666    {
1667        my @v = split '\s+', getGlobal("contigFilter");
1668
1669        if (scalar(@v) != 5) {
1670            addCommandLineError("contigFilter must have five values: minReads minLength singleReadSpan lowCovFraction lowCovDepth\n");
1671        }
1672
1673        addCommandLineError("contigFilter 'minReads' must be a positive integer, currently $v[0]\n")          if (($v[0] < 0) || ($v[0] !~ m/^[0-9]+$/));
1674        addCommandLineError("contigFilter 'minLength' must be a positive integer, currently $v[1]\n")         if (($v[1] < 0) || ($v[1] !~ m/^[0-9]+$/));
1675        addCommandLineError("contigFilter 'singleReadSpan' must be between 0.0 and 1.0, currently $v[2]\n")   if (($v[2] < 0) || (1 < $v[2]) || ($v[2] !~ m/^[0-9]*\.{0,1}[0-9]*$/));
1676        addCommandLineError("contigFilter 'lowCovFraction' must be between 0.0 and 1.0, currently $v[3]\n")   if (($v[3] < 0) || (1 < $v[3]) || ($v[3] !~ m/^[0-9]*\.{0,1}[0-9]*$/));
1677        addCommandLineError("contigFilter 'lowCovDepth' must be a positive integer, currently $v[4]\n")       if (($v[4] < 0) || ($v[4] !~ m/^[0-9]+$/));
1678    }
1679}
1680
1681
16821;
1683