1 package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;
2 
3 import com.google.common.collect.Lists;
4 import org.broadinstitute.barclay.argparser.Advanced;
5 import org.broadinstitute.barclay.argparser.Argument;
6 import org.broadinstitute.barclay.argparser.Hidden;
7 import org.broadinstitute.hellbender.engine.spark.AssemblyRegionArgumentCollection;
8 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
9 import org.broadinstitute.hellbender.utils.MathUtils;
10 
11 import java.io.Serializable;
12 import java.util.List;
13 
14 /**
15  * Set of arguments related to the {@link org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler}
16  */
17 public abstract class ReadThreadingAssemblerArgumentCollection implements Serializable {
18     private static final long serialVersionUID = 1L;
19 
20     public static final double DEFAULT_PRUNING_LOG_ODDS_THRESHOLD = MathUtils.log10ToLog(1.0);
21     public static final double DEFAULT_PRUNING_SEEDING_LOG_ODDS_THRESHOLD = MathUtils.log10ToLog(4.0);
22 
23     public static final String ERROR_CORRECT_READS_LONG_NAME = "error-correct-reads";
24     public static final String PILEUP_ERROR_CORRECTION_LOG_ODDS_NAME = "error-correction-log-odds";
25 
26     public static final String CAPTURE_ASSEMBLY_FAILURE_BAM_LONG_NAME = "capture-assembly-failure-bam";
27     public static final String KMER_SIZE_LONG_NAME = "kmer-size";
28     public static final String DONT_INCREASE_KMER_SIZE_LONG_NAME = "dont-increase-kmer-sizes-for-cycles";
29     public static final String LINKED_DE_BRUIJN_GRAPH_LONG_NAME = "linked-de-bruijn-graph";
30 
31     // -----------------------------------------------------------------------------------------------
32     // arguments to control internal behavior of the read threading assembler
33     // -----------------------------------------------------------------------------------------------
34 
35     /**
36      * Multiple kmer sizes can be specified, using e.g. `--kmer-size 10 --kmer-size 25`.
37      */
38     @Advanced
39     @Argument(fullName= KMER_SIZE_LONG_NAME, doc="Kmer size to use in the read threading assembler", optional = true)
40     public List<Integer> kmerSizes = Lists.newArrayList(10, 25);
41 
42     /**
43      * When graph cycles are detected, the normal behavior is to increase kmer sizes iteratively until the cycles are
44      * resolved. Disabling this behavior may cause the program to give up on assembling the ActiveRegion.
45      */
46     @Advanced
47     @Argument(fullName= DONT_INCREASE_KMER_SIZE_LONG_NAME, doc="Disable iterating over kmer sizes when graph cycles are detected", optional = true)
48     public boolean dontIncreaseKmerSizesForCycles = false;
49 
50     /**
51      * By default, the program does not allow processing of reference sections that contain non-unique kmers. Disabling
52      * this check may cause problems in the assembly graph.
53      */
54     @Advanced
55     @Argument(fullName="allow-non-unique-kmers-in-ref", doc="Allow graphs that have non-unique kmers in the reference", optional = true)
56     public boolean allowNonUniqueKmersInRef = false;
57 
58     /**
59      * If fewer samples than the specified number pass the minPruning threshold for a given path, that path will be eliminated from the graph.
60      */
61     @Advanced
62     @Argument(fullName="num-pruning-samples", doc="Number of samples that must pass the minPruning threshold", optional = true)
63     public int numPruningSamples = 1;
64 
65     /**
66      * When constructing the assembly graph we are often left with "dangling" branches.  The assembly engine attempts to rescue these branches
67      * by merging them back into the main graph.  This argument describes the minimum length of a dangling branch needed for the engine to
68      * try to rescue it.  A smaller number here will lead to higher sensitivity to real variation but also to a higher number of false positives.
69      */
70     @Advanced
71     @Argument(fullName="min-dangling-branch-length", doc="Minimum length of a dangling branch to attempt recovery", optional = true)
72     public int minDanglingBranchLength = 4;
73 
74     /**
75      * By default, the read threading assembler does not recover dangling branches that fork after splitting from the reference.  This argument
76      * tells the assembly engine to recover all dangling branches.
77      */
78     @Advanced
79     @Argument(fullName="recover-all-dangling-branches", doc="Recover all dangling branches", optional = true)
80     public boolean recoverAllDanglingBranches = false;
81 
82     /**
83      * The assembly graph can be quite complex, and could imply a very large number of possible haplotypes.  Each haplotype
84      * considered requires N PairHMM evaluations if there are N reads across all samples.  In order to control the
85      * run of the haplotype caller we only take maxNumHaplotypesInPopulation paths from the graph, in order of their
86      * weights, no matter how many paths are possible to generate from the graph.  Putting this number too low
87      * will result in dropping true variation because paths that include the real variant are not even considered.
88      * You can consider increasing this number when calling organisms with high heterozygosity.
89      */
90     @Advanced
91     @Argument(fullName="max-num-haplotypes-in-population", doc="Maximum number of haplotypes to consider for your population", optional = true)
92     public int maxNumHaplotypesInPopulation = 128;
93 
94     /**
95      * Paths with fewer supporting kmers than the specified threshold will be pruned from the graph.
96      *
97      * Be aware that this argument can dramatically affect the results of variant calling and should only be used with great caution.
98      * Using a prune factor of 1 (or below) will prevent any pruning from the graph, which is generally not ideal; it can make the
99      * calling much slower and even less accurate (because it can prevent effective merging of "tails" in the graph).  Higher values
100      * tend to make the calling much faster, but also lowers the sensitivity of the results (because it ultimately requires higher
101      * depth to produce calls).
102      */
103     @Advanced
104     @Argument(fullName="min-pruning", doc = "Minimum support to not prune paths in the graph", optional = true)
105     public int minPruneFactor = 2;
106 
107     /**
108      * Initial base error rate guess for the probabilistic adaptive pruning model.  Results are not very sensitive to this
109      * parameter because it is only a starting point from which the algorithm discovers the true error rate.
110      */
111     @Advanced
112     @Argument(fullName="adaptive-pruning-initial-error-rate", doc = "Initial base error rate estimate for adaptive pruning", optional = true)
113     public double initialErrorRateForPruning = 0.001;
114 
115     /**
116      * Log-10 likelihood ratio threshold for adaptive pruning algorithm.
117      */
118     @Advanced
119     @Argument(fullName="pruning-lod-threshold", doc = "Ln likelihood ratio threshold for adaptive pruning algorithm", optional = true)
120     public double pruningLogOddsThreshold = DEFAULT_PRUNING_LOG_ODDS_THRESHOLD;
121 
122     /**
123      * Log-10 likelihood ratio threshold for adaptive pruning algorithm.
124      */
125     @Advanced
126     @Argument(fullName="pruning-seeding-lod-threshold", doc = "Ln likelihood ratio threshold for seeding subgraph of good variation in adaptive pruning algorithm", optional = true)
127     public double pruningSeedingLogOddsThreshold = DEFAULT_PRUNING_SEEDING_LOG_ODDS_THRESHOLD;
128 
129     /**
130      * The maximum number of variants in graph the adaptive pruner will allow
131      */
132     @Advanced
133     @Argument(fullName="max-unpruned-variants", doc = "Maximum number of variants in graph the adaptive pruner will allow", optional = true)
134     public int maxUnprunedVariants = 100;
135 
136     /**
137      * Disables graph simplification into a seq graph, opts to construct a proper De Bruijn graph with potential loops
138      *
139      * NOTE: --linked-de-bruijn-graph is currently an experimental feature that does not directly match with
140      *        the regular HaplotypeCaller. Specifically the haplotype finding code does not perform correctly at complicated
141      *        sites. Use this mode at your own risk.
142      */
143     @Advanced
144     @Argument(fullName= LINKED_DE_BRUIJN_GRAPH_LONG_NAME, doc = "If enabled, the Assembly Engine will construct a Linked De Bruijn graph to recover better haplotypes", optional = true)
145     public boolean useLinkedDeBruijnGraph = false;
146 
147     /**
148      * This is used to disable the recovery of paths that were dropped in the graph based on the junction trees. Disabling this
149      * will affect sensitivity but improve phasing and runtime somewhat.
150      */
151     @Hidden
152     @Argument(fullName="disable-artificial-haplotype-recovery", doc = "If in 'linked-de-bruijn-graph' mode, disable recovery of haplotypes based on graph edges that are not in junction trees", optional = true)
153     public boolean disableArtificialHaplotypeRecovery = false;
154 
155     /**
156      * This option exists purely to support concordance with DRAGEN-GATK, it is not recommended to enable this in most use cases.
157      * Use this toggle to re-enable the GATK3 behavior of checking for graph cycles (and consequently throwing away the graph) before
158      * purning low weight chains.
159      */
160     @Hidden
161     @Argument(fullName="enable-legacy-graph-cycle-detection", doc = "Use to revert the change to assembly graph code that moved pruning to before cycle detection")
162     public boolean enableLegacyGraphCycleDetection = false;
163 
164     @Advanced
165     @Argument(fullName="debug-assembly", shortName="debug", doc="Print out verbose debug information about each assembly region", optional = true)
166     public boolean debugAssembly;
167 
168     @Hidden
169     @Argument(fullName="debug-graph-transformations", doc="Write DOT formatted graph files out of the assembler for only this graph size", optional = true)
170     public boolean debugGraphTransformations = false;
171 
172     /**
173      * This argument is meant for debugging and is not immediately useful for normal analysis use.
174      */
175     @Argument(fullName="graph-output", shortName="graph", doc="Write debug assembly graph information to this file", optional = true)
176     public String graphOutput = null;
177 
178     /**
179      * This argument is meant for debugging and is not immediately useful for normal analysis use.
180      */
181     @Hidden
182     @Argument(fullName="haplotype-debug-histogram-output", doc="Write debug assembly graph information to this file", optional = true)
183     public String haplotypeHistogramOutput = null;
184 
185     @Hidden
186     @Argument(fullName = CAPTURE_ASSEMBLY_FAILURE_BAM_LONG_NAME, doc = "Write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", optional = true)
187     public boolean captureAssemblyFailureBAM = false;
188 
189     @Hidden
190     @Argument(fullName = "num-matching-bases-in-dangling-end-to-recover", doc = "Sets the number of exactly matching bases in the suffix of a dangling tail and the prefix for a dangling head necessary in order to recover the path. (-1 indicates legacy behavior)", optional = true)
191     public int minMatchingBasesToDanglingEndRecovery = -1;
192 
193     //---------------------------------------------------------------------------------------------------------------
194     //
195     // Read Error Corrector Related Parameters
196     //
197     // ---------------------------------------------------------------------------------------------------------------
198 
199     /**
200      * Enabling this argument may cause fundamental problems with the assembly graph itself.
201      */
202     @Hidden
203     @Argument(fullName = PILEUP_ERROR_CORRECTION_LOG_ODDS_NAME, doc = "Log odds threshold for pileup error correction.  Off by default", optional = true)
204     public double pileupErrorCorrectionLogOdds = Double.NEGATIVE_INFINITY;
205 
206 
207     /**
208      * Enabling this argument may cause fundamental problems with the assembly graph itself.
209      */
210     @Hidden
211     @Argument(fullName = ERROR_CORRECT_READS_LONG_NAME, doc = "Use an exploratory algorithm to error correct the kmers used during assembly", optional = true)
212     public boolean errorCorrectReads = false;
213 
214     /**
215      * Enabling this argument may cause fundamental problems with the assembly graph itself.
216      */
217     @Hidden
218     @Argument(fullName="kmer-length-for-read-error-correction", doc = "Use an exploratory algorithm to error correct the kmers used during assembly", optional = true)
219     public int kmerLengthForReadErrorCorrection = 25;
220 
221     @Hidden
222     @Argument(fullName="min-observations-for-kmer-to-be-solid", doc = "A k-mer must be seen at least these times for it considered to be solid", optional = true)
223     public int minObservationsForKmerToBeSolid = 20;
224 
makeReadThreadingAssembler()225     public abstract ReadThreadingAssembler makeReadThreadingAssembler();
226 }
227