package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;

import com.google.common.collect.Lists;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.Hidden;
import org.broadinstitute.hellbender.engine.spark.AssemblyRegionArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
import org.broadinstitute.hellbender.utils.MathUtils;

import java.io.Serializable;
import java.util.List;

/**
 * Set of arguments related to the {@link org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler}.
 *
 * Each public field is exposed as a command-line argument through its {@link Argument} annotation; concrete
 * subclasses implement {@link #makeReadThreadingAssembler()}.
 */
public abstract class ReadThreadingAssemblerArgumentCollection implements Serializable {
    private static final long serialVersionUID = 1L;

    /** Default for {@code --pruning-lod-threshold}: the log-10 value 1.0 passed through {@link MathUtils#log10ToLog}. */
    public static final double DEFAULT_PRUNING_LOG_ODDS_THRESHOLD = MathUtils.log10ToLog(1.0);
    /** Default for {@code --pruning-seeding-lod-threshold}: the log-10 value 4.0 passed through {@link MathUtils#log10ToLog}. */
    public static final double DEFAULT_PRUNING_SEEDING_LOG_ODDS_THRESHOLD = MathUtils.log10ToLog(4.0);

    public static final String ERROR_CORRECT_READS_LONG_NAME = "error-correct-reads";
    public static final String PILEUP_ERROR_CORRECTION_LOG_ODDS_NAME = "error-correction-log-odds";

    public static final String CAPTURE_ASSEMBLY_FAILURE_BAM_LONG_NAME = "capture-assembly-failure-bam";
    public static final String KMER_SIZE_LONG_NAME = "kmer-size";
    public static final String DONT_INCREASE_KMER_SIZE_LONG_NAME = "dont-increase-kmer-sizes-for-cycles";
    public static final String LINKED_DE_BRUIJN_GRAPH_LONG_NAME = "linked-de-bruijn-graph";

    // -----------------------------------------------------------------------------------------------
    // arguments to control internal behavior of the read threading assembler
    // -----------------------------------------------------------------------------------------------

    /**
     * Multiple kmer sizes can be specified, using e.g. `--kmer-size 10 --kmer-size 25`.
     */
    @Advanced
    @Argument(fullName = KMER_SIZE_LONG_NAME, doc="Kmer size to use in the read threading assembler", optional = true)
    public List<Integer> kmerSizes = Lists.newArrayList(10, 25);

    /**
     * When graph cycles are detected, the normal behavior is to increase kmer sizes iteratively until the cycles are
     * resolved. Disabling this behavior may cause the program to give up on assembling the ActiveRegion.
     */
    @Advanced
    @Argument(fullName = DONT_INCREASE_KMER_SIZE_LONG_NAME, doc="Disable iterating over kmer sizes when graph cycles are detected", optional = true)
    public boolean dontIncreaseKmerSizesForCycles = false;

    /**
     * By default, the program does not allow processing of reference sections that contain non-unique kmers. Disabling
     * this check may cause problems in the assembly graph.
     */
    @Advanced
    @Argument(fullName="allow-non-unique-kmers-in-ref", doc="Allow graphs that have non-unique kmers in the reference", optional = true)
    public boolean allowNonUniqueKmersInRef = false;

    /**
     * If fewer samples than the specified number pass the minPruning threshold for a given path, that path will be eliminated from the graph.
     */
    @Advanced
    @Argument(fullName="num-pruning-samples", doc="Number of samples that must pass the minPruning threshold", optional = true)
    public int numPruningSamples = 1;

    /**
     * When constructing the assembly graph we are often left with "dangling" branches. The assembly engine attempts to rescue these branches
     * by merging them back into the main graph. This argument describes the minimum length of a dangling branch needed for the engine to
     * try to rescue it. A smaller number here will lead to higher sensitivity to real variation but also to a higher number of false positives.
     */
    @Advanced
    @Argument(fullName="min-dangling-branch-length", doc="Minimum length of a dangling branch to attempt recovery", optional = true)
    public int minDanglingBranchLength = 4;

    /**
     * By default, the read threading assembler does not recover dangling branches that fork after splitting from the reference. This argument
     * tells the assembly engine to recover all dangling branches.
     */
    @Advanced
    @Argument(fullName="recover-all-dangling-branches", doc="Recover all dangling branches", optional = true)
    public boolean recoverAllDanglingBranches = false;

    /**
     * The assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype
     * considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the
     * run of the haplotype caller we only take maxNumHaplotypesInPopulation paths from the graph, in order of their
     * weights, no matter how many paths are possible to generate from the graph. Putting this number too low
     * will result in dropping true variation because paths that include the real variant are not even considered.
     * You can consider increasing this number when calling organisms with high heterozygosity.
     */
    @Advanced
    @Argument(fullName="max-num-haplotypes-in-population", doc="Maximum number of haplotypes to consider for your population", optional = true)
    public int maxNumHaplotypesInPopulation = 128;

    /**
     * Paths with fewer supporting kmers than the specified threshold will be pruned from the graph.
     *
     * Be aware that this argument can dramatically affect the results of variant calling and should only be used with great caution.
     * Using a prune factor of 1 (or below) will prevent any pruning from the graph, which is generally not ideal; it can make the
     * calling much slower and even less accurate (because it can prevent effective merging of "tails" in the graph). Higher values
     * tend to make the calling much faster, but also lower the sensitivity of the results (because they ultimately require higher
     * depth to produce calls).
     */
    @Advanced
    @Argument(fullName="min-pruning", doc = "Minimum support to not prune paths in the graph", optional = true)
    public int minPruneFactor = 2;

    /**
     * Initial base error rate guess for the probabilistic adaptive pruning model. Results are not very sensitive to this
     * parameter because it is only a starting point from which the algorithm discovers the true error rate.
     */
    @Advanced
    @Argument(fullName="adaptive-pruning-initial-error-rate", doc = "Initial base error rate estimate for adaptive pruning", optional = true)
    public double initialErrorRateForPruning = 0.001;

    /**
     * Natural-log (ln) likelihood ratio threshold for the adaptive pruning algorithm.
     *
     * The default is expressed as a log-10 value and converted with {@link MathUtils#log10ToLog}; see
     * {@link #DEFAULT_PRUNING_LOG_ODDS_THRESHOLD}.
     */
    @Advanced
    @Argument(fullName="pruning-lod-threshold", doc = "Ln likelihood ratio threshold for adaptive pruning algorithm", optional = true)
    public double pruningLogOddsThreshold = DEFAULT_PRUNING_LOG_ODDS_THRESHOLD;

    /**
     * Natural-log (ln) likelihood ratio threshold for seeding the subgraph of good variation in the adaptive pruning algorithm.
     *
     * The default is expressed as a log-10 value and converted with {@link MathUtils#log10ToLog}; see
     * {@link #DEFAULT_PRUNING_SEEDING_LOG_ODDS_THRESHOLD}.
     */
    @Advanced
    @Argument(fullName="pruning-seeding-lod-threshold", doc = "Ln likelihood ratio threshold for seeding subgraph of good variation in adaptive pruning algorithm", optional = true)
    public double pruningSeedingLogOddsThreshold = DEFAULT_PRUNING_SEEDING_LOG_ODDS_THRESHOLD;

    /**
     * The maximum number of variants in graph the adaptive pruner will allow
     */
    @Advanced
    @Argument(fullName="max-unpruned-variants", doc = "Maximum number of variants in graph the adaptive pruner will allow", optional = true)
    public int maxUnprunedVariants = 100;

    /**
     * Disables graph simplification into a seq graph, opts to construct a proper De Bruijn graph with potential loops
     *
     * NOTE: --linked-de-bruijn-graph is currently an experimental feature that does not directly match with
     * the regular HaplotypeCaller. Specifically the haplotype finding code does not perform correctly at complicated
     * sites. Use this mode at your own risk.
     */
    @Advanced
    @Argument(fullName = LINKED_DE_BRUIJN_GRAPH_LONG_NAME, doc = "If enabled, the Assembly Engine will construct a Linked De Bruijn graph to recover better haplotypes", optional = true)
    public boolean useLinkedDeBruijnGraph = false;

    /**
     * This is used to disable the recovery of paths that were dropped in the graph based on the junction trees. Disabling this
     * will affect sensitivity but improve phasing and runtime somewhat.
     */
    @Hidden
    @Argument(fullName="disable-artificial-haplotype-recovery", doc = "If in 'linked-de-bruijn-graph' mode, disable recovery of haplotypes based on graph edges that are not in junction trees", optional = true)
    public boolean disableArtificialHaplotypeRecovery = false;

    /**
     * This option exists purely to support concordance with DRAGEN-GATK, it is not recommended to enable this in most use cases.
     * Use this toggle to re-enable the GATK3 behavior of checking for graph cycles (and consequently throwing away the graph) before
     * pruning low weight chains.
     */
    @Hidden
    // optional = true is stated explicitly so this argument is declared the same way as every other one in this collection.
    @Argument(fullName="enable-legacy-graph-cycle-detection", doc = "Use to revert the change to assembly graph code that moved pruning to before cycle detection", optional = true)
    public boolean enableLegacyGraphCycleDetection = false;

    /**
     * Print verbose debug information about each assembly region. No initializer is given, so the field starts
     * at the Java default of {@code false}.
     */
    @Advanced
    @Argument(fullName="debug-assembly", shortName="debug", doc="Print out verbose debug information about each assembly region", optional = true)
    public boolean debugAssembly;

    /**
     * Debugging toggle for writing DOT formatted graph files out of the assembler.
     *
     * NOTE(review): the help text mentions "only this graph size" although the field is a boolean; confirm the intended wording.
     */
    @Hidden
    @Argument(fullName="debug-graph-transformations", doc="Write DOT formatted graph files out of the assembler for only this graph size", optional = true)
    public boolean debugGraphTransformations = false;

    /**
     * This argument is meant for debugging and is not immediately useful for normal analysis use.
     */
    @Argument(fullName="graph-output", shortName="graph", doc="Write debug assembly graph information to this file", optional = true)
    public String graphOutput = null;

    /**
     * This argument is meant for debugging and is not immediately useful for normal analysis use.
     *
     * NOTE(review): the help text is identical to that of --graph-output and looks copied; confirm what is actually written here.
     */
    @Hidden
    @Argument(fullName="haplotype-debug-histogram-output", doc="Write debug assembly graph information to this file", optional = true)
    public String haplotypeHistogramOutput = null;

    /**
     * Debugging toggle: capture the reads that were in the active region when the assembler failed into a BAM
     * (named assemblyFailure.bam according to the help text).
     */
    @Hidden
    @Argument(fullName = CAPTURE_ASSEMBLY_FAILURE_BAM_LONG_NAME, doc = "Write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", optional = true)
    public boolean captureAssemblyFailureBAM = false;

    /**
     * Number of exactly matching bases required between a dangling end and the path it would be merged into before
     * recovery is attempted. The default of -1 selects the legacy behavior.
     */
    @Hidden
    @Argument(fullName = "num-matching-bases-in-dangling-end-to-recover", doc = "Sets the number of exactly matching bases in the suffix of a dangling tail and the prefix for a dangling head necessary in order to recover the path. (-1 indicates legacy behavior)", optional = true)
    public int minMatchingBasesToDanglingEndRecovery = -1;

    //---------------------------------------------------------------------------------------------------------------
    //
    // Read Error Corrector Related Parameters
    //
    // ---------------------------------------------------------------------------------------------------------------

    /**
     * Enabling this argument may cause fundamental problems with the assembly graph itself.
     */
    @Hidden
    @Argument(fullName = PILEUP_ERROR_CORRECTION_LOG_ODDS_NAME, doc = "Log odds threshold for pileup error correction. Off by default", optional = true)
    public double pileupErrorCorrectionLogOdds = Double.NEGATIVE_INFINITY;

    /**
     * Enabling this argument may cause fundamental problems with the assembly graph itself.
     */
    @Hidden
    @Argument(fullName = ERROR_CORRECT_READS_LONG_NAME, doc = "Use an exploratory algorithm to error correct the kmers used during assembly", optional = true)
    public boolean errorCorrectReads = false;

    /**
     * Enabling this argument may cause fundamental problems with the assembly graph itself.
     *
     * NOTE(review): the help text is identical to that of --error-correct-reads and looks copied; judging by the
     * argument name this is the kmer length used by read error correction - confirm and reword the help text.
     */
    @Hidden
    @Argument(fullName="kmer-length-for-read-error-correction", doc = "Use an exploratory algorithm to error correct the kmers used during assembly", optional = true)
    public int kmerLengthForReadErrorCorrection = 25;

    /**
     * Minimum number of times a k-mer must be observed for it to be considered solid.
     */
    @Hidden
    @Argument(fullName="min-observations-for-kmer-to-be-solid", doc = "A k-mer must be seen at least these times for it considered to be solid", optional = true)
    public int minObservationsForKmerToBeSolid = 20;

    /**
     * Creates the {@link ReadThreadingAssembler} for this argument collection. The implementation is left to
     * subclasses; presumably it is configured from the argument values declared above (verify against subclasses).
     *
     * @return the assembler instance built by the subclass
     */
    public abstract ReadThreadingAssembler makeReadThreadingAssembler();
}