1 package org.broadinstitute.hellbender.tools.walkers.haplotypecaller; 2 3 import htsjdk.variant.variantcontext.VariantContext; 4 import org.broadinstitute.barclay.argparser.Advanced; 5 import org.broadinstitute.barclay.argparser.Argument; 6 import org.broadinstitute.barclay.argparser.ArgumentCollection; 7 import org.broadinstitute.hellbender.engine.FeatureInput; 8 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; 9 import org.broadinstitute.hellbender.utils.haplotype.HaplotypeBAMWriter; 10 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner; 11 12 /** 13 * Set of arguments for Assembly Based Callers 14 */ 15 public abstract class AssemblyBasedCallerArgumentCollection { 16 private static final long serialVersionUID = 1L; 17 public static final String USE_FILTERED_READS_FOR_ANNOTATIONS_LONG_NAME = "use-filtered-reads-for-annotations"; 18 public static final String BAM_OUTPUT_LONG_NAME = "bam-output"; 19 public static final String BAM_OUTPUT_SHORT_NAME = "bamout"; 20 public static final String BAM_WRITER_TYPE_LONG_NAME = "bam-writer-type"; 21 public static final String DONT_USE_SOFT_CLIPPED_BASES_LONG_NAME = "dont-use-soft-clipped-bases"; 22 public static final String DO_NOT_RUN_PHYSICAL_PHASING_LONG_NAME = "do-not-run-physical-phasing"; 23 public static final String MAX_MNP_DISTANCE_LONG_NAME = "max-mnp-distance"; 24 public static final String MAX_MNP_DISTANCE_SHORT_NAME = "mnp-dist"; 25 26 public static final String MIN_BASE_QUALITY_SCORE_LONG_NAME = "min-base-quality-score"; 27 public static final String SMITH_WATERMAN_LONG_NAME = "smith-waterman"; 28 public static final String FORCE_CALL_ALLELES_LONG_NAME = "alleles"; 29 public static final String FORCE_CALL_FILTERED_ALLELES_LONG_NAME = "force-call-filtered-alleles"; 30 public static final String FORCE_CALL_FILTERED_ALLELES_SHORT_NAME = "genotype-filtered-alleles"; 31 public static final String EMIT_REF_CONFIDENCE_LONG_NAME = "emit-ref-confidence"; 32 public static final String EMIT_REF_CONFIDENCE_SHORT_NAME = "ERC"; 33 public static final String ALLELE_EXTENSION_LONG_NAME = "allele-informative-reads-overlap-margin"; 34 createReadThreadingAssembler()35 public ReadThreadingAssembler createReadThreadingAssembler() { 36 final ReadThreadingAssembler assemblyEngine = assemblerArgs.makeReadThreadingAssembler(); 37 assemblyEngine.setDebug(assemblerArgs.debugAssembly); 38 assemblyEngine.setMinBaseQualityToUseInAssembly(minBaseQualityScore); 39 40 return assemblyEngine; 41 } 42 getReadThreadingAssemblerArgumentCollection()43 protected abstract ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgumentCollection(); 44 45 @ArgumentCollection 46 public ReadThreadingAssemblerArgumentCollection assemblerArgs = getReadThreadingAssemblerArgumentCollection(); 47 48 @ArgumentCollection 49 public LikelihoodEngineArgumentCollection likelihoodArgs = new LikelihoodEngineArgumentCollection(); 50 51 /** 52 * The assembled haplotypes and locally realigned reads will be written as BAM to this file if requested. Really 53 * for debugging purposes only. Note that the output here does not include uninformative reads so that not every 54 * input read is emitted to the bam. 55 * 56 * Turning on this mode may result in serious performance cost for the caller. It's really only appropriate to 57 * use in specific areas where you want to better understand why the caller is making specific calls. 58 * 59 * The reads are written out containing an "HC" tag (integer) that encodes which haplotype each read best matches 60 * according to the haplotype caller's likelihood calculation. The use of this tag is primarily intended 61 * to allow good coloring of reads in IGV. Simply go to "Color Alignments By > Tag" and enter "HC" to more 62 * easily see which reads go with these haplotype. 63 * 64 * Note that the haplotypes (called or all, depending on mode) are emitted as single reads covering the entire 65 * active region, coming from sample "HC" and a special read group called "ArtificialHaplotype". This will increase the 66 * pileup depth compared to what would be expected from the reads only, especially in complex regions. 67 * 68 * Note also that only reads that are actually informative about the haplotypes are emitted. By informative we mean 69 * that there's a meaningful difference in the likelihood of the read coming from one haplotype compared to 70 * its next best haplotype. 71 * 72 * If multiple BAMs are passed as input to the tool (as is common for M2), then they will be combined in the bamout 73 * output and tagged with the appropriate sample names. 74 * 75 * The best way to visualize the output of this mode is with IGV. Tell IGV to color the alignments by tag, 76 * and give it the "HC" tag, so you can see which reads support each haplotype. Finally, you can tell IGV 77 * to group by sample, which will separate the potential haplotypes from the reads. All of this can be seen in 78 * <a href="https://www.dropbox.com/s/xvy7sbxpf13x5bp/haplotypecaller%20bamout%20for%20docs.png">this screenshot</a> 79 * 80 */ 81 @Advanced 82 @Argument(fullName= BAM_OUTPUT_LONG_NAME, shortName= BAM_OUTPUT_SHORT_NAME, doc="File to which assembled haplotypes should be written", optional = true) 83 public String bamOutputPath = null; 84 85 /** 86 * The type of BAM output we want to see. This determines whether HC will write out all of the haplotypes it 87 * considered (top 128 max) or just the ones that were selected as alleles and assigned to samples. 88 */ 89 @Advanced 90 @Argument(fullName= BAM_WRITER_TYPE_LONG_NAME, doc="Which haplotypes should be written to the BAM", optional = true) 91 public HaplotypeBAMWriter.WriterType bamWriterType = HaplotypeBAMWriter.WriterType.CALLED_HAPLOTYPES; 92 93 // ----------------------------------------------------------------------------------------------- 94 // arguments for debugging / developing 95 // ----------------------------------------------------------------------------------------------- 96 97 @Advanced 98 @Argument(fullName = DONT_USE_SOFT_CLIPPED_BASES_LONG_NAME, doc = "Do not analyze soft clipped bases in the reads", optional = true) 99 public boolean dontUseSoftClippedBases = false; 100 101 // Parameters to control read error correction 102 103 /** 104 * Bases with a quality below this threshold will not be used for calling. 105 */ 106 @Argument(fullName = MIN_BASE_QUALITY_SCORE_LONG_NAME, shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", optional = true) 107 public byte minBaseQualityScore = 10; 108 109 //Annotations 110 111 @Advanced 112 @Argument(fullName = SMITH_WATERMAN_LONG_NAME, doc = "Which Smith-Waterman implementation to use, generally FASTEST_AVAILABLE is the right choice", optional = true) 113 public SmithWatermanAligner.Implementation smithWatermanImplementation = SmithWatermanAligner.Implementation.JAVA; 114 115 /** 116 * The reference confidence mode makes it possible to emit a per-bp or summarized confidence estimate for a site being strictly homozygous-reference. 117 * See https://software.broadinstitute.org/gatk/documentation/article.php?id=4017 for information about GVCFs. 118 * For Mutect2, this is a BETA feature that functions similarly to the HaplotypeCaller reference confidence/GVCF mode. 119 */ 120 @Advanced 121 @Argument(fullName= EMIT_REF_CONFIDENCE_LONG_NAME, shortName= EMIT_REF_CONFIDENCE_SHORT_NAME, doc="Mode for emitting reference confidence scores (For Mutect2, this is a BETA feature)", optional = true) 122 public ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE; 123 getDefaultMaxMnpDistance()124 protected abstract int getDefaultMaxMnpDistance(); 125 126 /** 127 * Two or more phased substitutions separated by this distance or less are merged into MNPs. 128 */ 129 @Advanced 130 @Argument(fullName = MAX_MNP_DISTANCE_LONG_NAME, shortName = MAX_MNP_DISTANCE_SHORT_NAME, 131 doc = "Two or more phased substitutions separated by this distance or less are merged into MNPs.", optional = true) 132 public int maxMnpDistance = getDefaultMaxMnpDistance(); 133 134 @Argument(fullName= FORCE_CALL_ALLELES_LONG_NAME, doc="The set of alleles to force-call regardless of evidence", optional=true) 135 public FeatureInput<VariantContext> alleles; 136 137 @Advanced 138 @Argument(fullName = FORCE_CALL_FILTERED_ALLELES_LONG_NAME, shortName = FORCE_CALL_FILTERED_ALLELES_SHORT_NAME, doc = "Force-call filtered alleles included in the resource specified by --alleles", optional = true) 139 public boolean forceCallFiltered = false; 140 141 @Advanced 142 @Argument(fullName = "soft-clip-low-quality-ends", doc = "If enabled will preserve low-quality read ends as softclips (used for DRAGEN-GATK BQD genotyper model)", optional = true) 143 public boolean softClipLowQualityEnds = false; 144 145 @Advanced 146 @Argument(fullName = ALLELE_EXTENSION_LONG_NAME, 147 doc = "Likelihood and read-based annotations will only take into consideration reads " + 148 "that overlap the variant or any base no further than this distance expressed in base pairs", 149 optional = true) 150 public int informativeReadOverlapMargin = 2; 151 } 152