1 package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;
2 
3 import htsjdk.variant.variantcontext.VariantContext;
4 import org.broadinstitute.barclay.argparser.Advanced;
5 import org.broadinstitute.barclay.argparser.Argument;
6 import org.broadinstitute.barclay.argparser.ArgumentCollection;
7 import org.broadinstitute.hellbender.engine.FeatureInput;
8 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
9 import org.broadinstitute.hellbender.utils.haplotype.HaplotypeBAMWriter;
10 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
11 
12 /**
13  * Set of arguments for Assembly Based Callers
14  */
15 public abstract class AssemblyBasedCallerArgumentCollection {
16     private static final long serialVersionUID = 1L;
17     public static final String USE_FILTERED_READS_FOR_ANNOTATIONS_LONG_NAME = "use-filtered-reads-for-annotations";
18     public static final String BAM_OUTPUT_LONG_NAME = "bam-output";
19     public static final String BAM_OUTPUT_SHORT_NAME = "bamout";
20     public static final String BAM_WRITER_TYPE_LONG_NAME = "bam-writer-type";
21     public static final String DONT_USE_SOFT_CLIPPED_BASES_LONG_NAME = "dont-use-soft-clipped-bases";
22     public static final String DO_NOT_RUN_PHYSICAL_PHASING_LONG_NAME = "do-not-run-physical-phasing";
23     public static final String MAX_MNP_DISTANCE_LONG_NAME = "max-mnp-distance";
24     public static final String MAX_MNP_DISTANCE_SHORT_NAME = "mnp-dist";
25 
26     public static final String MIN_BASE_QUALITY_SCORE_LONG_NAME = "min-base-quality-score";
27     public static final String SMITH_WATERMAN_LONG_NAME = "smith-waterman";
28     public static final String FORCE_CALL_ALLELES_LONG_NAME = "alleles";
29     public static final String FORCE_CALL_FILTERED_ALLELES_LONG_NAME = "force-call-filtered-alleles";
30     public static final String FORCE_CALL_FILTERED_ALLELES_SHORT_NAME = "genotype-filtered-alleles";
31     public static final String EMIT_REF_CONFIDENCE_LONG_NAME = "emit-ref-confidence";
32     public static final String EMIT_REF_CONFIDENCE_SHORT_NAME = "ERC";
33     public static final String ALLELE_EXTENSION_LONG_NAME = "allele-informative-reads-overlap-margin";
34 
createReadThreadingAssembler()35     public ReadThreadingAssembler createReadThreadingAssembler() {
36         final ReadThreadingAssembler assemblyEngine = assemblerArgs.makeReadThreadingAssembler();
37         assemblyEngine.setDebug(assemblerArgs.debugAssembly);
38         assemblyEngine.setMinBaseQualityToUseInAssembly(minBaseQualityScore);
39 
40         return assemblyEngine;
41     }
42 
getReadThreadingAssemblerArgumentCollection()43     protected abstract ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgumentCollection();
44 
45     @ArgumentCollection
46     public ReadThreadingAssemblerArgumentCollection assemblerArgs = getReadThreadingAssemblerArgumentCollection();
47 
48     @ArgumentCollection
49     public LikelihoodEngineArgumentCollection likelihoodArgs = new LikelihoodEngineArgumentCollection();
50 
51     /**
52      * The assembled haplotypes and locally realigned reads will be written as BAM to this file if requested.  Really
53      * for debugging purposes only. Note that the output here does not include uninformative reads so that not every
54      * input read is emitted to the bam.
55      *
56      * Turning on this mode may result in serious performance cost for the caller.  It's really only appropriate to
57      * use in specific areas where you want to better understand why the caller is making specific calls.
58      *
59      * The reads are written out containing an "HC" tag (integer) that encodes which haplotype each read best matches
60      * according to the haplotype caller's likelihood calculation.  The use of this tag is primarily intended
61      * to allow good coloring of reads in IGV.  Simply go to "Color Alignments By > Tag" and enter "HC" to more
62      * easily see which reads go with these haplotype.
63      *
64      * Note that the haplotypes (called or all, depending on mode) are emitted as single reads covering the entire
65      * active region, coming from sample "HC" and a special read group called "ArtificialHaplotype". This will increase the
66      * pileup depth compared to what would be expected from the reads only, especially in complex regions.
67      *
68      * Note also that only reads that are actually informative about the haplotypes are emitted.  By informative we mean
69      * that there's a meaningful difference in the likelihood of the read coming from one haplotype compared to
70      * its next best haplotype.
71      *
72      * If multiple BAMs are passed as input to the tool (as is common for M2), then they will be combined in the bamout
73      * output and tagged with the appropriate sample names.
74      *
75      * The best way to visualize the output of this mode is with IGV.  Tell IGV to color the alignments by tag,
76      * and give it the "HC" tag, so you can see which reads support each haplotype.  Finally, you can tell IGV
77      * to group by sample, which will separate the potential haplotypes from the reads.  All of this can be seen in
78      * <a href="https://www.dropbox.com/s/xvy7sbxpf13x5bp/haplotypecaller%20bamout%20for%20docs.png">this screenshot</a>
79      *
80      */
81     @Advanced
82     @Argument(fullName= BAM_OUTPUT_LONG_NAME, shortName= BAM_OUTPUT_SHORT_NAME, doc="File to which assembled haplotypes should be written", optional = true)
83     public String bamOutputPath = null;
84 
85     /**
86      * The type of BAM output we want to see. This determines whether HC will write out all of the haplotypes it
87      * considered (top 128 max) or just the ones that were selected as alleles and assigned to samples.
88      */
89     @Advanced
90     @Argument(fullName= BAM_WRITER_TYPE_LONG_NAME, doc="Which haplotypes should be written to the BAM", optional = true)
91     public HaplotypeBAMWriter.WriterType bamWriterType = HaplotypeBAMWriter.WriterType.CALLED_HAPLOTYPES;
92 
93     // -----------------------------------------------------------------------------------------------
94     // arguments for debugging / developing
95     // -----------------------------------------------------------------------------------------------
96 
97     @Advanced
98     @Argument(fullName = DONT_USE_SOFT_CLIPPED_BASES_LONG_NAME, doc = "Do not analyze soft clipped bases in the reads", optional = true)
99     public boolean dontUseSoftClippedBases = false;
100 
101     // Parameters to control read error correction
102 
103     /**
104      * Bases with a quality below this threshold will not be used for calling.
105      */
106     @Argument(fullName = MIN_BASE_QUALITY_SCORE_LONG_NAME, shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", optional = true)
107     public byte minBaseQualityScore = 10;
108 
109     //Annotations
110 
111     @Advanced
112     @Argument(fullName = SMITH_WATERMAN_LONG_NAME, doc = "Which Smith-Waterman implementation to use, generally FASTEST_AVAILABLE is the right choice", optional = true)
113     public SmithWatermanAligner.Implementation smithWatermanImplementation = SmithWatermanAligner.Implementation.JAVA;
114 
115     /**
116      * The reference confidence mode makes it possible to emit a per-bp or summarized confidence estimate for a site being strictly homozygous-reference.
117      * See https://software.broadinstitute.org/gatk/documentation/article.php?id=4017 for information about GVCFs.
118      * For Mutect2, this is a BETA feature that functions similarly to the HaplotypeCaller reference confidence/GVCF mode.
119      */
120     @Advanced
121     @Argument(fullName= EMIT_REF_CONFIDENCE_LONG_NAME, shortName= EMIT_REF_CONFIDENCE_SHORT_NAME, doc="Mode for emitting reference confidence scores (For Mutect2, this is a BETA feature)", optional = true)
122     public ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE;
123 
getDefaultMaxMnpDistance()124     protected abstract int getDefaultMaxMnpDistance();
125 
126     /**
127      * Two or more phased substitutions separated by this distance or less are merged into MNPs.
128      */
129     @Advanced
130     @Argument(fullName = MAX_MNP_DISTANCE_LONG_NAME, shortName = MAX_MNP_DISTANCE_SHORT_NAME,
131             doc = "Two or more phased substitutions separated by this distance or less are merged into MNPs.", optional = true)
132     public int maxMnpDistance = getDefaultMaxMnpDistance();
133 
134     @Argument(fullName= FORCE_CALL_ALLELES_LONG_NAME, doc="The set of alleles to force-call regardless of evidence", optional=true)
135     public FeatureInput<VariantContext> alleles;
136 
137     @Advanced
138     @Argument(fullName = FORCE_CALL_FILTERED_ALLELES_LONG_NAME, shortName = FORCE_CALL_FILTERED_ALLELES_SHORT_NAME, doc = "Force-call filtered alleles included in the resource specified by --alleles", optional = true)
139     public boolean forceCallFiltered = false;
140 
141     @Advanced
142     @Argument(fullName = "soft-clip-low-quality-ends", doc = "If enabled will preserve low-quality read ends as softclips (used for DRAGEN-GATK BQD genotyper model)", optional = true)
143     public boolean softClipLowQualityEnds = false;
144 
145     @Advanced
146     @Argument(fullName = ALLELE_EXTENSION_LONG_NAME,
147             doc = "Likelihood and read-based annotations will only take into consideration reads " +
148                     "that overlap the variant or any base no further than this distance expressed in base pairs",
149             optional = true)
150     public int informativeReadOverlapMargin = 2;
151 }
152