1 package org.broadinstitute.hellbender.tools.walkers.mutect.filtering;
2 
3 import org.broadinstitute.barclay.argparser.Argument;
4 import org.broadinstitute.hellbender.tools.walkers.mutect.M2ArgumentCollection;
5 import org.broadinstitute.hellbender.utils.MathUtils;
6 
7 import java.io.File;
8 import java.util.ArrayList;
9 import java.util.List;
10 
11 public class M2FiltersArgumentCollection {
12     private static final long serialVersionUID = 9345L;
13 
14     /**
15      * meta-filtering parameters -- how we set the threshold on artifact probabilities
16      */
17     public static final String THRESHOLD_STRATEGY_LONG_NAME = "threshold-strategy";
18     public static final String F_SCORE_BETA_LONG_NAME = "f-score-beta";
19     public static final String FALSE_DISCOVERY_RATE_LONG_NAME = "false-discovery-rate";
20     public static final String INITIAL_THRESHOLD_LONG_NAME = "initial-threshold";
21 
22     private static final ThresholdCalculator.Strategy DEFAULT_THRESHOLD_STRATEGY = ThresholdCalculator.Strategy.OPTIMAL_F_SCORE;
23     private static final double DEFAULT_F_SCORE_BETA = 1.0;
24     private static final double DEFAULT_MAX_FALSE_DISCOVERY_RATE = 0.05;
25     private static final double DEFAULT_INITIAL_POSTERIOR_THRESHOLD = 0.1;
26 
27     @Argument(fullName = THRESHOLD_STRATEGY_LONG_NAME, optional = true, doc = "The method for optimizing the posterior probability threshold")
28     public ThresholdCalculator.Strategy thresholdStrategy = DEFAULT_THRESHOLD_STRATEGY;
29 
30     @Argument(fullName = F_SCORE_BETA_LONG_NAME, optional = true, doc = "F score beta, the relative weight of recall to precision, used if OPTIMAL_F_SCORE strategy is chosen")
31     public double fScoreBeta = DEFAULT_F_SCORE_BETA;
32 
33     @Argument(fullName = FALSE_DISCOVERY_RATE_LONG_NAME, optional = true, doc = "Maximum false discovery rate allowed if FALSE_DISCOVERY_RATE threshold strategy is chosen")
34     public double maxFalsePositiveRate = DEFAULT_MAX_FALSE_DISCOVERY_RATE;
35 
36     @Argument(fullName = INITIAL_THRESHOLD_LONG_NAME, optional = true, doc = "Initial artifact probability threshold used in first iteration")
37     public double initialPosteriorThreshold = DEFAULT_INITIAL_POSTERIOR_THRESHOLD;
38 
39     /**
40      * Mitochondria mode excludes the filters {@link ClusteredEventsFilter}, {@link MultiallelicFilter}, {@link PolymeraseSlippageFilter},
41      * {@link FilteredHaplotypeFilter}, {@link FragmentLengthFilter}, and {@link GermlineFilter}
42      */
43     @Argument(fullName = M2ArgumentCollection.MITOCHONDRIA_MODE_LONG_NAME, optional = true, doc = "Set filters to mitochondrial defaults")
44     public boolean mitochondria = false;
45 
46 
47     /**
48      * Hard filter thresholds
49      */
50     public static final String MAX_EVENTS_IN_REGION_LONG_NAME = "max-events-in-region";
51     public static final String MAX_ALT_ALLELE_COUNT_LONG_NAME = "max-alt-allele-count";
52     public static final String UNIQUE_ALT_READ_COUNT_LONG_NAME = "unique-alt-read-count";
53     public static final String MIN_MEDIAN_MAPPING_QUALITY_LONG_NAME = "min-median-mapping-quality";
54     public static final String MIN_MEDIAN_BASE_QUALITY_LONG_NAME = "min-median-base-quality";
55     public static final String MAX_MEDIAN_FRAGMENT_LENGTH_DIFFERENCE_LONG_NAME = "max-median-fragment-length-difference";
56     public static final String MIN_MEDIAN_READ_POSITION_LONG_NAME = "min-median-read-position";
57     public static final String MAX_N_RATIO_LONG_NAME = "max-n-ratio";
58     public static final String MIN_READS_ON_EACH_STRAND_LONG_NAME = "min-reads-per-strand";
59     public static final String MIN_AF_LONG_NAME = "min-allele-fraction";
60 
61     private static final int DEFAULT_MAX_EVENTS_IN_REGION = 2;
62     private static final int DEFAULT_MAX_ALT_ALLELES = 1;
63     private static final int DEFAULT_MIN_UNIQUE_ALT_READS = 0;
64     private static final int DEFAULT_MIN_MEDIAN_MAPPING_QUALITY = 30;
65     private static final int DEFAULT_MIN_MEDIAN_BASE_QUALITY = 20;
66     private static final int DEFAULT_MAX_MEDIAN_FRAGMENT_LENGTH_DIFFERENCE = 10000;
67     private static final int DEFAULT_MIN_MEDIAN_READ_POSITION = 1;
68     private static final double DEFAULT_MAX_N_RATIO = Double.POSITIVE_INFINITY;
69     private static final int DEFAULT_MIN_READS_ON_EACH_STRAND = 0;
70     private static final double DEFAULT_MAX_NUMT_FRACTION = 0.85;
71     private static final double DEFAULT_MIN_AF = 0;
72 
73     @Argument(fullName = MAX_EVENTS_IN_REGION_LONG_NAME, optional = true, doc = "Maximum events in a single assembly region.  Filter all variants if exceeded.")
74     public int maxEventsInRegion = DEFAULT_MAX_EVENTS_IN_REGION;
75 
76     @Argument(fullName = MAX_ALT_ALLELE_COUNT_LONG_NAME, optional = true, doc = "Maximum alt alleles per site.")
77     public int numAltAllelesThreshold = DEFAULT_MAX_ALT_ALLELES;
78 
79     @Argument(fullName = UNIQUE_ALT_READ_COUNT_LONG_NAME, shortName = "unique", optional = true, doc = "Minimum unique (i.e. deduplicated) reads supporting the alternate allele")
80     public int uniqueAltReadCount = DEFAULT_MIN_UNIQUE_ALT_READS;
81 
82     @Argument(fullName = MIN_MEDIAN_MAPPING_QUALITY_LONG_NAME, optional = true, doc="Minimum median mapping quality of alt reads")
83     public int minMedianMappingQuality = DEFAULT_MIN_MEDIAN_MAPPING_QUALITY;
84 
85     @Argument(fullName = MIN_MEDIAN_BASE_QUALITY_LONG_NAME, optional = true, doc="Minimum median base quality of alt reads")
86     public int minMedianBaseQuality = DEFAULT_MIN_MEDIAN_BASE_QUALITY;
87 
88     @Argument(fullName = MAX_MEDIAN_FRAGMENT_LENGTH_DIFFERENCE_LONG_NAME, optional = true, doc="Maximum difference between median alt and ref fragment lengths")
89     public int maxMedianFragmentLengthDifference = DEFAULT_MAX_MEDIAN_FRAGMENT_LENGTH_DIFFERENCE;
90 
91     @Argument(fullName = MIN_MEDIAN_READ_POSITION_LONG_NAME, optional = true, doc = "Minimum median distance of variants from the end of reads")
92     public int minMedianReadPosition = DEFAULT_MIN_MEDIAN_READ_POSITION;
93 
94     @Argument(fullName = MAX_N_RATIO_LONG_NAME, optional = true, doc = "Maximum fraction of non-ref bases in the pileup that are N (unknown)")
95     public double nRatio = DEFAULT_MAX_N_RATIO;
96 
97     @Argument(fullName = MIN_READS_ON_EACH_STRAND_LONG_NAME, optional = true, doc = "Minimum alt reads required on both forward and reverse strands")
98     public int minReadsOnEachStrand = DEFAULT_MIN_READS_ON_EACH_STRAND;
99 
100     @Argument(fullName = MIN_AF_LONG_NAME, doc="Minimum allele fraction required", optional = true)
101     public double minAf = DEFAULT_MIN_AF;
102 
103 
104     /**
105      * Input files and values to use if inputs are missing
106      */
107     public static final String CONTAMINATION_TABLE_LONG_NAME = "contamination-table";
108     public static final String CONTAMINATION_ESTIMATE_LONG_NAME = "contamination-estimate";
109     public static final String TUMOR_SEGMENTATION_LONG_NAME = "tumor-segmentation";
110     public static final String ARTIFACT_PRIOR_TABLE_NAME = "orientation-bias-artifact-priors";
111     public static final String ARTIFACT_PRIOR_TABLE_SHORT_NAME = "ob-priors";
112 
113     private static final double DEFAULT_CONTAMINATION = 0.0;
114 
115     @Argument(fullName = CONTAMINATION_TABLE_LONG_NAME, optional = true, doc = "Tables containing contamination information.")
116     public List<File> contaminationTables = new ArrayList<>();
117 
118     @Argument(fullName = CONTAMINATION_ESTIMATE_LONG_NAME, optional = true, doc = "Estimate of contamination.")
119     public double contaminationEstimate = DEFAULT_CONTAMINATION;
120 
121     @Argument(fullName = TUMOR_SEGMENTATION_LONG_NAME, doc="Tables containing tumor segments' minor allele fractions for germline hets emitted by CalculateContamination", optional = true)
122     public List<File> tumorSegmentationTables = new ArrayList<>();
123 
124     @Argument(fullName = ARTIFACT_PRIOR_TABLE_NAME, shortName =  ARTIFACT_PRIOR_TABLE_SHORT_NAME, optional = true, doc = "One or more .tar.gz files containing tables of prior artifact probabilities for the read orientation filter model, one table per tumor sample")
125     public List<File> readOrientationPriorTarGzs = new ArrayList<>();
126 
127 
128     /**
129      * Parameters
130      */
131     public static final String LOG_SNV_PRIOR_LONG_NAME = "log-snv-prior";
132     public static final String LOG_INDEL_PRIOR_LONG_NAME = "log-indel-prior";
133     public static final String LOG_ARTIFACT_VERSUS_VARIANT_PRIOR_LONG_NAME = "log-artifact-prior";
134     public static final String NORMAL_P_VALUE_THRESHOLD_LONG_NAME = "normal-p-value-threshold";
135     public static final String MIN_POLYMERASE_SLIPPAGE_LENGTH = "min-slippage-length";
136     public static final String PCR_SLIPPAGE_RATE_LONG_NAME = "pcr-slippage-rate";
137     public static final String MAX_DISTANCE_TO_FILTERED_CALL_ON_SAME_HAPLOTYPE_LONG_NAME = "distance-on-haplotype";
138     public static final String LONG_INDEL_LENGTH_LONG_NAME = "long-indel-length";
139 
140     private static final double DEFAULT_LOG_SNV_PRIOR = MathUtils.log10ToLog(-6);
141     private static final double DEFAULT_LOG_INDEL_PRIOR = MathUtils.log10ToLog(-7);
142     // Mitochondria defaults from a back of the envelope calculation. This assumes ~3 indels and ~50 snps in the 16kb
143     // mitochondria, which is a reasonable assumption for some haplogroups.
144     private static final double DEFAULT_LOG_SNV_PRIOR_FOR_MITO = MathUtils.log10ToLog(-2.5);
145     private static final double DEFAULT_LOG_INDEL_PRIOR_FOR_MITO = MathUtils.log10ToLog(-3.75);
146     private static final double DEFAULT_INITIAL_LOG_PRIOR_OF_VARIANT_VERSUS_ARTIFACT = MathUtils.log10ToLog(-1);
147     private static final double DEFAULT_NORMAL_P_VALUE_THRESHOLD = 0.001;
148     private static final int DEFAULT_MIN_SLIPPAGE_LENGTH = 8;
149     private static final double DEFAULT_SLIPPAGE_RATE = 0.1;
150     private static final int DEFAULT_MAX_INTRA_HAPLOTYPE_DISTANCE = 100;
151     private static final int DEFAULT_LONG_INDEL_SIZE = 5;
152 
153     @Argument(fullName= LOG_SNV_PRIOR_LONG_NAME, doc="Initial ln prior probability that a site has a somatic SNV", optional = true)
154     public double logSNVPrior = DEFAULT_LOG_SNV_PRIOR;
155 
getLogSnvPrior()156     public double getLogSnvPrior() {
157         return mitochondria && logSNVPrior == DEFAULT_LOG_SNV_PRIOR ? DEFAULT_LOG_SNV_PRIOR_FOR_MITO : logSNVPrior;
158     }
159 
160     @Argument(fullName= LOG_INDEL_PRIOR_LONG_NAME, doc="Initial ln prior probability that a site has a somatic indel", optional = true)
161     public double logIndelPrior = DEFAULT_LOG_INDEL_PRIOR;
162 
getLogIndelPrior()163     public double getLogIndelPrior() {
164         return mitochondria && logIndelPrior == DEFAULT_LOG_INDEL_PRIOR ? DEFAULT_LOG_INDEL_PRIOR_FOR_MITO : logIndelPrior;
165     }
166 
167     @Argument(fullName= LOG_ARTIFACT_VERSUS_VARIANT_PRIOR_LONG_NAME, doc="Initial ln prior probability that a called site is not a technical artifact", optional = true)
168     public double initialLogPriorOfVariantVersusArtifact = DEFAULT_INITIAL_LOG_PRIOR_OF_VARIANT_VERSUS_ARTIFACT;
169 
170     @Argument(fullName = NORMAL_P_VALUE_THRESHOLD_LONG_NAME, optional = true, doc = "P value threshold for normal artifact filter")
171     public double normalPileupPValueThreshold = DEFAULT_NORMAL_P_VALUE_THRESHOLD;
172 
173     @Argument(fullName = MIN_POLYMERASE_SLIPPAGE_LENGTH, optional = true, doc = "Minimum number of reference bases in an STR to suspect polymerase slippage")
174     public int minSlippageLength = DEFAULT_MIN_SLIPPAGE_LENGTH;
175 
176     @Argument(fullName = PCR_SLIPPAGE_RATE_LONG_NAME, optional = true, doc = "The frequency of polymerase slippage in contexts where it is suspected")
177     public double slippageRate = DEFAULT_SLIPPAGE_RATE;
178 
179     @Argument(fullName = MAX_DISTANCE_TO_FILTERED_CALL_ON_SAME_HAPLOTYPE_LONG_NAME, optional = true, doc = "On second filtering pass, variants with same PGT and PID tags as a filtered variant within this distance are filtered.")
180     public int maxDistanceToFilteredCallOnSameHaplotype = DEFAULT_MAX_INTRA_HAPLOTYPE_DISTANCE;
181 
182     @Argument(fullName = LONG_INDEL_LENGTH_LONG_NAME, optional = true, doc = "Indels of this length or greater are treated specially by the mapping quality filter.")
183     public int longIndelLength = DEFAULT_LONG_INDEL_SIZE;
184 }
185