1 package org.broadinstitute.hellbender.tools.walkers.mutect.filtering; 2 3 import org.broadinstitute.barclay.argparser.Argument; 4 import org.broadinstitute.hellbender.tools.walkers.mutect.M2ArgumentCollection; 5 import org.broadinstitute.hellbender.utils.MathUtils; 6 7 import java.io.File; 8 import java.util.ArrayList; 9 import java.util.List; 10 11 public class M2FiltersArgumentCollection { 12 private static final long serialVersionUID = 9345L; 13 14 /** 15 * meta-filtering parameters -- how we set the threshold on artifact probabilities 16 */ 17 public static final String THRESHOLD_STRATEGY_LONG_NAME = "threshold-strategy"; 18 public static final String F_SCORE_BETA_LONG_NAME = "f-score-beta"; 19 public static final String FALSE_DISCOVERY_RATE_LONG_NAME = "false-discovery-rate"; 20 public static final String INITIAL_THRESHOLD_LONG_NAME = "initial-threshold"; 21 22 private static final ThresholdCalculator.Strategy DEFAULT_THRESHOLD_STRATEGY = ThresholdCalculator.Strategy.OPTIMAL_F_SCORE; 23 private static final double DEFAULT_F_SCORE_BETA = 1.0; 24 private static final double DEFAULT_MAX_FALSE_DISCOVERY_RATE = 0.05; 25 private static final double DEFAULT_INITIAL_POSTERIOR_THRESHOLD = 0.1; 26 27 @Argument(fullName = THRESHOLD_STRATEGY_LONG_NAME, optional = true, doc = "The method for optimizing the posterior probability threshold") 28 public ThresholdCalculator.Strategy thresholdStrategy = DEFAULT_THRESHOLD_STRATEGY; 29 30 @Argument(fullName = F_SCORE_BETA_LONG_NAME, optional = true, doc = "F score beta, the relative weight of recall to precision, used if OPTIMAL_F_SCORE strategy is chosen") 31 public double fScoreBeta = DEFAULT_F_SCORE_BETA; 32 33 @Argument(fullName = FALSE_DISCOVERY_RATE_LONG_NAME, optional = true, doc = "Maximum false discovery rate allowed if FALSE_DISCOVERY_RATE threshold strategy is chosen") 34 public double maxFalsePositiveRate = DEFAULT_MAX_FALSE_DISCOVERY_RATE; 35 36 @Argument(fullName = INITIAL_THRESHOLD_LONG_NAME, optional = true, doc = "Initial artifact probability threshold used in first iteration") 37 public double initialPosteriorThreshold = DEFAULT_INITIAL_POSTERIOR_THRESHOLD; 38 39 /** 40 * Mitochondria mode excludes the filters {@link ClusteredEventsFilter}, {@link MultiallelicFilter}, {@link PolymeraseSlippageFilter}, 41 * {@link FilteredHaplotypeFilter}, {@link FragmentLengthFilter}, and {@link GermlineFilter} 42 */ 43 @Argument(fullName = M2ArgumentCollection.MITOCHONDRIA_MODE_LONG_NAME, optional = true, doc = "Set filters to mitochondrial defaults") 44 public boolean mitochondria = false; 45 46 47 /** 48 * Hard filter thresholds 49 */ 50 public static final String MAX_EVENTS_IN_REGION_LONG_NAME = "max-events-in-region"; 51 public static final String MAX_ALT_ALLELE_COUNT_LONG_NAME = "max-alt-allele-count"; 52 public static final String UNIQUE_ALT_READ_COUNT_LONG_NAME = "unique-alt-read-count"; 53 public static final String MIN_MEDIAN_MAPPING_QUALITY_LONG_NAME = "min-median-mapping-quality"; 54 public static final String MIN_MEDIAN_BASE_QUALITY_LONG_NAME = "min-median-base-quality"; 55 public static final String MAX_MEDIAN_FRAGMENT_LENGTH_DIFFERENCE_LONG_NAME = "max-median-fragment-length-difference"; 56 public static final String MIN_MEDIAN_READ_POSITION_LONG_NAME = "min-median-read-position"; 57 public static final String MAX_N_RATIO_LONG_NAME = "max-n-ratio"; 58 public static final String MIN_READS_ON_EACH_STRAND_LONG_NAME = "min-reads-per-strand"; 59 public static final String MIN_AF_LONG_NAME = "min-allele-fraction"; 60 61 private static final int DEFAULT_MAX_EVENTS_IN_REGION = 2; 62 private static final int DEFAULT_MAX_ALT_ALLELES = 1; 63 private static final int DEFAULT_MIN_UNIQUE_ALT_READS = 0; 64 private static final int DEFAULT_MIN_MEDIAN_MAPPING_QUALITY = 30; 65 private static final int DEFAULT_MIN_MEDIAN_BASE_QUALITY = 20; 66 private static final int DEFAULT_MAX_MEDIAN_FRAGMENT_LENGTH_DIFFERENCE = 10000; 67 private static final int DEFAULT_MIN_MEDIAN_READ_POSITION = 1; 68 private static final double DEFAULT_MAX_N_RATIO = Double.POSITIVE_INFINITY; 69 private static final int DEFAULT_MIN_READS_ON_EACH_STRAND = 0; 70 private static final double DEFAULT_MAX_NUMT_FRACTION = 0.85; 71 private static final double DEFAULT_MIN_AF = 0; 72 73 @Argument(fullName = MAX_EVENTS_IN_REGION_LONG_NAME, optional = true, doc = "Maximum events in a single assembly region. Filter all variants if exceeded.") 74 public int maxEventsInRegion = DEFAULT_MAX_EVENTS_IN_REGION; 75 76 @Argument(fullName = MAX_ALT_ALLELE_COUNT_LONG_NAME, optional = true, doc = "Maximum alt alleles per site.") 77 public int numAltAllelesThreshold = DEFAULT_MAX_ALT_ALLELES; 78 79 @Argument(fullName = UNIQUE_ALT_READ_COUNT_LONG_NAME, shortName = "unique", optional = true, doc = "Minimum unique (i.e. deduplicated) reads supporting the alternate allele") 80 public int uniqueAltReadCount = DEFAULT_MIN_UNIQUE_ALT_READS; 81 82 @Argument(fullName = MIN_MEDIAN_MAPPING_QUALITY_LONG_NAME, optional = true, doc="Minimum median mapping quality of alt reads") 83 public int minMedianMappingQuality = DEFAULT_MIN_MEDIAN_MAPPING_QUALITY; 84 85 @Argument(fullName = MIN_MEDIAN_BASE_QUALITY_LONG_NAME, optional = true, doc="Minimum median base quality of alt reads") 86 public int minMedianBaseQuality = DEFAULT_MIN_MEDIAN_BASE_QUALITY; 87 88 @Argument(fullName = MAX_MEDIAN_FRAGMENT_LENGTH_DIFFERENCE_LONG_NAME, optional = true, doc="Maximum difference between median alt and ref fragment lengths") 89 public int maxMedianFragmentLengthDifference = DEFAULT_MAX_MEDIAN_FRAGMENT_LENGTH_DIFFERENCE; 90 91 @Argument(fullName = MIN_MEDIAN_READ_POSITION_LONG_NAME, optional = true, doc = "Minimum median distance of variants from the end of reads") 92 public int minMedianReadPosition = DEFAULT_MIN_MEDIAN_READ_POSITION; 93 94 @Argument(fullName = MAX_N_RATIO_LONG_NAME, optional = true, doc = "Maximum fraction of non-ref bases in the pileup that are N (unknown)") 95 public double nRatio = DEFAULT_MAX_N_RATIO; 96 97 @Argument(fullName = MIN_READS_ON_EACH_STRAND_LONG_NAME, optional = true, doc = "Minimum alt reads required on both forward and reverse strands") 98 public int minReadsOnEachStrand = DEFAULT_MIN_READS_ON_EACH_STRAND; 99 100 @Argument(fullName = MIN_AF_LONG_NAME, doc="Minimum allele fraction required", optional = true) 101 public double minAf = DEFAULT_MIN_AF; 102 103 104 /** 105 * Input files and values to use if inputs are missing 106 */ 107 public static final String CONTAMINATION_TABLE_LONG_NAME = "contamination-table"; 108 public static final String CONTAMINATION_ESTIMATE_LONG_NAME = "contamination-estimate"; 109 public static final String TUMOR_SEGMENTATION_LONG_NAME = "tumor-segmentation"; 110 public static final String ARTIFACT_PRIOR_TABLE_NAME = "orientation-bias-artifact-priors"; 111 public static final String ARTIFACT_PRIOR_TABLE_SHORT_NAME = "ob-priors"; 112 113 private static final double DEFAULT_CONTAMINATION = 0.0; 114 115 @Argument(fullName = CONTAMINATION_TABLE_LONG_NAME, optional = true, doc = "Tables containing contamination information.") 116 public List<File> contaminationTables = new ArrayList<>(); 117 118 @Argument(fullName = CONTAMINATION_ESTIMATE_LONG_NAME, optional = true, doc = "Estimate of contamination.") 119 public double contaminationEstimate = DEFAULT_CONTAMINATION; 120 121 @Argument(fullName = TUMOR_SEGMENTATION_LONG_NAME, doc="Tables containing tumor segments' minor allele fractions for germline hets emitted by CalculateContamination", optional = true) 122 public List<File> tumorSegmentationTables = new ArrayList<>(); 123 124 @Argument(fullName = ARTIFACT_PRIOR_TABLE_NAME, shortName = ARTIFACT_PRIOR_TABLE_SHORT_NAME, optional = true, doc = "One or more .tar.gz files containing tables of prior artifact probabilities for the read orientation filter model, one table per tumor sample") 125 public List<File> readOrientationPriorTarGzs = new ArrayList<>(); 126 127 128 /** 129 * Parameters 130 */ 131 public static final String LOG_SNV_PRIOR_LONG_NAME = "log-snv-prior"; 132 public static final String LOG_INDEL_PRIOR_LONG_NAME = "log-indel-prior"; 133 public static final String LOG_ARTIFACT_VERSUS_VARIANT_PRIOR_LONG_NAME = "log-artifact-prior"; 134 public static final String NORMAL_P_VALUE_THRESHOLD_LONG_NAME = "normal-p-value-threshold"; 135 public static final String MIN_POLYMERASE_SLIPPAGE_LENGTH = "min-slippage-length"; 136 public static final String PCR_SLIPPAGE_RATE_LONG_NAME = "pcr-slippage-rate"; 137 public static final String MAX_DISTANCE_TO_FILTERED_CALL_ON_SAME_HAPLOTYPE_LONG_NAME = "distance-on-haplotype"; 138 public static final String LONG_INDEL_LENGTH_LONG_NAME = "long-indel-length"; 139 140 private static final double DEFAULT_LOG_SNV_PRIOR = MathUtils.log10ToLog(-6); 141 private static final double DEFAULT_LOG_INDEL_PRIOR = MathUtils.log10ToLog(-7); 142 // Mitochondria defaults from a back of the envelope calculation. This assumes ~3 indels and ~50 snps in the 16kb 143 // mitochondria, which is a reasonable assumption for some haplogroups. 144 private static final double DEFAULT_LOG_SNV_PRIOR_FOR_MITO = MathUtils.log10ToLog(-2.5); 145 private static final double DEFAULT_LOG_INDEL_PRIOR_FOR_MITO = MathUtils.log10ToLog(-3.75); 146 private static final double DEFAULT_INITIAL_LOG_PRIOR_OF_VARIANT_VERSUS_ARTIFACT = MathUtils.log10ToLog(-1); 147 private static final double DEFAULT_NORMAL_P_VALUE_THRESHOLD = 0.001; 148 private static final int DEFAULT_MIN_SLIPPAGE_LENGTH = 8; 149 private static final double DEFAULT_SLIPPAGE_RATE = 0.1; 150 private static final int DEFAULT_MAX_INTRA_HAPLOTYPE_DISTANCE = 100; 151 private static final int DEFAULT_LONG_INDEL_SIZE = 5; 152 153 @Argument(fullName= LOG_SNV_PRIOR_LONG_NAME, doc="Initial ln prior probability that a site has a somatic SNV", optional = true) 154 public double logSNVPrior = DEFAULT_LOG_SNV_PRIOR; 155 getLogSnvPrior()156 public double getLogSnvPrior() { 157 return mitochondria && logSNVPrior == DEFAULT_LOG_SNV_PRIOR ? DEFAULT_LOG_SNV_PRIOR_FOR_MITO : logSNVPrior; 158 } 159 160 @Argument(fullName= LOG_INDEL_PRIOR_LONG_NAME, doc="Initial ln prior probability that a site has a somatic indel", optional = true) 161 public double logIndelPrior = DEFAULT_LOG_INDEL_PRIOR; 162 getLogIndelPrior()163 public double getLogIndelPrior() { 164 return mitochondria && logIndelPrior == DEFAULT_LOG_INDEL_PRIOR ? DEFAULT_LOG_INDEL_PRIOR_FOR_MITO : logIndelPrior; 165 } 166 167 @Argument(fullName= LOG_ARTIFACT_VERSUS_VARIANT_PRIOR_LONG_NAME, doc="Initial ln prior probability that a called site is not a technical artifact", optional = true) 168 public double initialLogPriorOfVariantVersusArtifact = DEFAULT_INITIAL_LOG_PRIOR_OF_VARIANT_VERSUS_ARTIFACT; 169 170 @Argument(fullName = NORMAL_P_VALUE_THRESHOLD_LONG_NAME, optional = true, doc = "P value threshold for normal artifact filter") 171 public double normalPileupPValueThreshold = DEFAULT_NORMAL_P_VALUE_THRESHOLD; 172 173 @Argument(fullName = MIN_POLYMERASE_SLIPPAGE_LENGTH, optional = true, doc = "Minimum number of reference bases in an STR to suspect polymerase slippage") 174 public int minSlippageLength = DEFAULT_MIN_SLIPPAGE_LENGTH; 175 176 @Argument(fullName = PCR_SLIPPAGE_RATE_LONG_NAME, optional = true, doc = "The frequency of polymerase slippage in contexts where it is suspected") 177 public double slippageRate = DEFAULT_SLIPPAGE_RATE; 178 179 @Argument(fullName = MAX_DISTANCE_TO_FILTERED_CALL_ON_SAME_HAPLOTYPE_LONG_NAME, optional = true, doc = "On second filtering pass, variants with same PGT and PID tags as a filtered variant within this distance are filtered.") 180 public int maxDistanceToFilteredCallOnSameHaplotype = DEFAULT_MAX_INTRA_HAPLOTYPE_DISTANCE; 181 182 @Argument(fullName = LONG_INDEL_LENGTH_LONG_NAME, optional = true, doc = "Indels of this length or greater are treated specially by the mapping quality filter.") 183 public int longIndelLength = DEFAULT_LONG_INDEL_SIZE; 184 } 185