1 package org.broadinstitute.hellbender.tools.walkers.mutect.filtering; 2 3 import htsjdk.variant.variantcontext.VariantContext; 4 import htsjdk.variant.variantcontext.writer.VariantContextWriter; 5 import htsjdk.variant.vcf.VCFHeader; 6 import htsjdk.variant.vcf.VCFHeaderLine; 7 import htsjdk.variant.vcf.VCFStandardHeaderLines; 8 import org.broadinstitute.barclay.argparser.Argument; 9 import org.broadinstitute.barclay.argparser.ArgumentCollection; 10 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; 11 import org.broadinstitute.barclay.help.DocumentedFeature; 12 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; 13 import org.broadinstitute.hellbender.engine.FeatureContext; 14 import org.broadinstitute.hellbender.engine.MultiplePassVariantWalker; 15 import org.broadinstitute.hellbender.engine.ReadsContext; 16 import org.broadinstitute.hellbender.engine.ReferenceContext; 17 import org.broadinstitute.hellbender.exceptions.GATKException; 18 import org.broadinstitute.hellbender.exceptions.UserException; 19 import org.broadinstitute.hellbender.tools.walkers.contamination.CalculateContamination; 20 import org.broadinstitute.hellbender.tools.walkers.mutect.Mutect2; 21 import org.broadinstitute.hellbender.utils.Utils; 22 import org.broadinstitute.hellbender.utils.io.IOUtils; 23 import org.broadinstitute.hellbender.utils.param.ParamUtils; 24 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; 25 import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; 26 import picard.cmdline.programgroups.VariantFilteringProgramGroup; 27 import org.broadinstitute.hellbender.tools.walkers.readorientation.LearnReadOrientationModel; 28 29 import java.io.File; 30 import java.nio.file.Path; 31 import java.util.Set; 32 import java.util.stream.Collectors; 33 34 /** 35 * <p>Filter variants in a Mutect2 VCF callset.</p> 36 * 37 * <p> 38 * FilterMutectCalls applies filters to the raw output of {@link Mutect2}. 39 * Parameters are contained in {@link M2FiltersArgumentCollection} and described in 40 * <a href='https://github.com/broadinstitute/gatk/tree/master/docs/mutect/mutect.pdf' target='_blank'>https://github.com/broadinstitute/gatk/tree/master/docs/mutect/mutect.pdf</a>. 41 * To filter based on sequence context artifacts, specify the --orientation-bias-artifact-priors [artifact priors tar.gz file] argument 42 * one or more times. This input is generated by {@link LearnReadOrientationModel}. 43 * </p> 44 * <p> 45 * If given a --contamination-table file, e.g. results from {@link CalculateContamination}, the tool will additionally 46 * filter variants due to contamination. This argument may be specified with a table for one or more tumor samples. 47 * Alternatively, provide an estimate of the contamination with the --contamination argument. 48 * 49 * FilterMutectCalls can also be given one or more --tumor-segmentation files, which are also output by {@link CalculateContamination}. 50 * </p> 51 * <p> 52 * This tool is featured in the Somatic Short Mutation calling Best Practice Workflow. 53 * See <a href="https://software.broadinstitute.org/gatk/documentation/article?id=11136">Tutorial#11136</a> for a 54 * step-by-step description of the workflow and <a href="https://software.broadinstitute.org/gatk/documentation/article?id=11127">Article#11127</a> 55 * for an overview of what traditional somatic calling entails. For the latest pipeline scripts, see the 56 * <a href="https://github.com/broadinstitute/gatk/tree/master/scripts/mutect2_wdl">Mutect2 WDL scripts directory</a>. 57 * </p> 58 * 59 * <h3>Usage example</h3> 60 * <pre> 61 * gatk FilterMutectCalls \ 62 * -R reference.fasta \ 63 * -V somatic.vcf.gz \ 64 * --contamination-table contamination.table \ 65 * --tumor-segmentation segments.tsv \ 66 * -O filtered.vcf.gz 67 * </pre> 68 * 69 * 70 * When running on unfiltered output of Mutect2 in --mitochondria mode, setting the advanced option --autosomal-coverage 71 * argument (default 0) activates a recommended filter against likely erroneously mapped <a href="https://en.wikipedia.org/wiki/NUMT">NuMTs (nuclear mitochondrial DNA segments)</a>. 72 * For the value, provide the median coverage expected in autosomal regions with coverage. 73 * 74 */ 75 @CommandLineProgramProperties( 76 summary = "Filter somatic SNVs and indels called by Mutect2", 77 oneLineSummary = "Filter somatic SNVs and indels called by Mutect2", 78 programGroup = VariantFilteringProgramGroup.class 79 ) 80 @DocumentedFeature 81 public final class FilterMutectCalls extends MultiplePassVariantWalker { 82 83 public static final String FILTERING_STATS_LONG_NAME = "filtering-stats"; 84 85 public static final String FILTERING_STATUS_VCF_KEY = "filtering_status"; 86 87 public static final String FILTERING_STATS_EXTENSION = ".filteringStats.tsv"; 88 89 @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName =StandardArgumentDefinitions.OUTPUT_SHORT_NAME, 90 doc="The output filtered VCF file", optional=false) 91 private final String outputVcf = null; 92 93 @Argument(fullName = Mutect2.MUTECT_STATS_SHORT_NAME, doc="The Mutect stats file output by Mutect2", optional=true) 94 private final String statsTable = null; 95 96 @Argument(fullName = FILTERING_STATS_LONG_NAME, doc="The output filtering stats file", optional=true) 97 private final String filteringStatsOutput = null; 98 99 @ArgumentCollection 100 protected M2FiltersArgumentCollection MTFAC = new M2FiltersArgumentCollection(); 101 102 private VariantContextWriter vcfWriter; 103 104 private Mutect2FilteringEngine filteringEngine; 105 106 private static final int NUMBER_OF_LEARNING_PASSES = 2; 107 108 @Override numberOfPasses()109 protected int numberOfPasses() { return NUMBER_OF_LEARNING_PASSES + 2; } // {@code NUMBER_OF_LEARNING_PASSES} passes for learning, one for the threshold, and one for calling 110 111 @Override requiresReference()112 public boolean requiresReference() { return true;} 113 114 @Override onTraversalStart()115 public void onTraversalStart() { 116 Utils.resetRandomGenerator(); 117 final VCFHeader inputHeader = getHeaderForVariants(); 118 final Set<VCFHeaderLine> headerLines = inputHeader.getMetaDataInSortedOrder().stream() 119 .filter(line -> !line.getKey().equals(FILTERING_STATUS_VCF_KEY)) //remove header line from Mutect2 stating that calls are unfiltered. 120 .collect(Collectors.toSet()); 121 headerLines.add(new VCFHeaderLine(FILTERING_STATUS_VCF_KEY, "These calls have been filtered by " + FilterMutectCalls.class.getSimpleName() + " to label false positives with a list of failed filters and true positives with PASS.")); 122 123 // all possible filters, even allele specific (since they can apply to the site as well 124 GATKVCFConstants.MUTECT_FILTER_NAMES.stream().map(GATKVCFHeaderLines::getFilterLine).forEach(headerLines::add); 125 126 // these are the possible allele specific filters which will be in the INFO section 127 // when all relevant alleles (non-symbolic, etc) are filtered, the filter will be applied to the site level filter also 128 GATKVCFConstants.MUTECT_AS_FILTER_NAMES.stream().map(GATKVCFHeaderLines::getInfoLine).forEach(headerLines::add); 129 130 headerLines.addAll(getDefaultToolVCFHeaderLines()); 131 132 final VCFHeader vcfHeader = new VCFHeader(headerLines, inputHeader.getGenotypeSamples()); 133 vcfWriter = createVCFWriter(new File(outputVcf)); 134 vcfWriter.writeHeader(vcfHeader); 135 136 137 final File mutect2StatsTable = new File(statsTable == null ? drivingVariantFile + Mutect2.DEFAULT_STATS_EXTENSION : statsTable); 138 filteringEngine = new Mutect2FilteringEngine(MTFAC, vcfHeader, mutect2StatsTable); 139 if (!mutect2StatsTable.exists()) { 140 throw new UserException.CouldNotReadInputFile("Mutect stats table " + mutect2StatsTable + " not found. When Mutect2 outputs a file calls.vcf it also creates" + 141 " a calls.vcf" + Mutect2.DEFAULT_STATS_EXTENSION + " file. Perhaps this file was not moved along with the vcf, or perhaps it was not delocalized from a" + 142 " virtual machine while running in the cloud." ); 143 } 144 } 145 146 @Override nthPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext, final int n)147 protected void nthPassApply(final VariantContext variant, 148 final ReadsContext readsContext, 149 final ReferenceContext referenceContext, 150 final FeatureContext featureContext, 151 final int n) { 152 ParamUtils.isPositiveOrZero(n, "Passes must start at the 0th pass."); 153 if (n <= NUMBER_OF_LEARNING_PASSES) { 154 filteringEngine.accumulateData(variant, referenceContext); 155 } else if (n == NUMBER_OF_LEARNING_PASSES + 1) { 156 vcfWriter.add(filteringEngine.applyFiltersAndAccumulateOutputStats(variant, referenceContext)); 157 } else { 158 throw new GATKException.ShouldNeverReachHereException("This walker should never reach (zero-indexed) pass " + n); 159 } 160 } 161 162 @Override afterNthPass(final int n)163 protected void afterNthPass(final int n) { 164 if (n < NUMBER_OF_LEARNING_PASSES) { 165 filteringEngine.learnParameters(); 166 } else if (n == NUMBER_OF_LEARNING_PASSES) { 167 // it's important for filter parameters to stay the same and only learn the threshold in the final pass so that the 168 // final threshold used corresponds exactly to the filters 169 filteringEngine.learnThreshold(); 170 }else if (n == NUMBER_OF_LEARNING_PASSES + 1) { 171 final Path filteringStats = IOUtils.getPath(filteringStatsOutput != null ? filteringStatsOutput : outputVcf + FILTERING_STATS_EXTENSION); 172 filteringEngine.writeFilteringStats(filteringStats); 173 } else { 174 throw new GATKException.ShouldNeverReachHereException("This walker should never reach (zero-indexed) pass " + n); 175 } 176 } 177 178 @Override closeTool()179 public void closeTool() { 180 if ( vcfWriter != null ) { 181 vcfWriter.close(); 182 } 183 } 184 185 } 186