1 package org.broadinstitute.hellbender.tools.walkers.mutect.filtering;
2 
3 import htsjdk.variant.variantcontext.VariantContext;
4 import htsjdk.variant.variantcontext.writer.VariantContextWriter;
5 import htsjdk.variant.vcf.VCFHeader;
6 import htsjdk.variant.vcf.VCFHeaderLine;
7 import htsjdk.variant.vcf.VCFStandardHeaderLines;
8 import org.broadinstitute.barclay.argparser.Argument;
9 import org.broadinstitute.barclay.argparser.ArgumentCollection;
10 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
11 import org.broadinstitute.barclay.help.DocumentedFeature;
12 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
13 import org.broadinstitute.hellbender.engine.FeatureContext;
14 import org.broadinstitute.hellbender.engine.MultiplePassVariantWalker;
15 import org.broadinstitute.hellbender.engine.ReadsContext;
16 import org.broadinstitute.hellbender.engine.ReferenceContext;
17 import org.broadinstitute.hellbender.exceptions.GATKException;
18 import org.broadinstitute.hellbender.exceptions.UserException;
19 import org.broadinstitute.hellbender.tools.walkers.contamination.CalculateContamination;
20 import org.broadinstitute.hellbender.tools.walkers.mutect.Mutect2;
21 import org.broadinstitute.hellbender.utils.Utils;
22 import org.broadinstitute.hellbender.utils.io.IOUtils;
23 import org.broadinstitute.hellbender.utils.param.ParamUtils;
24 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
25 import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
26 import picard.cmdline.programgroups.VariantFilteringProgramGroup;
27 import org.broadinstitute.hellbender.tools.walkers.readorientation.LearnReadOrientationModel;
28 
29 import java.io.File;
30 import java.nio.file.Path;
31 import java.util.Set;
32 import java.util.stream.Collectors;
33 
34 /**
35  * <p>Filter variants in a Mutect2 VCF callset.</p>
36  *
37  * <p>
38  *     FilterMutectCalls applies filters to the raw output of {@link Mutect2}.
39  *     Parameters are contained in {@link M2FiltersArgumentCollection} and described in
40  *     <a href='https://github.com/broadinstitute/gatk/tree/master/docs/mutect/mutect.pdf' target='_blank'>https://github.com/broadinstitute/gatk/tree/master/docs/mutect/mutect.pdf</a>.
41  *     To filter based on sequence context artifacts, specify the --orientation-bias-artifact-priors [artifact priors tar.gz file] argument
42  *     one or more times.  This input is generated by {@link LearnReadOrientationModel}.
43  * </p>
44  * <p>
45  *     If given a --contamination-table file, e.g. results from {@link CalculateContamination}, the tool will additionally
46  *     filter variants due to contamination. This argument may be specified with a table for one or more tumor samples.
47  *     Alternatively, provide an estimate of the contamination with the --contamination argument.
48  *
49  *     FilterMutectCalls can also be given one or more --tumor-segmentation files, which are also output by {@link CalculateContamination}.
50  * </p>
51  * <p>
52  *     This tool is featured in the Somatic Short Mutation calling Best Practice Workflow.
53  *     See <a href="https://software.broadinstitute.org/gatk/documentation/article?id=11136">Tutorial#11136</a> for a
54  *     step-by-step description of the workflow and <a href="https://software.broadinstitute.org/gatk/documentation/article?id=11127">Article#11127</a>
55  *     for an overview of what traditional somatic calling entails. For the latest pipeline scripts, see the
56  *     <a href="https://github.com/broadinstitute/gatk/tree/master/scripts/mutect2_wdl">Mutect2 WDL scripts directory</a>.
57  * </p>
58  *
59  * <h3>Usage example</h3>
60  * <pre>
61  * gatk FilterMutectCalls \
62  *   -R reference.fasta \
63  *   -V somatic.vcf.gz \
64  *   --contamination-table contamination.table \
65  *   --tumor-segmentation segments.tsv \
66  *   -O filtered.vcf.gz
67  * </pre>
68  *
69  *
70  * When running on unfiltered output of Mutect2 in --mitochondria mode, setting the advanced option --autosomal-coverage
71  * argument (default 0) activates a recommended filter against likely erroneously mapped  <a href="https://en.wikipedia.org/wiki/NUMT">NuMTs (nuclear mitochondrial DNA segments)</a>.
72  * For the value, provide the median coverage expected in autosomal regions with coverage.
73  *
74  */
75 @CommandLineProgramProperties(
76         summary = "Filter somatic SNVs and indels called by Mutect2",
77         oneLineSummary = "Filter somatic SNVs and indels called by Mutect2",
78         programGroup = VariantFilteringProgramGroup.class
79 )
80 @DocumentedFeature
81 public final class FilterMutectCalls extends MultiplePassVariantWalker {
82 
83     public static final String FILTERING_STATS_LONG_NAME = "filtering-stats";
84 
85     public static final String FILTERING_STATUS_VCF_KEY = "filtering_status";
86 
87     public static final String FILTERING_STATS_EXTENSION = ".filteringStats.tsv";
88 
89     @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName =StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
90             doc="The output filtered VCF file", optional=false)
91     private final String outputVcf = null;
92 
93     @Argument(fullName = Mutect2.MUTECT_STATS_SHORT_NAME, doc="The Mutect stats file output by Mutect2", optional=true)
94     private final String statsTable = null;
95 
96     @Argument(fullName = FILTERING_STATS_LONG_NAME, doc="The output filtering stats file", optional=true)
97     private final String filteringStatsOutput = null;
98 
99     @ArgumentCollection
100     protected M2FiltersArgumentCollection MTFAC = new M2FiltersArgumentCollection();
101 
102     private VariantContextWriter vcfWriter;
103 
104     private Mutect2FilteringEngine filteringEngine;
105 
106     private static final int NUMBER_OF_LEARNING_PASSES = 2;
107 
108     @Override
numberOfPasses()109     protected int numberOfPasses() { return NUMBER_OF_LEARNING_PASSES + 2; }    // {@code NUMBER_OF_LEARNING_PASSES} passes for learning, one for the threshold, and one for calling
110 
111     @Override
requiresReference()112     public boolean requiresReference() { return true;}
113 
114     @Override
onTraversalStart()115     public void onTraversalStart() {
116         Utils.resetRandomGenerator();
117         final VCFHeader inputHeader = getHeaderForVariants();
118         final Set<VCFHeaderLine> headerLines = inputHeader.getMetaDataInSortedOrder().stream()
119                 .filter(line -> !line.getKey().equals(FILTERING_STATUS_VCF_KEY)) //remove header line from Mutect2 stating that calls are unfiltered.
120                 .collect(Collectors.toSet());
121         headerLines.add(new VCFHeaderLine(FILTERING_STATUS_VCF_KEY, "These calls have been filtered by " + FilterMutectCalls.class.getSimpleName() + " to label false positives with a list of failed filters and true positives with PASS."));
122 
123         // all possible filters, even allele specific (since they can apply to the site as well
124         GATKVCFConstants.MUTECT_FILTER_NAMES.stream().map(GATKVCFHeaderLines::getFilterLine).forEach(headerLines::add);
125 
126         // these are the possible allele specific filters which will be in the INFO section
127         // when all relevant alleles (non-symbolic, etc) are filtered, the filter will be applied to the site level filter also
128         GATKVCFConstants.MUTECT_AS_FILTER_NAMES.stream().map(GATKVCFHeaderLines::getInfoLine).forEach(headerLines::add);
129 
130         headerLines.addAll(getDefaultToolVCFHeaderLines());
131 
132         final VCFHeader vcfHeader = new VCFHeader(headerLines, inputHeader.getGenotypeSamples());
133         vcfWriter = createVCFWriter(new File(outputVcf));
134         vcfWriter.writeHeader(vcfHeader);
135 
136 
137         final File mutect2StatsTable = new File(statsTable == null ? drivingVariantFile + Mutect2.DEFAULT_STATS_EXTENSION : statsTable);
138         filteringEngine = new Mutect2FilteringEngine(MTFAC, vcfHeader, mutect2StatsTable);
139         if (!mutect2StatsTable.exists()) {
140             throw new UserException.CouldNotReadInputFile("Mutect stats table " + mutect2StatsTable + " not found.  When Mutect2 outputs a file calls.vcf it also creates" +
141                     " a calls.vcf" + Mutect2.DEFAULT_STATS_EXTENSION + " file.  Perhaps this file was not moved along with the vcf, or perhaps it was not delocalized from a" +
142                     " virtual machine while running in the cloud." );
143         }
144     }
145 
146     @Override
nthPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext, final int n)147     protected void nthPassApply(final VariantContext variant,
148                                 final ReadsContext readsContext,
149                                 final ReferenceContext referenceContext,
150                                 final FeatureContext featureContext,
151                                 final int n) {
152         ParamUtils.isPositiveOrZero(n, "Passes must start at the 0th pass.");
153         if (n <= NUMBER_OF_LEARNING_PASSES) {
154             filteringEngine.accumulateData(variant, referenceContext);
155         } else if (n == NUMBER_OF_LEARNING_PASSES + 1) {
156             vcfWriter.add(filteringEngine.applyFiltersAndAccumulateOutputStats(variant, referenceContext));
157         } else {
158             throw new GATKException.ShouldNeverReachHereException("This walker should never reach (zero-indexed) pass " + n);
159         }
160     }
161 
162     @Override
afterNthPass(final int n)163     protected void afterNthPass(final int n) {
164         if (n < NUMBER_OF_LEARNING_PASSES) {
165             filteringEngine.learnParameters();
166         } else if (n == NUMBER_OF_LEARNING_PASSES) {
167             // it's important for filter parameters to stay the same and only learn the threshold in the final pass so that the
168             // final threshold used corresponds exactly to the filters
169             filteringEngine.learnThreshold();
170         }else if (n == NUMBER_OF_LEARNING_PASSES + 1) {
171             final Path filteringStats = IOUtils.getPath(filteringStatsOutput != null ? filteringStatsOutput : outputVcf + FILTERING_STATS_EXTENSION);
172             filteringEngine.writeFilteringStats(filteringStats);
173         } else {
174             throw new GATKException.ShouldNeverReachHereException("This walker should never reach (zero-indexed) pass " + n);
175         }
176     }
177 
178     @Override
closeTool()179     public void closeTool() {
180         if ( vcfWriter != null ) {
181             vcfWriter.close();
182         }
183     }
184 
185 }
186