1 package org.broadinstitute.hellbender.tools.walkers.mutect;
2 
3 import com.google.common.collect.ImmutableSet;
4 import htsjdk.variant.variantcontext.Allele;
5 import htsjdk.variant.variantcontext.Genotype;
6 import htsjdk.variant.variantcontext.VariantContext;
7 import htsjdk.variant.vcf.VCFConstants;
8 import htsjdk.variant.vcf.VCFHeader;
9 import org.apache.commons.collections4.SetUtils;
10 import org.apache.commons.lang3.tuple.Pair;
11 import org.broadinstitute.hellbender.CommandLineProgramTest;
12 import org.broadinstitute.hellbender.Main;
13 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
14 import org.broadinstitute.hellbender.cmdline.argumentcollections.IntervalArgumentCollection;
15 import org.broadinstitute.hellbender.engine.FeatureDataSource;
16 import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
17 import org.broadinstitute.hellbender.testutils.VariantContextTestUtils;
18 import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils;
19 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyBasedCallerArgumentCollection;
20 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.ReadThreadingAssemblerArgumentCollection;
21 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.ReferenceConfidenceMode;
22 import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.FilterMutectCalls;
23 import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.M2FiltersArgumentCollection;
24 import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.NuMTFilterTool;
25 import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.ReadOrientationFilter;
26 import org.broadinstitute.hellbender.tools.walkers.readorientation.LearnReadOrientationModel;
27 import org.broadinstitute.hellbender.tools.walkers.validation.Concordance;
28 import org.broadinstitute.hellbender.tools.walkers.validation.ConcordanceSummaryRecord;
29 import org.broadinstitute.hellbender.tools.walkers.variantutils.ValidateVariants;
30 import org.broadinstitute.hellbender.utils.*;
31 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
32 import org.broadinstitute.hellbender.utils.variant.VariantContextGetters;
33 import org.testng.Assert;
34 import org.testng.annotations.DataProvider;
35 import org.testng.annotations.Test;
36 
37 import java.io.File;
38 import java.io.IOException;
39 import java.nio.file.Files;
40 import java.util.*;
41 import java.util.function.Function;
42 import java.util.stream.Collectors;
43 
44 /**
45  * Created by davidben on 9/1/16.
46  */
47 @Test(groups = {"variantcalling"})
48 public class Mutect2IntegrationTest extends CommandLineProgramTest {
49     // positions 10,000,000 - 11,000,000 of chr 20 and with most annotations removed
50     private static final File GNOMAD = new File(largeFileTestDir, "very-small-gnomad.vcf");
51 
52     private static final String DREAM_BAMS_DIR = largeFileTestDir + "mutect/dream_synthetic_bams/";
53     private static final File DREAM_4_NORMAL = new File(DREAM_BAMS_DIR, "normal_4.bam");
54     private static final File DREAM_3_NORMAL = new File(DREAM_BAMS_DIR, "normal_3.bam");
55     private static final File DREAM_4_TUMOR = new File(DREAM_BAMS_DIR, "tumor_4.bam");
56     private static final File DREAM_3_TUMOR = new File(DREAM_BAMS_DIR, "tumor_3.bam");
57     private static final File DREAM_2_NORMAL = new File(DREAM_BAMS_DIR, "normal_2.bam");
58     private static final File DREAM_1_NORMAL = new File(DREAM_BAMS_DIR, "normal_1.bam");
59     private static final File DREAM_2_TUMOR = new File(DREAM_BAMS_DIR, "tumor_2.bam");
60     private static final File DREAM_1_TUMOR = new File(DREAM_BAMS_DIR, "tumor_1.bam");
61 
62     private static final String DREAM_VCFS_DIR = toolsTestDir + "mutect/dream/vcfs/";
63     private static final File DREAM_4_TRUTH = new File(DREAM_VCFS_DIR, "sample_4.vcf");
64     private static final File DREAM_3_TRUTH = new File(DREAM_VCFS_DIR, "sample_3.vcf");
65     private static final File DREAM_2_TRUTH = new File(DREAM_VCFS_DIR, "sample_2.vcf");
66     private static final File DREAM_1_TRUTH = new File(DREAM_VCFS_DIR, "sample_1.vcf");
67 
68     private static final String DREAM_MASKS_DIR = toolsTestDir + "mutect/dream/masks/";
69     private static final File DREAM_4_MASK = new File(DREAM_MASKS_DIR, "mask4.list");
70     private static final File DREAM_3_MASK = new File(DREAM_MASKS_DIR, "mask3.list");
71     private static final File DREAM_2_MASK = new File(DREAM_MASKS_DIR, "mask2.list");
72     private static final File DREAM_1_MASK = new File(DREAM_MASKS_DIR, "mask1.list");
73 
74     private static final File NO_CONTAMINATION_TABLE = new File(toolsTestDir, "mutect/no-contamination.table");
75     private static final File FIVE_PCT_CONTAMINATION_TABLE = new File(toolsTestDir, "mutect/five-pct-contamination.table");
76     private static final File TEN_PCT_CONTAMINATION_TABLE = new File(toolsTestDir, "mutect/ten-pct-contamination.table");
77 
78     private static final File NA12878_MITO_BAM = new File(toolsTestDir, "mutect/mito/NA12878.bam");
79     private static final File NA12878_MITO_VCF = new File(toolsTestDir, "mutect/mito/unfiltered-with-assb.vcf");
80     private static final File NA12878_MITO_GVCF = new File(toolsTestDir, "mitochondria/NA12878.MT.g.vcf");
81     private static final File NA12878_MITO_INITIAL_FILTERED_VCF = new File(toolsTestDir, "mutect/mito/initialFiltered.vcf");
82     private static final File MITO_REF = new File(toolsTestDir, "mutect/mito/Homo_sapiens_assembly38.mt_only.fasta");
83     private static final File DEEP_MITO_BAM = new File(largeFileTestDir, "mutect/highDPMTsnippet.bam");
84     private static final String DEEP_MITO_SAMPLE_NAME = "mixture";
85 
86     private static final File FILTERING_DIR = new File(toolsTestDir, "mutect/filtering");
87 
88     private static final File GNOMAD_WITHOUT_AF_SNIPPET = new File(toolsTestDir, "mutect/gnomad-without-af.vcf");
89 
90     private static final double TLOD_MATCH_EPSILON = 0.05;
91     private static final double VARIANT_TLOD_MATCH_PCT = 0.01;
92 
93     private static final String CHROMOSOME_20 = "20";
94 
95     // tumor bams, normal bams, truth vcf, mask, required sensitivity, whether to error-correct
96     @DataProvider(name = "dreamSyntheticData")
dreamSyntheticData()97     public Object[][] dreamSyntheticData() {
98         return new Object[][]{
99                 {DREAM_1_TUMOR, Optional.of(DREAM_1_NORMAL), DREAM_1_TRUTH, DREAM_1_MASK, 0.97, false},
100                 {DREAM_2_TUMOR, Optional.of(DREAM_2_NORMAL), DREAM_2_TRUTH, DREAM_2_MASK, 0.95, false},
101                 {DREAM_2_TUMOR, Optional.empty(), DREAM_2_TRUTH, DREAM_2_MASK, 0.95, false},
102                 {DREAM_2_TUMOR, Optional.empty(), DREAM_2_TRUTH, DREAM_2_MASK, 0.95, true},
103                 {DREAM_3_TUMOR, Optional.of(DREAM_3_NORMAL), DREAM_3_TRUTH, DREAM_3_MASK, 0.90, false},
104                 {DREAM_4_TUMOR, Optional.of(DREAM_4_NORMAL), DREAM_4_TRUTH, DREAM_4_MASK, 0.65, false},
105                 {DREAM_4_TUMOR, Optional.of(DREAM_4_NORMAL), DREAM_4_TRUTH, DREAM_4_MASK, 0.65, true},
106                 {DREAM_4_TUMOR, Optional.empty(), DREAM_4_TRUTH, DREAM_4_MASK, 0.65, false},
107         };
108     }
109 
110     /**
111      * Several DREAM challenge bams with synthetic truth data.  In order to keep file sizes manageable, bams are restricted
112      * to chromosome 20, leaving ~100-200 variants, and then further restricted to 400-bp intervals centered around
113      * these variants.
114      * <p>
115      * Because this truth data is synthetic, ideal performance is perfect concordance with the truth vcfs.
116      * <p>
117      * The characteristics of the data are as follows (note that we removed structural variants from the original
118      * DREAM challenge vcfs):
119      * <p>
120      * Sample 1: pure monoclonal sample, SNVs only
121      * Sample 2: 80% pure monoclonal sample, SNVs only
122      * Sample 3: pure triclonal sample, subclone minor allele frequencies are 1/2, 1/3, and 1/5, SNVs and indels
123      * Sample 4: 80% biclonal sample, subclone minor allele fractions are 50% and 35%, SNVs and indels
124      *
125      */
126     @Test(dataProvider = "dreamSyntheticData")
testDreamTumorNormal(final File tumor, final Optional<File> normal, final File truth, final File mask, final double requiredSensitivity, final boolean errorCorrectReads)127     public void testDreamTumorNormal(final File tumor, final Optional<File> normal, final File truth, final File mask,
128                                      final double requiredSensitivity, final boolean errorCorrectReads) throws Exception {
129         Utils.resetRandomGenerator();
130         final File unfilteredVcf = createTempFile("unfiltered", ".vcf");
131         final File filteredVcf = createTempFile("filtered", ".vcf");
132         final File f1r2Counts = createTempFile("f1r2", ".tar.gz");
133         final File orientationModel = createTempFile("orientation", ".tar.gz");
134 
135         final List<File> normals = normal.isPresent() ? Collections.singletonList(normal.get()) : Collections.emptyList();
136         runMutect2(Collections.singletonList(tumor), normals, unfilteredVcf, CHROMOSOME_20, b37Reference, Optional.of(GNOMAD),
137                 args -> args.addMask(mask).add(M2ArgumentCollection.F1R2_TAR_GZ_NAME, f1r2Counts),
138                 args -> errorCorrectReads ? args.add(ReadThreadingAssemblerArgumentCollection.PILEUP_ERROR_CORRECTION_LOG_ODDS_NAME, 3.0) : args
139         );
140 
141         // verify that alleles contained in likelihoods matrix but dropped from somatic calls do not show up in annotations
142         // also check that alleles have been properly clipped after dropping any non-called alleles, i.e. if we had AAA AA A
143         // and A got dropped, we need AAA AA -> AA A.  The condition we don't want is that all alleles share a common first base
144         // and no allele has length 1.
145         VariantContextTestUtils.streamVcf(unfilteredVcf)
146                 .forEach(vc -> {
147                     for (final Genotype genotype : vc.getGenotypes()) {
148                         final int[] f1r2 = ReadOrientationFilter.getF1R2(genotype);
149                         Assert.assertEquals(f1r2.length, vc.getNAlleles());
150                         if (vc.getAlleles().stream().filter(a -> !a.isSymbolic()).map(a -> a.getBases()[0]).distinct().count() == 1) {
151                             Assert.assertTrue(vc.getAlleles().stream().anyMatch(a -> a.getBases().length == 1));
152                         }
153                     }
154                 });
155 
156         final ArgumentsBuilder orientationBiasArgs = new ArgumentsBuilder().addInput(f1r2Counts).addOutput(orientationModel);
157         runCommandLine(orientationBiasArgs, LearnReadOrientationModel.class.getSimpleName());
158 
159         for (final boolean runOrientationFilter : new boolean[] { true, false}) {
160 
161             runFilterMutectCalls(unfilteredVcf, filteredVcf, b37Reference,
162                     args -> runOrientationFilter ? args.add(M2FiltersArgumentCollection.ARTIFACT_PRIOR_TABLE_NAME, orientationModel) : args);
163 
164             final File concordanceSummary = createTempFile("concordance", ".txt");
165             final File truePositivesFalseNegatives = createTempFile("tpfn", ".vcf");
166             runConcordance(truth, filteredVcf,concordanceSummary, CHROMOSOME_20, mask, Optional.of (truePositivesFalseNegatives));
167 
168             final List<ConcordanceSummaryRecord> summaryRecords = new ConcordanceSummaryRecord.Reader(concordanceSummary.toPath()).toList();
169             summaryRecords.forEach(rec -> {
170                 if (rec.getTruePositives() + rec.getFalseNegatives() > 0) {
171                     Assert.assertTrue(rec.getSensitivity() > requiredSensitivity);
172                     // tumor-only will have germline variants sneak in
173                     if (!normals.isEmpty()) {
174                         Assert.assertTrue(rec.getPrecision() > 0.5);
175                     }
176                 }
177             });
178         }
179     }
180 
181     @Test
testNA12878NormalNormalFiltering()182     public void testNA12878NormalNormalFiltering() {
183         Utils.resetRandomGenerator();
184         final File unfilteredVcf = new File(FILTERING_DIR, "NA12878.vcf");
185         final File contamination = new File(FILTERING_DIR, "contamination.table");
186         final File segments = new File(FILTERING_DIR, "segments.table");
187 
188         final File filteredVcf = createTempFile("filtered", ".vcf");
189 
190         runFilterMutectCalls(unfilteredVcf, filteredVcf, b37Reference,
191                 args -> args.add(M2FiltersArgumentCollection.TUMOR_SEGMENTATION_LONG_NAME, segments),
192                 args -> args.add(M2FiltersArgumentCollection.CONTAMINATION_TABLE_LONG_NAME, contamination));
193 
194         final long numPassVariants = VariantContextTestUtils.streamVcf(filteredVcf)
195                 .filter(vc -> vc.getFilters().isEmpty()).count();
196 
197         Assert.assertTrue(numPassVariants < 10);
198     }
199 
200     // tumorBams, normalBam, truthVcf, mask, requiredSensitivity
201     @DataProvider(name = "twoTumorData")
202     public Object[][] twoTumorData() {
203         return new Object[][]{
204                 {Arrays.asList(DREAM_1_TUMOR, DREAM_2_TUMOR), Collections.singletonList(DREAM_1_NORMAL), DREAM_1_TRUTH, DREAM_1_MASK, 0.97},
205                 {Arrays.asList(DREAM_3_TUMOR, DREAM_4_TUMOR), Collections.singletonList(DREAM_3_NORMAL), DREAM_3_TRUTH, DREAM_3_MASK, 0.90}
206         };
207     }
208 
209     @Test(dataProvider = "twoTumorData")
210     public void testTwoDreamTumorSamples(final List<File> tumors, final List<File> normals,
211                                          final File truth, final File mask, final double requiredSensitivity) throws Exception {
212         Utils.resetRandomGenerator();
213         final File unfilteredVcf = createTempFile("unfiltered", ".vcf");
214         final File filteredVcf = createTempFile("filtered", ".vcf");
215 
216         runMutect2(tumors, normals, unfilteredVcf, CHROMOSOME_20, b37Reference, Optional.of(GNOMAD), args -> args.addMask(mask));
217         runFilterMutectCalls(unfilteredVcf, filteredVcf, b37Reference);
218 
219         final File concordanceSummary = createTempFile("concordance", ".txt");
220         runConcordance(truth, filteredVcf, concordanceSummary, CHROMOSOME_20, mask, Optional.empty());
221 
222         final List<ConcordanceSummaryRecord> summaryRecords = new ConcordanceSummaryRecord.Reader(concordanceSummary.toPath()).toList();
223         summaryRecords.forEach(rec -> {
224             if (rec.getTruePositives() + rec.getFalseNegatives() > 0) {
225                 Assert.assertTrue(rec.getSensitivity() > requiredSensitivity);
226                 // tumor-only will have germline variants sneak in
227                 if (!normals.isEmpty()) {
228                     //Assert.assertTrue(rec.getPrecision() > 0.5);
229                 }
230             }
231         });
232     }
233 
234     // make a pon with a tumor and then use this pon to call somatic variants on the same tumor
235     // if the pon is doing its job all calls should be filtered by this pon
236     @Test
testPon()237     public void testPon() {
238         Utils.resetRandomGenerator();
239         final File tumor = DREAM_1_TUMOR;
240         final File normal = DREAM_1_NORMAL;
241         final File ponVcf = createTempFile("pon", ".vcf");
242         final File unfilteredVcf = createTempFile("unfiltered", ".vcf");
243         final File filteredVcf = createTempFile("filtered", ".vcf");
244 
245         runMutect2(tumor, normal, ponVcf, CHROMOSOME_20, b37Reference, Optional.empty());
246         runMutect2(tumor, normal, unfilteredVcf, CHROMOSOME_20, b37Reference, Optional.empty(),
247                 args -> args.add(M2ArgumentCollection.PANEL_OF_NORMALS_LONG_NAME, ponVcf));
248         runFilterMutectCalls(unfilteredVcf, filteredVcf, b37Reference);
249 
250         final long numVariants = VariantContextTestUtils.streamVcf(filteredVcf)
251                 .filter(vc -> vc.getFilters().isEmpty()).count();
252 
253         Assert.assertEquals(numVariants, 0);
254     }
255 
256     // run tumor-normal mode using the original DREAM synthetic sample 1 tumor and normal restricted to
257     // 1/3 of our dbSNP interval, in which there is only one true positive.
258     // we want to see that the number of false positives is small
259     @Test
testTumorNormal()260     public void testTumorNormal()  {
261         Utils.resetRandomGenerator();
262         final File unfilteredVcf = createTempFile("output", ".vcf");
263         final File filteredVcf = createTempFile("filtered", ".vcf");
264         final List<File> tumor = Collections.singletonList(new File(DREAM_BAMS_DIR, "tumor.bam"));
265         final List<File> normals = Arrays.asList(new File(DREAM_BAMS_DIR, "normal.bam"), DREAM_2_NORMAL);
266 
267         runMutect2(tumor, normals, unfilteredVcf, "20:10000000-10100000", b37Reference, Optional.empty());
268         runFilterMutectCalls(unfilteredVcf, filteredVcf, b37Reference);
269 
270         VariantContextTestUtils.streamVcf(unfilteredVcf).flatMap(vc -> vc.getGenotypes().stream()).forEach(g -> Assert.assertTrue(g.hasAD()));
271         final long numVariants = VariantContextTestUtils.streamVcf(filteredVcf)
272                 .filter(VariantContext::isNotFiltered)
273                 .count();
274         Assert.assertTrue(numVariants < 4);
275     }
276 
277     // run tumor-only using our mini gnomAD on NA12878, which is not a tumor
278     @Test
279     public void testTumorOnly() {
280         Utils.resetRandomGenerator();
281         final File tumor = new File(NA12878_20_21_WGS_bam);
282         final File unfilteredVcf = createTempFile("unfiltered", ".vcf");
283         final File filteredVcf = createTempFile("filtered", ".vcf");
284 
285         runMutect2(tumor, unfilteredVcf, "20:10000000-10010000", b37Reference, Optional.of(GNOMAD));
286         runFilterMutectCalls(unfilteredVcf, filteredVcf, b37Reference);
287 
288         final long numVariantsBeforeFiltering = VariantContextTestUtils.streamVcf(unfilteredVcf).count();
289 
290         final long numVariantsPassingFilters = VariantContextTestUtils.streamVcf(filteredVcf)
291                 .filter(vc -> vc.getFilters().isEmpty()).count();
292 
293         // just a sanity check that this bam has some germline variants on this interval so that our test doesn't pass trivially!
294         Assert.assertTrue(numVariantsBeforeFiltering > 15);
295 
296         // every variant on this interval in this sample is in gnomAD
297         Assert.assertTrue(numVariantsPassingFilters < 2);
298     }
299 
300     // make sure we can call tumor alts when the normal has a different alt at the same site
301     // regression test for https://github.com/broadinstitute/gatk/issues/6901
302     @Test
303     public void testDifferentAltsInTumorAndNormal() {
304         Utils.resetRandomGenerator();
305         final File tumor = new File(toolsTestDir, "mutect/different-alts-tumor.bam");
306         final File normal = new File(toolsTestDir, "mutect/different-alts-normal.bam");
307         final File output = createTempFile("output", ".vcf");
308 
309 
310         runMutect2(tumor, normal, output, "20:10020000-10021200", b37Reference, Optional.empty(),
311                 args -> args.add(M2ArgumentCollection.NORMAL_LOG_10_ODDS_LONG_NAME, 0.0));
312 
313         final Map<Integer, Allele> altAllelesByPosition = VariantContextTestUtils.streamVcf(output)
314                 .collect(Collectors.toMap(VariantContext::getStart, VariantContext::getAltAlleleWithHighestAlleleCount));
315 
316         Assert.assertTrue(altAllelesByPosition.get(10020042).basesMatch(Allele.ALT_C)); //tumor G->C, normal G->A
317         Assert.assertTrue(altAllelesByPosition.get(10020124).basesMatch(Allele.ALT_G)); //tumor A->G, normal A->T
318     }
319 
320     // test on an artificial bam with several contrived MNPs
321     @Test
testMnps()322     public void testMnps() {
323         Utils.resetRandomGenerator();
324         final File bam = new File(toolsTestDir, "mnp.bam");
325 
326         for (final int maxMnpDistance : new int[]{0, 1, 2, 3, 5}) {
327             final File outputVcf = createTempFile("unfiltered", ".vcf");
328 
329             runMutect2(bam, outputVcf, "20:10019000-10022000", b37Reference, Optional.empty(),
330                     args -> args.add(M2ArgumentCollection.EMISSION_LOG_SHORT_NAME, 15),
331                     args -> args.add(AssemblyBasedCallerArgumentCollection.MAX_MNP_DISTANCE_SHORT_NAME, maxMnpDistance));
332 
333             // note that for testing HaplotypeCaller GVCF mode we will always have the symbolic <NON REF> allele
334             final Map<Integer, List<String>> alleles = VariantContextTestUtils.streamVcf(outputVcf)
335                     .collect(Collectors.toMap(VariantContext::getStart, vc -> vc.getAlternateAlleles().stream().filter(a -> !a.isSymbolic()).map(Allele::getBaseString).collect(Collectors.toList())));
336 
337             // phased, two bases apart
338             if (maxMnpDistance < 2) {
339                 Assert.assertEquals(alleles.get(10019968), Collections.singletonList("G"));
340                 Assert.assertEquals(alleles.get(10019970), Collections.singletonList("G"));
341             } else {
342                 Assert.assertEquals(alleles.get(10019968), Collections.singletonList("GAG"));
343                 Assert.assertTrue(!alleles.containsKey(10019970));
344             }
345 
346             // adjacent and out of phase
347             Assert.assertEquals(alleles.get(10020229), Collections.singletonList("A"));
348             Assert.assertEquals(alleles.get(10020230), Collections.singletonList("G"));
349 
350             // 4-substitution MNP w/ spacings 2, 3, 4
351             if (maxMnpDistance < 2) {
352                 Assert.assertEquals(alleles.get(10020430), Collections.singletonList("G"));
353                 Assert.assertEquals(alleles.get(10020432), Collections.singletonList("G"));
354                 Assert.assertEquals(alleles.get(10020435), Collections.singletonList("G"));
355                 Assert.assertEquals(alleles.get(10020439), Collections.singletonList("G"));
356             } else if (maxMnpDistance < 3) {
357                 Assert.assertEquals(alleles.get(10020430), Collections.singletonList("GAG"));
358                 Assert.assertEquals(alleles.get(10020435), Collections.singletonList("G"));
359                 Assert.assertEquals(alleles.get(10020439), Collections.singletonList("G"));
360             } else if (maxMnpDistance < 4) {
361                 Assert.assertEquals(alleles.get(10020430), Collections.singletonList("GAGTTG"));
362                 Assert.assertEquals(alleles.get(10020439), Collections.singletonList("G"));
363             } else {
364                 Assert.assertEquals(alleles.get(10020430), Collections.singletonList("GAGTTGTCTG"));
365             }
366 
367             // two out of phase DNPs that overlap and have a base in common
368             if (maxMnpDistance > 0) {
369                 Assert.assertEquals(alleles.get(10020680), Collections.singletonList("TA"));
370                 Assert.assertEquals(alleles.get(10020681), Collections.singletonList("AT"));
371             }
372         }
373     }
374 
375     @Test
testForceCalling()376     public void testForceCalling() {
377         Utils.resetRandomGenerator();
378         final File tumor = new File(NA12878_20_21_WGS_bam);
379 
380         // The kmerSize = 1 case is a ridiculous setting that forces assembly to fail.
381         for (final int kmerSize : new int[] {1, 20}) {
382             final File unfilteredVcf = createTempFile("unfiltered", ".vcf");
383             final File forceCalls = new File(toolsTestDir, "mutect/gga_mode.vcf");
384 
385             runMutect2(tumor, unfilteredVcf, "20:9998500-10010000", b37Reference, Optional.empty(),
386                     args -> args.add(AssemblyBasedCallerArgumentCollection.FORCE_CALL_ALLELES_LONG_NAME, forceCalls),
387                     args -> args.add(ReadThreadingAssemblerArgumentCollection.KMER_SIZE_LONG_NAME, kmerSize),
388                     args -> args.add(ReadThreadingAssemblerArgumentCollection.DONT_INCREASE_KMER_SIZE_LONG_NAME, true));
389 
390             final Map<Integer, List<Allele>> altAllelesByPosition = VariantContextTestUtils.streamVcf(unfilteredVcf)
391                     .collect(Collectors.toMap(VariantContext::getStart, VariantContext::getAlternateAlleles));
392             for (final VariantContext vc : new FeatureDataSource<VariantContext>(forceCalls)) {
393                 final List<Allele> altAllelesAtThisLocus = altAllelesByPosition.get(vc.getStart());
394                 vc.getAlternateAlleles().stream().filter(a-> a.length() > 0 && BaseUtils.isNucleotide(a.getBases()[0])).forEach(a -> Assert.assertTrue(altAllelesAtThisLocus.contains(a)));
395             }
396         }
397     }
398 
399     // test that the dont-use-soft-clips option actually does something
400     @Test
testDontUseSoftClips()401     public void testDontUseSoftClips() {
402         Utils.resetRandomGenerator();
403         final File tumor = new File(NA12878_20_21_WGS_bam);
404         final int start = 10050000;
405 
406         final SimpleInterval interval = new SimpleInterval("20", start, start + 25000);
407 
408         final File calls1 = createTempFile("unfiltered", ".vcf");
409         runMutect2(tumor, calls1, interval.toString(), b37Reference, Optional.of(GNOMAD));
410 
411         final File calls2 = createTempFile("unfiltered", ".vcf");
412         runMutect2(tumor, calls2, interval.toString(), b37Reference, Optional.of(GNOMAD),
413                 args -> args.addFlag(AssemblyBasedCallerArgumentCollection.DONT_USE_SOFT_CLIPPED_BASES_LONG_NAME));
414 
415         final List<VariantContext> indelsWithSoftClips = VariantContextTestUtils.streamVcf(calls1).filter(VariantContext::isIndel).collect(Collectors.toList());
416         final List<VariantContext> indelsWithoutSoftClips = VariantContextTestUtils.streamVcf(calls2).filter(VariantContext::isIndel).collect(Collectors.toList());
417 
418         Assert.assertTrue(indelsWithoutSoftClips.size() < indelsWithSoftClips.size());
419 
420         final int startOfDroppedVariant = 10068160;
421         final int endOfDroppedVariant = 10068174;
422         Assert.assertTrue(indelsWithSoftClips.stream().anyMatch(vc -> vc.getStart() == startOfDroppedVariant && vc.getEnd() == endOfDroppedVariant));
423         Assert.assertFalse(indelsWithoutSoftClips.stream().anyMatch(vc -> vc.getStart() == startOfDroppedVariant && vc.getEnd() == endOfDroppedVariant));
424 
425     }
426 
427 
428     // make sure that force calling with given alleles that normally wouldn't be called due to complete lack of coverage
429     // doesn't run into any edge case bug involving empty likelihoods matrices
430     @Test
testForceCallingZeroCoverage()431     public void testForceCallingZeroCoverage() {
432         Utils.resetRandomGenerator();
433         final File unfilteredVcf = createTempFile("unfiltered", ".vcf");
434         final File forceCalls = new File(toolsTestDir, "mutect/gga_mode_2.vcf");
435 
436         runMutect2(DREAM_3_TUMOR, unfilteredVcf, "20:1119000-1120000", b37Reference, Optional.empty(),
437                 args -> args.add(AssemblyBasedCallerArgumentCollection.FORCE_CALL_ALLELES_LONG_NAME, forceCalls));
438     }
439 
440     // make sure we have fixed a bug where germline resources with AF=. throw errors
441     @Test
testMissingAF()442     public void testMissingAF() {
443         final File unfilteredVcf = createTempFile("unfiltered", ".vcf");
444         runMutect2(DREAM_4_TUMOR, unfilteredVcf, "20:1119000-1120000", b37Reference, Optional.of(GNOMAD_WITHOUT_AF_SNIPPET),
445                 args -> args.addInterval(new SimpleInterval("20:10837425-10837426")));
446     }
447 
448     @Test
testContaminationFilter()449     public void testContaminationFilter() {
450         Utils.resetRandomGenerator();
451         final File unfilteredVcf = createTempFile("unfiltered", ".vcf");
452         runMutect2(new File(NA12878_20_21_WGS_bam), unfilteredVcf, "20:10000000-20010000", b37Reference, Optional.of(GNOMAD));
453 
454         final Map<Integer, Set<VariantContext>> filteredVariants = Arrays.stream(new int[] {0, 5, 10}).boxed().collect(Collectors.toMap(pct -> pct, pct -> {
455             final File filteredVcf = createTempFile("filtered-" + pct, ".vcf");
456             final File contaminationTable = pct == 0 ? NO_CONTAMINATION_TABLE :
457                     (pct == 5 ? FIVE_PCT_CONTAMINATION_TABLE : TEN_PCT_CONTAMINATION_TABLE);
458             runFilterMutectCalls(unfilteredVcf, filteredVcf, b37Reference,
459                     args -> args.add(M2FiltersArgumentCollection.CONTAMINATION_TABLE_LONG_NAME, contaminationTable));
460 
461             return VariantContextTestUtils.streamVcf(filteredVcf).collect(Collectors.toSet());
462         }));
463 
464 
465         final int variantsFilteredAtZeroPercent = (int) filteredVariants.get(0).stream()
466                 .filter(vc -> vc.getFilters().contains(GATKVCFConstants.CONTAMINATION_FILTER_NAME))
467                 .count();
468 
469         final List<VariantContext> variantsFilteredAtFivePercent = filteredVariants.get(5).stream()
470                 .filter(vc -> vc.getFilters().contains(GATKVCFConstants.CONTAMINATION_FILTER_NAME)).collect(Collectors.toList());
471         Assert.assertEquals(variantsFilteredAtZeroPercent, 0);
472         Assert.assertTrue(variantsFilteredAtFivePercent.size() <
473                 filteredVariants.get(10).stream().filter(vc -> vc.getFilters().contains(GATKVCFConstants.CONTAMINATION_FILTER_NAME)).count());
474 
475         final List<VariantContext> missedObviousVariantsAtTenPercent = filteredVariants.get(10).stream()
476                 .filter(vc -> !vc.getFilters().contains(GATKVCFConstants.CONTAMINATION_FILTER_NAME))
477                 .filter(VariantContext::isBiallelic)
478                 .filter(vc -> {
479                     final int[] AD = vc.getGenotype(0).getAD();
480                     return AD[1] < 0.15 * AD[0];
481                 }).collect(Collectors.toList());
482 
483         Assert.assertTrue(missedObviousVariantsAtTenPercent.isEmpty());
484 
485         // If the filter is smart, it won't filter variants with allele fraction much higher than the contamination
486         final List<VariantContext> highAlleleFractionFilteredVariantsAtFivePercent = variantsFilteredAtFivePercent.stream()
487                 .filter(VariantContext::isBiallelic)
488                 .filter(vc -> vc.getAttributeAsDouble(GATKVCFConstants.CONTAMINATION_QUAL_KEY, 100) < 30)
489                 .filter(vc -> {
490                     final int[] AD = vc.getGenotype(0).getAD();
491                     return MathUtils.sum(AD) > 30 && AD[1] > AD[0];
492                 }).collect(Collectors.toList());
493 
494         Assert.assertTrue(highAlleleFractionFilteredVariantsAtFivePercent.isEmpty());
495     }
496 
497     // test that ReadFilterLibrary.NON_ZERO_REFERENCE_LENGTH_ALIGNMENT removes reads that consume zero reference bases
498     // e.g. read name HAVCYADXX150109:1:2102:20528:2129 with cigar 23S53I
499     @Test
testReadsThatConsumeZeroReferenceReads()500     public void testReadsThatConsumeZeroReferenceReads()  {
501         final File bam = new File(publicTestDir + "org/broadinstitute/hellbender/tools/mutect/na12878-chr20-consumes-zero-reference-bases.bam");
502         final File outputVcf = createTempFile("output", ".vcf");
503 
504         runMutect2(bam, outputVcf, CHROMOSOME_20, b37Reference, Optional.empty());
505     }
506 
507     // make sure that unpaired reads that pass filtering do not cause errors
508     // in particular, the read HAVCYADXX150109:1:1109:11610:46575 with SAM flag 16 fails without the patch
509     @Test
testUnpairedReads()510     public void testUnpairedReads()  {
511         final File bam = new File(toolsTestDir + "unpaired.bam");
512         final File outputVcf = createTempFile("output", ".vcf");
513 
514         runMutect2(bam, outputVcf, CHROMOSOME_20, b37Reference, Optional.empty());
515     }
516 
517     // some bams from external pipelines use faulty adapter trimming programs that introduce identical repeated reads
518     // into bams.  Although these bams fail the Picard tool ValidateSamFile, one can run HaplotypeCaller and Mutect on them
519     // and get fine results.  This test ensures that this remains the case.  The test bam is a small chunk of reads surrounding
520     // a germline SNP in NA12878, where we have duplicated 40 of the reads. (In practice bams of this nature might have one bad read
521     // per megabase).
522     @Test
testBamWithRepeatedReads()523     public void testBamWithRepeatedReads() {
524         final File bam = new File(toolsTestDir + "mutect/repeated_reads.bam");
525         final File outputVcf = createTempFile("output", ".vcf");
526 
527         runMutect2(bam, outputVcf, "20:10018000-10020000", b37Reference, Optional.empty());
528     }
529 
530     // In the rare case that a particular fragment is only supported by supplementary reads, that should not
531     // result in an exception.  This test ensures that Mutect2 does not fail to finish in that case.
532     @Test
testBamWithOnlySupplementaryReads()533     public void testBamWithOnlySupplementaryReads() {
534         final File bam = new File(toolsTestDir + "mutect/only_supplementary_reads.bam");
535         final File outputVcf = createTempFile("output", ".vcf");
536 
537         runMutect2(bam, outputVcf, "20:10018000-10020000", b37Reference, Optional.empty());
538     }
539 
540     // basic test on a small chunk of NA12878 mitochondria.  This is not a validation, but rather a sanity check
541     // that M2 makes obvious calls, doesn't trip up on the beginning of the circular chromosome, and can handle high depth
542     @Test
testMitochondria()543     public void testMitochondria()  {
544         Utils.resetRandomGenerator();
545         final File unfilteredVcf = createTempFile("unfiltered", ".vcf");
546 
547         runMutect2(NA12878_MITO_BAM, unfilteredVcf, "chrM:1-1000", MITO_REF.getAbsolutePath(), Optional.empty(),
548                 args -> args.add(M2ArgumentCollection.MITOCHONDRIA_MODE_LONG_NAME, true));
549 
550         final List<VariantContext> variants = VariantContextTestUtils.streamVcf(unfilteredVcf).collect(Collectors.toList());
551         final List<String> variantKeys = variants.stream().map(VariantContextTestUtils::keyForVariant).collect(Collectors.toList());
552 
553         final List<String> expectedKeys = Arrays.asList(
554                 "chrM:152-152 T*, [C]",
555                 "chrM:263-263 A*, [G]",
556                 "chrM:302-302 A*, [AC, ACC, ACCCCCCCCCCCCC, C]",
557                 "chrM:310-310 T*, [C, TC]",
558                 "chrM:750-750 A*, [G]");
559         Assert.assertTrue(variantKeys.containsAll(expectedKeys));
560 
561         Assert.assertEquals(variants.get(0).getAttributeAsInt(GATKVCFConstants.ORIGINAL_CONTIG_MISMATCH_KEY, 0), 1741);
562     }
563 
564     @DataProvider(name = "vcfsForFiltering")
vcfsForFiltering()565     public Object[][] vcfsForFiltering() {
566         return new Object[][]{
567                 {NA12878_MITO_VCF, 0.5, Collections.emptyList(), Arrays.asList(
568                         ImmutableSet.of(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME, GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME),
569                         Collections.emptySet(),
570                         ImmutableSet.of( GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME,
571                                 GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME),
572                         Collections.emptySet(),
573                         Collections.emptySet(),
574                         ImmutableSet.of(GATKVCFConstants.DUPLICATED_EVIDENCE_FILTER_NAME),
575                         ImmutableSet.of(GATKVCFConstants.FAIL)),
576                         Arrays.asList(
577                                 Arrays.asList(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME), // strand_bias, strict_stand
578                                 Arrays.asList(GATKVCFConstants.SITE_LEVEL_FILTERS), // SITE
579                                 Arrays.asList(GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME), // weak_evidence, low_allele_frac
580                                 Arrays.asList(GATKVCFConstants.SITE_LEVEL_FILTERS, GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME, GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME + ", " + GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME), // SITE|weak_evidence, strand_bias, low_allele_frac|strand_bias, strict_strand, low_allele_frac
581                                 Arrays.asList(GATKVCFConstants.SITE_LEVEL_FILTERS),  // SITE
582                                 Arrays.asList(GATKVCFConstants.DUPLICATED_EVIDENCE_FILTER_NAME), // duplicate
583                                 Arrays.asList(GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME, GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME) // weak_evidence, strand_bias, strict_stand|low_allele_frac
584 
585                         )},
586                 {NA12878_MITO_GVCF, .0009, Arrays.asList("MT:1", "MT:37", "MT:40", "MT:152", "MT:157"), Arrays.asList(
587                         Collections.emptySet(),
588                         ImmutableSet.of(GATKVCFConstants.MEDIAN_BASE_QUALITY_FILTER_NAME, GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME, GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME),
589                         ImmutableSet.of(GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME,GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME, GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME),
590                         Collections.emptySet(),
591                         ImmutableSet.of(GATKVCFConstants.MEDIAN_BASE_QUALITY_FILTER_NAME, GATKVCFConstants.CONTAMINATION_FILTER_NAME, GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME,
592                                 GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME, GATKVCFConstants.READ_POSITION_FILTER_NAME, GATKVCFConstants.MEDIAN_MAPPING_QUALITY_FILTER_NAME, GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME)),
593                         Arrays.asList(
594                                 Arrays.asList(GATKVCFConstants.SITE_LEVEL_FILTERS), // SITE,
595                                 Arrays.asList(GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.MEDIAN_BASE_QUALITY_FILTER_NAME + ", " + GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME, GATKVCFConstants.SITE_LEVEL_FILTERS), //"weak_evidence, base_qual, strand_bias|SITE",
596                                 Arrays.asList(GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME, GATKVCFConstants.SITE_LEVEL_FILTERS), // "weak_evidence, strict_strand, strand_bias|SITE",
597                                 Arrays.asList(GATKVCFConstants.SITE_LEVEL_FILTERS, GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.MEDIAN_BASE_QUALITY_FILTER_NAME + ", " + GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME, GATKVCFConstants.SITE_LEVEL_FILTERS), //".|weak_evidence, base_qual, strand_bias, low_allele_frac|SITE",
598                                 Arrays.asList(GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.MEDIAN_BASE_QUALITY_FILTER_NAME + ", " + GATKVCFConstants.MEDIAN_MAPPING_QUALITY_FILTER_NAME + ", " + GATKVCFConstants.CONTAMINATION_FILTER_NAME + ", " + GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME + ", " + GATKVCFConstants.READ_POSITION_FILTER_NAME + ", " + GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME, GATKVCFConstants.SITE_LEVEL_FILTERS) // "weak_evidence, base_qual, map_qual, contamination, strand_artifact, position, low_allele_frac|SITE"
599                         )}
600         };
601     }
602 
603     @Test(dataProvider = "vcfsForFiltering")
testFilterMitochondria(File unfiltered, final double minAlleleFraction, final List<String> intervals, List<Set<String>> expectedFilters, List<List<String>> expectedASFilters)604     public void testFilterMitochondria(File unfiltered, final double minAlleleFraction, final List<String> intervals, List<Set<String>> expectedFilters, List<List<String>> expectedASFilters)  {
605         final File filteredVcf = createTempFile("filtered", ".vcf");
606 
607         // vcf sequence dicts don't match ref
608         runFilterMutectCalls(unfiltered, filteredVcf, MITO_REF.getAbsolutePath(),
609                 args -> args.add(M2ArgumentCollection.MITOCHONDRIA_MODE_LONG_NAME, true),
610                 args -> args.add(StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, true),
611                 args -> args.add(M2FiltersArgumentCollection.MIN_AF_LONG_NAME, minAlleleFraction),
612                 args -> args.add(M2FiltersArgumentCollection.MIN_READS_ON_EACH_STRAND_LONG_NAME, 1),
613                 args -> args.add(M2FiltersArgumentCollection.UNIQUE_ALT_READ_COUNT_LONG_NAME, 2),
614                 args -> {
615                     intervals.stream().map(SimpleInterval::new).forEach(args::addInterval);
616                     return args;
617                 });
618 
619         final List<Set<String>> actualFilters = VariantContextTestUtils.streamVcf(filteredVcf)
620                 .map(VariantContext::getFilters).collect(Collectors.toList());
621 
622         final List<List<String>> actualASFilters = VariantContextTestUtils.streamVcf(filteredVcf)
623                 .map(vc -> AnnotationUtils.decodeAnyASListWithRawDelim(vc.getCommonInfo().getAttributeAsString(GATKVCFConstants.AS_FILTER_STATUS_KEY, ""))).collect(Collectors.toList());
624         Assert.assertEquals(actualASFilters, expectedASFilters);
625 
626         Assert.assertEquals(actualFilters.size(), expectedFilters.size());
627         for (int n = 0; n < actualFilters.size(); n++) {
628             Assert.assertTrue(actualFilters.get(n).containsAll(expectedFilters.get(n)), "Actual filters missing some expected filters: " + SetUtils.difference(expectedFilters.get(n), actualFilters.get(n)));
629             Assert.assertTrue(expectedFilters.get(n).containsAll(actualFilters.get(n)), "Expected filters missing some actual filters: " + SetUtils.difference(actualFilters.get(n), expectedFilters.get(n)));
630         }
631 
632         Assert.assertEquals(actualFilters, expectedFilters);
633     }
634 
635     @DataProvider(name = "vcfsForNuMTFiltering")
vcfsForNuMTFiltering()636     public Object[][] vcfsForNuMTFiltering() {
637         return new Object[][]{
638                 {NA12878_MITO_INITIAL_FILTERED_VCF, 30, Collections.emptyList(), Arrays.asList(
639                         ImmutableSet.of(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME, GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME),
640                         Collections.emptySet(),
641                         ImmutableSet.of( GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME,
642                                 GATKVCFConstants.POSSIBLE_NUMT_FILTER_NAME,
643                                 GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME),
644                         Collections.emptySet(),
645                         Collections.emptySet(),
646                         ImmutableSet.of(GATKVCFConstants.DUPLICATED_EVIDENCE_FILTER_NAME),
647                         ImmutableSet.of(GATKVCFConstants.FAIL)),
648                         Arrays.asList(
649                                 Arrays.asList(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME), // strand_bias, strict_stand
650                                 Arrays.asList(GATKVCFConstants.SITE_LEVEL_FILTERS), // SITE
651                                 Arrays.asList(GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME + ", "  + GATKVCFConstants.POSSIBLE_NUMT_FILTER_NAME), // weak_evidence, low_allele_frac, possible_numt
652                                 Arrays.asList(GATKVCFConstants.SITE_LEVEL_FILTERS, GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME + ", " + GATKVCFConstants.POSSIBLE_NUMT_FILTER_NAME, GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME + ", " + GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME + ", " + GATKVCFConstants.POSSIBLE_NUMT_FILTER_NAME), // SITE|weak_evidence, strand_bias, low_allele_frac, possible_numt|strand_bias, strict_strand, low_allele_frac, possible_numt
653                                 Arrays.asList(GATKVCFConstants.SITE_LEVEL_FILTERS),  // SITE
654                                 Arrays.asList(GATKVCFConstants.DUPLICATED_EVIDENCE_FILTER_NAME), // duplicate
655                                 Arrays.asList(GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME, GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME + ", " + GATKVCFConstants.POSSIBLE_NUMT_FILTER_NAME) // weak_evidence, strand_bias, strict_stand|low_allele_frac, possible_numt
656 
657                         )}
658         };
659     }
660 
661     @Test(dataProvider = "vcfsForNuMTFiltering")
testNuMTFilterMitochondria(File initialFilters, final double autosomalCoverage, final List<String> intervals, List<Set<String>> expectedFilters, List<List<String>> expectedASFilters)662     public void testNuMTFilterMitochondria(File initialFilters, final double autosomalCoverage, final List<String> intervals, List<Set<String>> expectedFilters, List<List<String>> expectedASFilters)  {
663         final File filteredVcf = createTempFile("filtered", ".vcf");
664 
665         final ArgumentsBuilder args = new ArgumentsBuilder()
666                 .addVCF(initialFilters)
667                 .addOutput(filteredVcf)
668                 .addReference(MITO_REF.getAbsolutePath())
669                 .add(NuMTFilterTool.MEDIAN_AUTOSOMAL_COVERAGE_LONG_NAME, autosomalCoverage)
670                 .add(NuMTFilterTool.MAX_NUMT_COPIES_IN_AUTOSOME_LONG_NAME, 4.0);
671 
672         intervals.stream().map(SimpleInterval::new).forEach(args::addInterval);
673 
674         // vcf sequence dicts don't match ref
675         runCommandLine(args, NuMTFilterTool.class.getSimpleName());
676 
677         final List<Set<String>> actualFilters = VariantContextTestUtils.streamVcf(filteredVcf)
678                 .map(VariantContext::getFilters).collect(Collectors.toList());
679 
680         final List<List<String>> actualASFilters = VariantContextTestUtils.streamVcf(filteredVcf)
681                 .map(vc -> AnnotationUtils.decodeAnyASListWithRawDelim(vc.getCommonInfo().getAttributeAsString(GATKVCFConstants.AS_FILTER_STATUS_KEY, ""))).collect(Collectors.toList());
682         Assert.assertEquals(actualASFilters, expectedASFilters);
683 
684         Assert.assertEquals(actualFilters.size(), expectedFilters.size());
685         for (int n = 0; n < actualFilters.size(); n++) {
686             Assert.assertTrue(actualFilters.get(n).containsAll(expectedFilters.get(n)), "Actual filters missing some expected filters: " + SetUtils.difference(expectedFilters.get(n), actualFilters.get(n)));
687             Assert.assertTrue(expectedFilters.get(n).containsAll(actualFilters.get(n)), "Expected filters missing some actual filters: " + SetUtils.difference(actualFilters.get(n), expectedFilters.get(n)));
688         }
689 
690         Assert.assertEquals(actualFilters, expectedFilters);
691     }
692 
693     @Test
testMitochondrialRefConf()694     public void testMitochondrialRefConf()  {
695         Utils.resetRandomGenerator();
696         final File standardVcf = createTempFile("standard", ".vcf");
697         final File unthresholded = createTempFile("unthresholded", ".vcf");
698         final double minAF = 0.01;
699 
700         runMutect2(NA12878_MITO_BAM, standardVcf, "chrM:1-1000", MITO_REF.getAbsolutePath(), Optional.empty(),
701                 args -> args.add(AssemblyBasedCallerArgumentCollection.EMIT_REF_CONFIDENCE_LONG_NAME, ReferenceConfidenceMode.GVCF.toString()),
702                 args -> args.add(M2ArgumentCollection.MINIMUM_ALLELE_FRACTION_LONG_NAME, minAF),
703                 args -> args.add(M2ArgumentCollection.LOD_BAND_LONG_NAME, -2.0),
704                 args -> args.add(M2ArgumentCollection.LOD_BAND_LONG_NAME, 0.0));
705 
706         //check ref conf-specific headers are output
707         final Pair<VCFHeader, List<VariantContext>> result = VariantContextTestUtils.readEntireVCFIntoMemory(standardVcf.getAbsolutePath());
708         Assert.assertTrue(result.getLeft().hasFormatLine(GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY));
709         Assert.assertTrue(result.getLeft().getMetaDataLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG) != null);
710 
711         final List<VariantContext> variants = result.getRight();
712         final Map<String, VariantContext> variantMap = variants.stream().collect(Collectors.toMap(VariantContextTestUtils::keyForVariant, Function.identity()));
713         final List<String> variantKeys = new ArrayList<>(variantMap.keySet());
714 
715         final List<String> expectedKeys = Arrays.asList(
716                 "chrM:152-152 T*, [<NON_REF>, C]",
717                 "chrM:263-263 A*, [<NON_REF>, G]",
718                 //"chrM:297-297 A*, [<NON_REF>, AC, C]",
719                 //"chrM:301-301 A*, [<NON_REF>, AC, ACC, ACCC]",
720                 "chrM:302-302 A*, [<NON_REF>, AC, ACC, ACCC, C]",  //one of these commented out variants has an allele that only appears in debug mode
721                 "chrM:310-310 T*, [<NON_REF>, TC]",
722                 "chrM:750-750 A*, [<NON_REF>, G]");
723         Assert.assertTrue(variantKeys.containsAll(expectedKeys));
724         //First entry should be a homRef block
725         Assert.assertTrue(VariantContextTestUtils.keyForVariant(variants.get(0)).contains("*, [<NON_REF>]"));
726 
727         final ArgumentsBuilder validateVariantsArgs = new ArgumentsBuilder()
728                 .add("R", MITO_REF.getAbsolutePath())
729                 .add("V", standardVcf.getAbsolutePath())
730                 .add("L", IntervalUtils.locatableToString(new SimpleInterval("chrM:1-1000")))
731                 .addRaw("-gvcf");
732         runCommandLine(validateVariantsArgs, ValidateVariants.class.getSimpleName());
733 
734         runMutect2(NA12878_MITO_BAM, unthresholded, "chrM:1-1000", MITO_REF.getAbsolutePath(), Optional.empty(),
735                 args -> args.add(AssemblyBasedCallerArgumentCollection.EMIT_REF_CONFIDENCE_LONG_NAME, ReferenceConfidenceMode.GVCF.toString()),
736                 args -> args.add(M2ArgumentCollection.MINIMUM_ALLELE_FRACTION_LONG_NAME, 0.00),
737                 args -> args.add(M2ArgumentCollection.LOD_BAND_LONG_NAME, -2.0),
738                 args -> args.add(M2ArgumentCollection.LOD_BAND_LONG_NAME, 0.0));
739 
740         final Pair<VCFHeader, List<VariantContext>> result_noThreshold = VariantContextTestUtils.readEntireVCFIntoMemory(unthresholded.getAbsolutePath());
741 
742         final Map<String, VariantContext> variantMap2 = result_noThreshold.getRight().stream().collect(Collectors.toMap(VariantContextTestUtils::keyForVariant, Function.identity()));
743 
744         //TLODs for variants should not change too much for variant allele, should change significantly for non-ref
745         // however, there are edge cases where this need not be true (this might indicate the need to fix our
746         // LOD calculation for the NON-REF allele), so we allow one anomalous site
747         final long changedRegularAlleleLodCount = expectedKeys.stream()
748                 .filter(key -> !onlyNonRefTlodsChange(variantMap.get(key), variantMap2.get(key), minAF))
749                 .count();
750 
751         Assert.assertTrue(changedRegularAlleleLodCount <= 1);
752 
753         final List<String> expectedRefKeys = Arrays.asList(
754                 //ref blocks will be dependent on TLOD band values
755                 "chrM:218-218 A*, [<NON_REF>]",
756                 "chrM:264-266 C*, [<NON_REF>]",
757                 "chrM:475-483 A*, [<NON_REF>]",
758                 "chrM:488-492 T*, [<NON_REF>]");
759 
760         //ref block boundaries aren't particularly stable, so try a few and make sure we check at least one
761         final List<String> refBlockKeys = expectedRefKeys.stream()
762                 .filter(key -> variantMap.containsKey(key) && variantMap2.containsKey(key))
763                 .collect(Collectors.toList());
764         Assert.assertFalse(refBlockKeys.isEmpty());
765 
766         refBlockKeys.forEach(key -> Assert.assertTrue(onlyNonRefTlodsChange(variantMap.get(key), variantMap2.get(key), minAF)));
767     }
768 
onlyNonRefTlodsChange(final VariantContext v1, final VariantContext v2, final double minAF)769     private boolean onlyNonRefTlodsChange(final VariantContext v1, final VariantContext v2, final double minAF) {
770         if (v1 == null || v2 == null || !v1.getReference().equals(v2.getReference()) ||
771                 !(v1.getAlternateAlleles().size() == v2.getAlternateAlleles().size())) {
772             return false;
773         }
774 
775         //ref blocks have TLOD in format field
776         final boolean isHomRef = v1.getGenotype(0).isHomRef();
777         final double[] tlods1 = !isHomRef ? VariantContextGetters.getAttributeAsDoubleArray(v1, GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY)
778                 : new double[]{VariantContextGetters.getAttributeAsDouble(v1.getGenotype(0), GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, 0)};
779         final double[] tlods2 = !isHomRef ? VariantContextGetters.getAttributeAsDoubleArray(v2, GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY)
780                 : new double[]{VariantContextGetters.getAttributeAsDouble(v2.getGenotype(0), GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, 0)};
781 
782         final double[] af1 = VariantContextGetters.getAttributeAsDoubleArray(v1, GATKVCFConstants.ALLELE_FRACTION_KEY, () -> null, 0);
783         final double[] af2 = VariantContextGetters.getAttributeAsDoubleArray(v2, GATKVCFConstants.ALLELE_FRACTION_KEY, () -> null, 0);
784 
785         for (int i = 0; i < v1.getAlternateAlleles().size(); i++) {
786             if (!v1.getAlternateAllele(i).equals(v2.getAlternateAllele(i))) {
787                 return false;
788             }
789             //we expect the AF threshold to have a significant effect on the NON_REF TLOD, but only for that allele
790             if (!v1.getAlternateAllele(i).equals(Allele.NON_REF_ALLELE)) {
791                 if (tlods1[i] > 0) {
792                     if (Math.abs(tlods1[i] - tlods2[i]) / tlods1[i] > VARIANT_TLOD_MATCH_PCT) {
793                         return false;
794                     }
795                 } else if (af2 != null && af2[i] > minAF && Math.abs(tlods1[i] - tlods2[i]) > TLOD_MATCH_EPSILON) {
796                     return false;
797                 }
798             } else {
799                 if (Math.abs(tlods1[i] - tlods2[i]) < TLOD_MATCH_EPSILON) {
800                     return false;
801                 }
802             }
803         }
804         return true;
805     }
806 
807     @Test
808     @SuppressWarnings("deprecation")
testAFAtHighDP()809     public void testAFAtHighDP()  {
810         Utils.resetRandomGenerator();
811         final File unfilteredVcf = createTempFile("unfiltered", ".vcf");
812 
813         runMutect2(DEEP_MITO_BAM, unfilteredVcf, "chrM:1-1018", MITO_REF.getAbsolutePath(), Optional.empty(),
814                 args -> args.add(IntervalArgumentCollection.INTERVAL_PADDING_LONG_NAME, 300));
815 
816         final List<VariantContext> variants = VariantContextTestUtils.streamVcf(unfilteredVcf).collect(Collectors.toList());
817 
818         for (final VariantContext vc : variants) {
819             Assert.assertTrue(vc.isBiallelic()); //I do some lazy parsing below that won't hold for multiple alternate alleles
820             final Genotype g = vc.getGenotype(DEEP_MITO_SAMPLE_NAME);
821             Assert.assertTrue(g.hasAD());
822             final int[] ADs = g.getAD();
823             Assert.assertTrue(g.hasExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY));
824             //Assert.assertEquals(Double.parseDouble(String.valueOf(vc.getGenotype(DEEP_MITO_SAMPLE_NAME).getExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY,"0"))), (double)ADs[1]/(ADs[0]+ADs[1]), 1e-6);
825             Assert.assertEquals(Double.parseDouble(String.valueOf(vc.getGenotype(DEEP_MITO_SAMPLE_NAME).getAttributeAsString(GATKVCFConstants.ALLELE_FRACTION_KEY, "0"))), (double) ADs[1] / (ADs[0] + ADs[1]), 2e-3);
826         }
827     }
828 
829     // check that the somatic clustering model works with high-depth, low-AF cfDNA clustering
830     @Test
testBloodBiopsyFiltering()831     public void testBloodBiopsyFiltering() {
832         final File unfiltered = new File(toolsTestDir, "mutect/cfdna/cfdna-unfiltered.vcf");
833         final File filtered = createTempFile("filtered", ".vcf");
834 
835         runFilterMutectCalls(unfiltered, filtered, b37Reference);
836 
837         final Map<Integer, Set<String>> filtersBySite = VariantContextTestUtils.streamVcf(filtered).collect(Collectors.toMap(VariantContext::getStart, VariantContext::getFilters));
838 
839         // these are sites that caused trouble for a previous version of the somatic clustering model
840         final List<Integer> lowAFSitesWeShouldNotCall = Arrays.asList(25963056, 47162531, 142178205, 151841902, 31325209, 41521982);
841         for (final int site : lowAFSitesWeShouldNotCall) {
842             Assert.assertTrue(filtersBySite.get(site).contains(GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME));
843         }
844     }
845 
846     @Test
testBamout()847     public void testBamout() {
848         final File outputVcf = createTempFile("output", ".vcf");
849         final File bamout = createTempFile("bamout", ".bam");
850 
851         runMutect2(DREAM_1_TUMOR, outputVcf, "20:10000000-13000000", b37Reference, Optional.empty(),
852                 args -> args.add(AssemblyBasedCallerArgumentCollection.BAM_OUTPUT_LONG_NAME, bamout));
853         Assert.assertTrue(bamout.exists());
854     }
855 
856     @Test
testFilteringHeaders()857     public void testFilteringHeaders() {
858         Utils.resetRandomGenerator();
859         final File tumor = new File(NA12878_20_21_WGS_bam);
860         final File unfilteredVcf = createTempFile("unfiltered", ".vcf");
861         final File filteredVcf = createTempFile("filtered", ".vcf");
862 
863         runMutect2(tumor, unfilteredVcf, "20:10000000-10010000", b37Reference, Optional.of(GNOMAD));
864         runFilterMutectCalls(unfilteredVcf, filteredVcf, b37Reference);
865 
866         final VCFHeader filteredHeader = VariantContextTestUtils.getVCFHeader(filteredVcf.getAbsolutePath());
867         //explicit check for PASS header
868         Assert.assertTrue(filteredHeader.hasFilterLine(VCFConstants.PASSES_FILTERS_v4));
869         for (final String header : GATKVCFConstants.MUTECT_FILTER_NAMES){
870             Assert.assertTrue(filteredHeader.hasFilterLine(header));
871         }
872     }
873 
874     @SafeVarargs
runMutect2(final List<File> tumors, final List<File> normals, final File output, final String interval, final String reference, final Optional<File> gnomad, final Function<ArgumentsBuilder, ArgumentsBuilder>... appendExtraArguments)875     final private void runMutect2(final List<File> tumors, final List<File> normals, final File output,
876                                   final String interval, final String reference,
877                                   final Optional<File> gnomad, final Function<ArgumentsBuilder, ArgumentsBuilder>... appendExtraArguments) {
878         final ArgumentsBuilder args = new ArgumentsBuilder()
879                 .addOutput(output)
880                 .addReference(reference);
881 
882         tumors.forEach(args::addInput);
883 
884         normals.forEach(normal -> {
885             args.addInput(normal);
886             args.add(M2ArgumentCollection.NORMAL_SAMPLE_LONG_NAME, getSampleName(normal));
887         });
888 
889         gnomad.ifPresent(g -> args.add(M2ArgumentCollection.GERMLINE_RESOURCE_LONG_NAME, g));
890 
891         args.addInterval(new SimpleInterval(interval));
892 
893         ArgumentsBuilder argsWithAdditions = args;
894 
895         for (final Function<ArgumentsBuilder, ArgumentsBuilder> extraArgument : appendExtraArguments) {
896             argsWithAdditions = extraArgument.apply(args);
897         }
898 
899         runCommandLine(argsWithAdditions);
900     }
901 
902     @SafeVarargs
runMutect2(final File tumor, final File normal, final File output, final String interval, final String reference, final Optional<File> gnomad, final Function<ArgumentsBuilder, ArgumentsBuilder>... appendExtraArguments)903     final private void runMutect2(final File tumor, final File normal, final File output, final String interval, final String reference,
904                                   final Optional<File> gnomad, final Function<ArgumentsBuilder, ArgumentsBuilder>... appendExtraArguments) {
905         runMutect2(Collections.singletonList(tumor), Collections.singletonList(normal), output, interval, reference, gnomad, appendExtraArguments);
906     }
907 
908     @SafeVarargs
runMutect2(final File tumor, final File output, final String interval, final String reference, final Optional<File> gnomad, final Function<ArgumentsBuilder, ArgumentsBuilder>... appendExtraArguments)909     final private void runMutect2(final File tumor, final File output, final String interval, final String reference,
910                                   final Optional<File> gnomad, final Function<ArgumentsBuilder, ArgumentsBuilder>... appendExtraArguments) {
911         runMutect2(Collections.singletonList(tumor), Collections.emptyList(), output, interval, reference, gnomad, appendExtraArguments);
912     }
913 
914     @SafeVarargs
runFilterMutectCalls(final File unfilteredVcf, final File filteredVcf, final String reference, final Function<ArgumentsBuilder, ArgumentsBuilder>... appendExtraArguments)915     final private void runFilterMutectCalls(final File unfilteredVcf, final File filteredVcf, final String reference,
916                                             final Function<ArgumentsBuilder, ArgumentsBuilder>... appendExtraArguments) {
917         final ArgumentsBuilder args = new ArgumentsBuilder()
918                 .addVCF(unfilteredVcf)
919                 .addOutput(filteredVcf)
920                 .addReference(reference);
921 
922         ArgumentsBuilder argsWithAdditions = args;
923 
924         for (final Function<ArgumentsBuilder, ArgumentsBuilder> extraArgument : appendExtraArguments) {
925             argsWithAdditions = extraArgument.apply(args);
926         }
927 
928         runCommandLine(argsWithAdditions, FilterMutectCalls.class.getSimpleName());
929     }
930 
runConcordance(final File truth, final File eval, final File summary, final String interval, final File mask, final Optional<File> truePositivesFalseNegatives)931     private void runConcordance(final File truth, final File eval, final File summary, final String interval, final File mask, final Optional<File> truePositivesFalseNegatives) {
932         final ArgumentsBuilder concordanceArgs = new ArgumentsBuilder()
933                 .add(Concordance.TRUTH_VARIANTS_LONG_NAME, truth)
934                 .add(Concordance.EVAL_VARIANTS_LONG_NAME, eval)
935                 .addInterval(new SimpleInterval(interval))
936                 .add(IntervalArgumentCollection.EXCLUDE_INTERVALS_LONG_NAME, mask)
937                 .add(Concordance.SUMMARY_LONG_NAME, summary);
938 
939         truePositivesFalseNegatives.ifPresent(file -> concordanceArgs.add(Concordance.TRUE_POSITIVES_AND_FALSE_NEGATIVES_SHORT_NAME, file));
940         runCommandLine(concordanceArgs, Concordance.class.getSimpleName());
941     }
942 
getSampleName(final File bam)943     private String getSampleName(final File bam)  {
944         try {
945             final File nameFile = createTempFile("sample_name", ".txt");
946             new Main().instanceMain(makeCommandLineArgs(Arrays.asList("-I", bam.getAbsolutePath(), "-O", nameFile.getAbsolutePath(), "-encode"), "GetSampleName"));
947             return Files.readAllLines(nameFile.toPath()).get(0);
948         } catch (final IOException ex) {
949             throw new IllegalArgumentException(ex);
950         }
951     }
952 }
953