1 package org.broadinstitute.hellbender.tools.walkers.mutect; 2 3 import com.google.common.collect.ImmutableSet; 4 import htsjdk.variant.variantcontext.Allele; 5 import htsjdk.variant.variantcontext.Genotype; 6 import htsjdk.variant.variantcontext.VariantContext; 7 import htsjdk.variant.vcf.VCFConstants; 8 import htsjdk.variant.vcf.VCFHeader; 9 import org.apache.commons.collections4.SetUtils; 10 import org.apache.commons.lang3.tuple.Pair; 11 import org.broadinstitute.hellbender.CommandLineProgramTest; 12 import org.broadinstitute.hellbender.Main; 13 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; 14 import org.broadinstitute.hellbender.cmdline.argumentcollections.IntervalArgumentCollection; 15 import org.broadinstitute.hellbender.engine.FeatureDataSource; 16 import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; 17 import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; 18 import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils; 19 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyBasedCallerArgumentCollection; 20 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.ReadThreadingAssemblerArgumentCollection; 21 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.ReferenceConfidenceMode; 22 import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.FilterMutectCalls; 23 import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.M2FiltersArgumentCollection; 24 import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.NuMTFilterTool; 25 import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.ReadOrientationFilter; 26 import org.broadinstitute.hellbender.tools.walkers.readorientation.LearnReadOrientationModel; 27 import org.broadinstitute.hellbender.tools.walkers.validation.Concordance; 28 import org.broadinstitute.hellbender.tools.walkers.validation.ConcordanceSummaryRecord; 29 import org.broadinstitute.hellbender.tools.walkers.variantutils.ValidateVariants; 30 import org.broadinstitute.hellbender.utils.*; 31 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; 32 import org.broadinstitute.hellbender.utils.variant.VariantContextGetters; 33 import org.testng.Assert; 34 import org.testng.annotations.DataProvider; 35 import org.testng.annotations.Test; 36 37 import java.io.File; 38 import java.io.IOException; 39 import java.nio.file.Files; 40 import java.util.*; 41 import java.util.function.Function; 42 import java.util.stream.Collectors; 43 44 /** 45 * Created by davidben on 9/1/16. 46 */ 47 @Test(groups = {"variantcalling"}) 48 public class Mutect2IntegrationTest extends CommandLineProgramTest { 49 // positions 10,000,000 - 11,000,000 of chr 20 and with most annotations removed 50 private static final File GNOMAD = new File(largeFileTestDir, "very-small-gnomad.vcf"); 51 52 private static final String DREAM_BAMS_DIR = largeFileTestDir + "mutect/dream_synthetic_bams/"; 53 private static final File DREAM_4_NORMAL = new File(DREAM_BAMS_DIR, "normal_4.bam"); 54 private static final File DREAM_3_NORMAL = new File(DREAM_BAMS_DIR, "normal_3.bam"); 55 private static final File DREAM_4_TUMOR = new File(DREAM_BAMS_DIR, "tumor_4.bam"); 56 private static final File DREAM_3_TUMOR = new File(DREAM_BAMS_DIR, "tumor_3.bam"); 57 private static final File DREAM_2_NORMAL = new File(DREAM_BAMS_DIR, "normal_2.bam"); 58 private static final File DREAM_1_NORMAL = new File(DREAM_BAMS_DIR, "normal_1.bam"); 59 private static final File DREAM_2_TUMOR = new File(DREAM_BAMS_DIR, "tumor_2.bam"); 60 private static final File DREAM_1_TUMOR = new File(DREAM_BAMS_DIR, "tumor_1.bam"); 61 62 private static final String DREAM_VCFS_DIR = toolsTestDir + "mutect/dream/vcfs/"; 63 private static final File DREAM_4_TRUTH = new File(DREAM_VCFS_DIR, "sample_4.vcf"); 64 private static final File DREAM_3_TRUTH = new File(DREAM_VCFS_DIR, "sample_3.vcf"); 65 private static final File DREAM_2_TRUTH = new File(DREAM_VCFS_DIR, "sample_2.vcf"); 66 private static final File DREAM_1_TRUTH = new File(DREAM_VCFS_DIR, "sample_1.vcf"); 67 68 private static final String DREAM_MASKS_DIR = toolsTestDir + "mutect/dream/masks/"; 69 private static final File DREAM_4_MASK = new File(DREAM_MASKS_DIR, "mask4.list"); 70 private static final File DREAM_3_MASK = new File(DREAM_MASKS_DIR, "mask3.list"); 71 private static final File DREAM_2_MASK = new File(DREAM_MASKS_DIR, "mask2.list"); 72 private static final File DREAM_1_MASK = new File(DREAM_MASKS_DIR, "mask1.list"); 73 74 private static final File NO_CONTAMINATION_TABLE = new File(toolsTestDir, "mutect/no-contamination.table"); 75 private static final File FIVE_PCT_CONTAMINATION_TABLE = new File(toolsTestDir, "mutect/five-pct-contamination.table"); 76 private static final File TEN_PCT_CONTAMINATION_TABLE = new File(toolsTestDir, "mutect/ten-pct-contamination.table"); 77 78 private static final File NA12878_MITO_BAM = new File(toolsTestDir, "mutect/mito/NA12878.bam"); 79 private static final File NA12878_MITO_VCF = new File(toolsTestDir, "mutect/mito/unfiltered-with-assb.vcf"); 80 private static final File NA12878_MITO_GVCF = new File(toolsTestDir, "mitochondria/NA12878.MT.g.vcf"); 81 private static final File NA12878_MITO_INITIAL_FILTERED_VCF = new File(toolsTestDir, "mutect/mito/initialFiltered.vcf"); 82 private static final File MITO_REF = new File(toolsTestDir, "mutect/mito/Homo_sapiens_assembly38.mt_only.fasta"); 83 private static final File DEEP_MITO_BAM = new File(largeFileTestDir, "mutect/highDPMTsnippet.bam"); 84 private static final String DEEP_MITO_SAMPLE_NAME = "mixture"; 85 86 private static final File FILTERING_DIR = new File(toolsTestDir, "mutect/filtering"); 87 88 private static final File GNOMAD_WITHOUT_AF_SNIPPET = new File(toolsTestDir, "mutect/gnomad-without-af.vcf"); 89 90 private static final double TLOD_MATCH_EPSILON = 0.05; 91 private static final double VARIANT_TLOD_MATCH_PCT = 0.01; 92 93 private static final String CHROMOSOME_20 = "20"; 94 95 // tumor bams, normal bams, truth vcf, mask, required sensitivity, whether to error-correct 96 @DataProvider(name = "dreamSyntheticData") dreamSyntheticData()97 public Object[][] dreamSyntheticData() { 98 return new Object[][]{ 99 {DREAM_1_TUMOR, Optional.of(DREAM_1_NORMAL), DREAM_1_TRUTH, DREAM_1_MASK, 0.97, false}, 100 {DREAM_2_TUMOR, Optional.of(DREAM_2_NORMAL), DREAM_2_TRUTH, DREAM_2_MASK, 0.95, false}, 101 {DREAM_2_TUMOR, Optional.empty(), DREAM_2_TRUTH, DREAM_2_MASK, 0.95, false}, 102 {DREAM_2_TUMOR, Optional.empty(), DREAM_2_TRUTH, DREAM_2_MASK, 0.95, true}, 103 {DREAM_3_TUMOR, Optional.of(DREAM_3_NORMAL), DREAM_3_TRUTH, DREAM_3_MASK, 0.90, false}, 104 {DREAM_4_TUMOR, Optional.of(DREAM_4_NORMAL), DREAM_4_TRUTH, DREAM_4_MASK, 0.65, false}, 105 {DREAM_4_TUMOR, Optional.of(DREAM_4_NORMAL), DREAM_4_TRUTH, DREAM_4_MASK, 0.65, true}, 106 {DREAM_4_TUMOR, Optional.empty(), DREAM_4_TRUTH, DREAM_4_MASK, 0.65, false}, 107 }; 108 } 109 110 /** 111 * Several DREAM challenge bams with synthetic truth data. In order to keep file sizes manageable, bams are restricted 112 * to chromosome 20, leaving ~100-200 variants, and then further restricted to 400-bp intervals centered around 113 * these variants. 114 * <p> 115 * Because this truth data is synthetic, ideal performance is perfect concordance with the truth vcfs. 116 * <p> 117 * The characteristics of the data are as follows (note that we removed structural variants from the original 118 * DREAM challenge vcfs): 119 * <p> 120 * Sample 1: pure monoclonal sample, SNVs only 121 * Sample 2: 80% pure monoclonal sample, SNVs only 122 * Sample 3: pure triclonal sample, subclone minor allele frequencies are 1/2, 1/3, and 1/5, SNVs and indels 123 * Sample 4: 80% biclonal sample, subclone minor allele fractions are 50% and 35%, SNVs and indels 124 * 125 */ 126 @Test(dataProvider = "dreamSyntheticData") testDreamTumorNormal(final File tumor, final Optional<File> normal, final File truth, final File mask, final double requiredSensitivity, final boolean errorCorrectReads)127 public void testDreamTumorNormal(final File tumor, final Optional<File> normal, final File truth, final File mask, 128 final double requiredSensitivity, final boolean errorCorrectReads) throws Exception { 129 Utils.resetRandomGenerator(); 130 final File unfilteredVcf = createTempFile("unfiltered", ".vcf"); 131 final File filteredVcf = createTempFile("filtered", ".vcf"); 132 final File f1r2Counts = createTempFile("f1r2", ".tar.gz"); 133 final File orientationModel = createTempFile("orientation", ".tar.gz"); 134 135 final List<File> normals = normal.isPresent() ? Collections.singletonList(normal.get()) : Collections.emptyList(); 136 runMutect2(Collections.singletonList(tumor), normals, unfilteredVcf, CHROMOSOME_20, b37Reference, Optional.of(GNOMAD), 137 args -> args.addMask(mask).add(M2ArgumentCollection.F1R2_TAR_GZ_NAME, f1r2Counts), 138 args -> errorCorrectReads ? args.add(ReadThreadingAssemblerArgumentCollection.PILEUP_ERROR_CORRECTION_LOG_ODDS_NAME, 3.0) : args 139 ); 140 141 // verify that alleles contained in likelihoods matrix but dropped from somatic calls do not show up in annotations 142 // also check that alleles have been properly clipped after dropping any non-called alleles, i.e. if we had AAA AA A 143 // and A got dropped, we need AAA AA -> AA A. The condition we don't want is that all alleles share a common first base 144 // and no allele has length 1. 145 VariantContextTestUtils.streamVcf(unfilteredVcf) 146 .forEach(vc -> { 147 for (final Genotype genotype : vc.getGenotypes()) { 148 final int[] f1r2 = ReadOrientationFilter.getF1R2(genotype); 149 Assert.assertEquals(f1r2.length, vc.getNAlleles()); 150 if (vc.getAlleles().stream().filter(a -> !a.isSymbolic()).map(a -> a.getBases()[0]).distinct().count() == 1) { 151 Assert.assertTrue(vc.getAlleles().stream().anyMatch(a -> a.getBases().length == 1)); 152 } 153 } 154 }); 155 156 final ArgumentsBuilder orientationBiasArgs = new ArgumentsBuilder().addInput(f1r2Counts).addOutput(orientationModel); 157 runCommandLine(orientationBiasArgs, LearnReadOrientationModel.class.getSimpleName()); 158 159 for (final boolean runOrientationFilter : new boolean[] { true, false}) { 160 161 runFilterMutectCalls(unfilteredVcf, filteredVcf, b37Reference, 162 args -> runOrientationFilter ? args.add(M2FiltersArgumentCollection.ARTIFACT_PRIOR_TABLE_NAME, orientationModel) : args); 163 164 final File concordanceSummary = createTempFile("concordance", ".txt"); 165 final File truePositivesFalseNegatives = createTempFile("tpfn", ".vcf"); 166 runConcordance(truth, filteredVcf,concordanceSummary, CHROMOSOME_20, mask, Optional.of (truePositivesFalseNegatives)); 167 168 final List<ConcordanceSummaryRecord> summaryRecords = new ConcordanceSummaryRecord.Reader(concordanceSummary.toPath()).toList(); 169 summaryRecords.forEach(rec -> { 170 if (rec.getTruePositives() + rec.getFalseNegatives() > 0) { 171 Assert.assertTrue(rec.getSensitivity() > requiredSensitivity); 172 // tumor-only will have germline variants sneak in 173 if (!normals.isEmpty()) { 174 Assert.assertTrue(rec.getPrecision() > 0.5); 175 } 176 } 177 }); 178 } 179 } 180 181 @Test testNA12878NormalNormalFiltering()182 public void testNA12878NormalNormalFiltering() { 183 Utils.resetRandomGenerator(); 184 final File unfilteredVcf = new File(FILTERING_DIR, "NA12878.vcf"); 185 final File contamination = new File(FILTERING_DIR, "contamination.table"); 186 final File segments = new File(FILTERING_DIR, "segments.table"); 187 188 final File filteredVcf = createTempFile("filtered", ".vcf"); 189 190 runFilterMutectCalls(unfilteredVcf, filteredVcf, b37Reference, 191 args -> args.add(M2FiltersArgumentCollection.TUMOR_SEGMENTATION_LONG_NAME, segments), 192 args -> args.add(M2FiltersArgumentCollection.CONTAMINATION_TABLE_LONG_NAME, contamination)); 193 194 final long numPassVariants = VariantContextTestUtils.streamVcf(filteredVcf) 195 .filter(vc -> vc.getFilters().isEmpty()).count(); 196 197 Assert.assertTrue(numPassVariants < 10); 198 } 199 200 // tumorBams, normalBam, truthVcf, mask, requiredSensitivity 201 @DataProvider(name = "twoTumorData") 202 public Object[][] twoTumorData() { 203 return new Object[][]{ 204 {Arrays.asList(DREAM_1_TUMOR, DREAM_2_TUMOR), Collections.singletonList(DREAM_1_NORMAL), DREAM_1_TRUTH, DREAM_1_MASK, 0.97}, 205 {Arrays.asList(DREAM_3_TUMOR, DREAM_4_TUMOR), Collections.singletonList(DREAM_3_NORMAL), DREAM_3_TRUTH, DREAM_3_MASK, 0.90} 206 }; 207 } 208 209 @Test(dataProvider = "twoTumorData") 210 public void testTwoDreamTumorSamples(final List<File> tumors, final List<File> normals, 211 final File truth, final File mask, final double requiredSensitivity) throws Exception { 212 Utils.resetRandomGenerator(); 213 final File unfilteredVcf = createTempFile("unfiltered", ".vcf"); 214 final File filteredVcf = createTempFile("filtered", ".vcf"); 215 216 runMutect2(tumors, normals, unfilteredVcf, CHROMOSOME_20, b37Reference, Optional.of(GNOMAD), args -> args.addMask(mask)); 217 runFilterMutectCalls(unfilteredVcf, filteredVcf, b37Reference); 218 219 final File concordanceSummary = createTempFile("concordance", ".txt"); 220 runConcordance(truth, filteredVcf, concordanceSummary, CHROMOSOME_20, mask, Optional.empty()); 221 222 final List<ConcordanceSummaryRecord> summaryRecords = new ConcordanceSummaryRecord.Reader(concordanceSummary.toPath()).toList(); 223 summaryRecords.forEach(rec -> { 224 if (rec.getTruePositives() + rec.getFalseNegatives() > 0) { 225 Assert.assertTrue(rec.getSensitivity() > requiredSensitivity); 226 // tumor-only will have germline variants sneak in 227 if (!normals.isEmpty()) { 228 //Assert.assertTrue(rec.getPrecision() > 0.5); 229 } 230 } 231 }); 232 } 233 234 // make a pon with a tumor and then use this pon to call somatic variants on the same tumor 235 // if the pon is doing its job all calls should be filtered by this pon 236 @Test testPon()237 public void testPon() { 238 Utils.resetRandomGenerator(); 239 final File tumor = DREAM_1_TUMOR; 240 final File normal = DREAM_1_NORMAL; 241 final File ponVcf = createTempFile("pon", ".vcf"); 242 final File unfilteredVcf = createTempFile("unfiltered", ".vcf"); 243 final File filteredVcf = createTempFile("filtered", ".vcf"); 244 245 runMutect2(tumor, normal, ponVcf, CHROMOSOME_20, b37Reference, Optional.empty()); 246 runMutect2(tumor, normal, unfilteredVcf, CHROMOSOME_20, b37Reference, Optional.empty(), 247 args -> args.add(M2ArgumentCollection.PANEL_OF_NORMALS_LONG_NAME, ponVcf)); 248 runFilterMutectCalls(unfilteredVcf, filteredVcf, b37Reference); 249 250 final long numVariants = VariantContextTestUtils.streamVcf(filteredVcf) 251 .filter(vc -> vc.getFilters().isEmpty()).count(); 252 253 Assert.assertEquals(numVariants, 0); 254 } 255 256 // run tumor-normal mode using the original DREAM synthetic sample 1 tumor and normal restricted to 257 // 1/3 of our dbSNP interval, in which there is only one true positive. 258 // we want to see that the number of false positives is small 259 @Test testTumorNormal()260 public void testTumorNormal() { 261 Utils.resetRandomGenerator(); 262 final File unfilteredVcf = createTempFile("output", ".vcf"); 263 final File filteredVcf = createTempFile("filtered", ".vcf"); 264 final List<File> tumor = Collections.singletonList(new File(DREAM_BAMS_DIR, "tumor.bam")); 265 final List<File> normals = Arrays.asList(new File(DREAM_BAMS_DIR, "normal.bam"), DREAM_2_NORMAL); 266 267 runMutect2(tumor, normals, unfilteredVcf, "20:10000000-10100000", b37Reference, Optional.empty()); 268 runFilterMutectCalls(unfilteredVcf, filteredVcf, b37Reference); 269 270 VariantContextTestUtils.streamVcf(unfilteredVcf).flatMap(vc -> vc.getGenotypes().stream()).forEach(g -> Assert.assertTrue(g.hasAD())); 271 final long numVariants = VariantContextTestUtils.streamVcf(filteredVcf) 272 .filter(VariantContext::isNotFiltered) 273 .count(); 274 Assert.assertTrue(numVariants < 4); 275 } 276 277 // run tumor-only using our mini gnomAD on NA12878, which is not a tumor 278 @Test 279 public void testTumorOnly() { 280 Utils.resetRandomGenerator(); 281 final File tumor = new File(NA12878_20_21_WGS_bam); 282 final File unfilteredVcf = createTempFile("unfiltered", ".vcf"); 283 final File filteredVcf = createTempFile("filtered", ".vcf"); 284 285 runMutect2(tumor, unfilteredVcf, "20:10000000-10010000", b37Reference, Optional.of(GNOMAD)); 286 runFilterMutectCalls(unfilteredVcf, filteredVcf, b37Reference); 287 288 final long numVariantsBeforeFiltering = VariantContextTestUtils.streamVcf(unfilteredVcf).count(); 289 290 final long numVariantsPassingFilters = VariantContextTestUtils.streamVcf(filteredVcf) 291 .filter(vc -> vc.getFilters().isEmpty()).count(); 292 293 // just a sanity check that this bam has some germline variants on this interval so that our test doesn't pass trivially! 294 Assert.assertTrue(numVariantsBeforeFiltering > 15); 295 296 // every variant on this interval in this sample is in gnomAD 297 Assert.assertTrue(numVariantsPassingFilters < 2); 298 } 299 300 // make sure we can call tumor alts when the normal has a different alt at the same site 301 // regression test for https://github.com/broadinstitute/gatk/issues/6901 302 @Test 303 public void testDifferentAltsInTumorAndNormal() { 304 Utils.resetRandomGenerator(); 305 final File tumor = new File(toolsTestDir, "mutect/different-alts-tumor.bam"); 306 final File normal = new File(toolsTestDir, "mutect/different-alts-normal.bam"); 307 final File output = createTempFile("output", ".vcf"); 308 309 310 runMutect2(tumor, normal, output, "20:10020000-10021200", b37Reference, Optional.empty(), 311 args -> args.add(M2ArgumentCollection.NORMAL_LOG_10_ODDS_LONG_NAME, 0.0)); 312 313 final Map<Integer, Allele> altAllelesByPosition = VariantContextTestUtils.streamVcf(output) 314 .collect(Collectors.toMap(VariantContext::getStart, VariantContext::getAltAlleleWithHighestAlleleCount)); 315 316 Assert.assertTrue(altAllelesByPosition.get(10020042).basesMatch(Allele.ALT_C)); //tumor G->C, normal G->A 317 Assert.assertTrue(altAllelesByPosition.get(10020124).basesMatch(Allele.ALT_G)); //tumor A->G, normal A->T 318 } 319 320 // test on an artificial bam with several contrived MNPs 321 @Test testMnps()322 public void testMnps() { 323 Utils.resetRandomGenerator(); 324 final File bam = new File(toolsTestDir, "mnp.bam"); 325 326 for (final int maxMnpDistance : new int[]{0, 1, 2, 3, 5}) { 327 final File outputVcf = createTempFile("unfiltered", ".vcf"); 328 329 runMutect2(bam, outputVcf, "20:10019000-10022000", b37Reference, Optional.empty(), 330 args -> args.add(M2ArgumentCollection.EMISSION_LOG_SHORT_NAME, 15), 331 args -> args.add(AssemblyBasedCallerArgumentCollection.MAX_MNP_DISTANCE_SHORT_NAME, maxMnpDistance)); 332 333 // note that for testing HaplotypeCaller GVCF mode we will always have the symbolic <NON REF> allele 334 final Map<Integer, List<String>> alleles = VariantContextTestUtils.streamVcf(outputVcf) 335 .collect(Collectors.toMap(VariantContext::getStart, vc -> vc.getAlternateAlleles().stream().filter(a -> !a.isSymbolic()).map(Allele::getBaseString).collect(Collectors.toList()))); 336 337 // phased, two bases apart 338 if (maxMnpDistance < 2) { 339 Assert.assertEquals(alleles.get(10019968), Collections.singletonList("G")); 340 Assert.assertEquals(alleles.get(10019970), Collections.singletonList("G")); 341 } else { 342 Assert.assertEquals(alleles.get(10019968), Collections.singletonList("GAG")); 343 Assert.assertTrue(!alleles.containsKey(10019970)); 344 } 345 346 // adjacent and out of phase 347 Assert.assertEquals(alleles.get(10020229), Collections.singletonList("A")); 348 Assert.assertEquals(alleles.get(10020230), Collections.singletonList("G")); 349 350 // 4-substitution MNP w/ spacings 2, 3, 4 351 if (maxMnpDistance < 2) { 352 Assert.assertEquals(alleles.get(10020430), Collections.singletonList("G")); 353 Assert.assertEquals(alleles.get(10020432), Collections.singletonList("G")); 354 Assert.assertEquals(alleles.get(10020435), Collections.singletonList("G")); 355 Assert.assertEquals(alleles.get(10020439), Collections.singletonList("G")); 356 } else if (maxMnpDistance < 3) { 357 Assert.assertEquals(alleles.get(10020430), Collections.singletonList("GAG")); 358 Assert.assertEquals(alleles.get(10020435), Collections.singletonList("G")); 359 Assert.assertEquals(alleles.get(10020439), Collections.singletonList("G")); 360 } else if (maxMnpDistance < 4) { 361 Assert.assertEquals(alleles.get(10020430), Collections.singletonList("GAGTTG")); 362 Assert.assertEquals(alleles.get(10020439), Collections.singletonList("G")); 363 } else { 364 Assert.assertEquals(alleles.get(10020430), Collections.singletonList("GAGTTGTCTG")); 365 } 366 367 // two out of phase DNPs that overlap and have a base in common 368 if (maxMnpDistance > 0) { 369 Assert.assertEquals(alleles.get(10020680), Collections.singletonList("TA")); 370 Assert.assertEquals(alleles.get(10020681), Collections.singletonList("AT")); 371 } 372 } 373 } 374 375 @Test testForceCalling()376 public void testForceCalling() { 377 Utils.resetRandomGenerator(); 378 final File tumor = new File(NA12878_20_21_WGS_bam); 379 380 // The kmerSize = 1 case is a ridiculous setting that forces assembly to fail. 381 for (final int kmerSize : new int[] {1, 20}) { 382 final File unfilteredVcf = createTempFile("unfiltered", ".vcf"); 383 final File forceCalls = new File(toolsTestDir, "mutect/gga_mode.vcf"); 384 385 runMutect2(tumor, unfilteredVcf, "20:9998500-10010000", b37Reference, Optional.empty(), 386 args -> args.add(AssemblyBasedCallerArgumentCollection.FORCE_CALL_ALLELES_LONG_NAME, forceCalls), 387 args -> args.add(ReadThreadingAssemblerArgumentCollection.KMER_SIZE_LONG_NAME, kmerSize), 388 args -> args.add(ReadThreadingAssemblerArgumentCollection.DONT_INCREASE_KMER_SIZE_LONG_NAME, true)); 389 390 final Map<Integer, List<Allele>> altAllelesByPosition = VariantContextTestUtils.streamVcf(unfilteredVcf) 391 .collect(Collectors.toMap(VariantContext::getStart, VariantContext::getAlternateAlleles)); 392 for (final VariantContext vc : new FeatureDataSource<VariantContext>(forceCalls)) { 393 final List<Allele> altAllelesAtThisLocus = altAllelesByPosition.get(vc.getStart()); 394 vc.getAlternateAlleles().stream().filter(a-> a.length() > 0 && BaseUtils.isNucleotide(a.getBases()[0])).forEach(a -> Assert.assertTrue(altAllelesAtThisLocus.contains(a))); 395 } 396 } 397 } 398 399 // test that the dont-use-soft-clips option actually does something 400 @Test testDontUseSoftClips()401 public void testDontUseSoftClips() { 402 Utils.resetRandomGenerator(); 403 final File tumor = new File(NA12878_20_21_WGS_bam); 404 final int start = 10050000; 405 406 final SimpleInterval interval = new SimpleInterval("20", start, start + 25000); 407 408 final File calls1 = createTempFile("unfiltered", ".vcf"); 409 runMutect2(tumor, calls1, interval.toString(), b37Reference, Optional.of(GNOMAD)); 410 411 final File calls2 = createTempFile("unfiltered", ".vcf"); 412 runMutect2(tumor, calls2, interval.toString(), b37Reference, Optional.of(GNOMAD), 413 args -> args.addFlag(AssemblyBasedCallerArgumentCollection.DONT_USE_SOFT_CLIPPED_BASES_LONG_NAME)); 414 415 final List<VariantContext> indelsWithSoftClips = VariantContextTestUtils.streamVcf(calls1).filter(VariantContext::isIndel).collect(Collectors.toList()); 416 final List<VariantContext> indelsWithoutSoftClips = VariantContextTestUtils.streamVcf(calls2).filter(VariantContext::isIndel).collect(Collectors.toList()); 417 418 Assert.assertTrue(indelsWithoutSoftClips.size() < indelsWithSoftClips.size()); 419 420 final int startOfDroppedVariant = 10068160; 421 final int endOfDroppedVariant = 10068174; 422 Assert.assertTrue(indelsWithSoftClips.stream().anyMatch(vc -> vc.getStart() == startOfDroppedVariant && vc.getEnd() == endOfDroppedVariant)); 423 Assert.assertFalse(indelsWithoutSoftClips.stream().anyMatch(vc -> vc.getStart() == startOfDroppedVariant && vc.getEnd() == endOfDroppedVariant)); 424 425 } 426 427 428 // make sure that force calling with given alleles that normally wouldn't be called due to complete lack of coverage 429 // doesn't run into any edge case bug involving empty likelihoods matrices 430 @Test testForceCallingZeroCoverage()431 public void testForceCallingZeroCoverage() { 432 Utils.resetRandomGenerator(); 433 final File unfilteredVcf = createTempFile("unfiltered", ".vcf"); 434 final File forceCalls = new File(toolsTestDir, "mutect/gga_mode_2.vcf"); 435 436 runMutect2(DREAM_3_TUMOR, unfilteredVcf, "20:1119000-1120000", b37Reference, Optional.empty(), 437 args -> args.add(AssemblyBasedCallerArgumentCollection.FORCE_CALL_ALLELES_LONG_NAME, forceCalls)); 438 } 439 440 // make sure we have fixed a bug where germline resources with AF=. throw errors 441 @Test testMissingAF()442 public void testMissingAF() { 443 final File unfilteredVcf = createTempFile("unfiltered", ".vcf"); 444 runMutect2(DREAM_4_TUMOR, unfilteredVcf, "20:1119000-1120000", b37Reference, Optional.of(GNOMAD_WITHOUT_AF_SNIPPET), 445 args -> args.addInterval(new SimpleInterval("20:10837425-10837426"))); 446 } 447 448 @Test testContaminationFilter()449 public void testContaminationFilter() { 450 Utils.resetRandomGenerator(); 451 final File unfilteredVcf = createTempFile("unfiltered", ".vcf"); 452 runMutect2(new File(NA12878_20_21_WGS_bam), unfilteredVcf, "20:10000000-20010000", b37Reference, Optional.of(GNOMAD)); 453 454 final Map<Integer, Set<VariantContext>> filteredVariants = Arrays.stream(new int[] {0, 5, 10}).boxed().collect(Collectors.toMap(pct -> pct, pct -> { 455 final File filteredVcf = createTempFile("filtered-" + pct, ".vcf"); 456 final File contaminationTable = pct == 0 ? NO_CONTAMINATION_TABLE : 457 (pct == 5 ? FIVE_PCT_CONTAMINATION_TABLE : TEN_PCT_CONTAMINATION_TABLE); 458 runFilterMutectCalls(unfilteredVcf, filteredVcf, b37Reference, 459 args -> args.add(M2FiltersArgumentCollection.CONTAMINATION_TABLE_LONG_NAME, contaminationTable)); 460 461 return VariantContextTestUtils.streamVcf(filteredVcf).collect(Collectors.toSet()); 462 })); 463 464 465 final int variantsFilteredAtZeroPercent = (int) filteredVariants.get(0).stream() 466 .filter(vc -> vc.getFilters().contains(GATKVCFConstants.CONTAMINATION_FILTER_NAME)) 467 .count(); 468 469 final List<VariantContext> variantsFilteredAtFivePercent = filteredVariants.get(5).stream() 470 .filter(vc -> vc.getFilters().contains(GATKVCFConstants.CONTAMINATION_FILTER_NAME)).collect(Collectors.toList()); 471 Assert.assertEquals(variantsFilteredAtZeroPercent, 0); 472 Assert.assertTrue(variantsFilteredAtFivePercent.size() < 473 filteredVariants.get(10).stream().filter(vc -> vc.getFilters().contains(GATKVCFConstants.CONTAMINATION_FILTER_NAME)).count()); 474 475 final List<VariantContext> missedObviousVariantsAtTenPercent = filteredVariants.get(10).stream() 476 .filter(vc -> !vc.getFilters().contains(GATKVCFConstants.CONTAMINATION_FILTER_NAME)) 477 .filter(VariantContext::isBiallelic) 478 .filter(vc -> { 479 final int[] AD = vc.getGenotype(0).getAD(); 480 return AD[1] < 0.15 * AD[0]; 481 }).collect(Collectors.toList()); 482 483 Assert.assertTrue(missedObviousVariantsAtTenPercent.isEmpty()); 484 485 // If the filter is smart, it won't filter variants with allele fraction much higher than the contamination 486 final List<VariantContext> highAlleleFractionFilteredVariantsAtFivePercent = variantsFilteredAtFivePercent.stream() 487 .filter(VariantContext::isBiallelic) 488 .filter(vc -> vc.getAttributeAsDouble(GATKVCFConstants.CONTAMINATION_QUAL_KEY, 100) < 30) 489 .filter(vc -> { 490 final int[] AD = vc.getGenotype(0).getAD(); 491 return MathUtils.sum(AD) > 30 && AD[1] > AD[0]; 492 }).collect(Collectors.toList()); 493 494 Assert.assertTrue(highAlleleFractionFilteredVariantsAtFivePercent.isEmpty()); 495 } 496 497 // test that ReadFilterLibrary.NON_ZERO_REFERENCE_LENGTH_ALIGNMENT removes reads that consume zero reference bases 498 // e.g. read name HAVCYADXX150109:1:2102:20528:2129 with cigar 23S53I 499 @Test testReadsThatConsumeZeroReferenceReads()500 public void testReadsThatConsumeZeroReferenceReads() { 501 final File bam = new File(publicTestDir + "org/broadinstitute/hellbender/tools/mutect/na12878-chr20-consumes-zero-reference-bases.bam"); 502 final File outputVcf = createTempFile("output", ".vcf"); 503 504 runMutect2(bam, outputVcf, CHROMOSOME_20, b37Reference, Optional.empty()); 505 } 506 507 // make sure that unpaired reads that pass filtering do not cause errors 508 // in particular, the read HAVCYADXX150109:1:1109:11610:46575 with SAM flag 16 fails without the patch 509 @Test testUnpairedReads()510 public void testUnpairedReads() { 511 final File bam = new File(toolsTestDir + "unpaired.bam"); 512 final File outputVcf = createTempFile("output", ".vcf"); 513 514 runMutect2(bam, outputVcf, CHROMOSOME_20, b37Reference, Optional.empty()); 515 } 516 517 // some bams from external pipelines use faulty adapter trimming programs that introduce identical repeated reads 518 // into bams. Although these bams fail the Picard tool ValidateSamFile, one can run HaplotypeCaller and Mutect on them 519 // and get fine results. This test ensures that this remains the case. The test bam is a small chunk of reads surrounding 520 // a germline SNP in NA12878, where we have duplicated 40 of the reads. (In practice bams of this nature might have one bad read 521 // per megabase). 522 @Test testBamWithRepeatedReads()523 public void testBamWithRepeatedReads() { 524 final File bam = new File(toolsTestDir + "mutect/repeated_reads.bam"); 525 final File outputVcf = createTempFile("output", ".vcf"); 526 527 runMutect2(bam, outputVcf, "20:10018000-10020000", b37Reference, Optional.empty()); 528 } 529 530 // In the rare case that a particular fragment is only supported by supplementary reads, that should not 531 // result in an exception. This test ensures that Mutect2 does not fail to finish in that case. 532 @Test testBamWithOnlySupplementaryReads()533 public void testBamWithOnlySupplementaryReads() { 534 final File bam = new File(toolsTestDir + "mutect/only_supplementary_reads.bam"); 535 final File outputVcf = createTempFile("output", ".vcf"); 536 537 runMutect2(bam, outputVcf, "20:10018000-10020000", b37Reference, Optional.empty()); 538 } 539 540 // basic test on a small chunk of NA12878 mitochondria. This is not a validation, but rather a sanity check 541 // that M2 makes obvious calls, doesn't trip up on the beginning of the circular chromosome, and can handle high depth 542 @Test testMitochondria()543 public void testMitochondria() { 544 Utils.resetRandomGenerator(); 545 final File unfilteredVcf = createTempFile("unfiltered", ".vcf"); 546 547 runMutect2(NA12878_MITO_BAM, unfilteredVcf, "chrM:1-1000", MITO_REF.getAbsolutePath(), Optional.empty(), 548 args -> args.add(M2ArgumentCollection.MITOCHONDRIA_MODE_LONG_NAME, true)); 549 550 final List<VariantContext> variants = VariantContextTestUtils.streamVcf(unfilteredVcf).collect(Collectors.toList()); 551 final List<String> variantKeys = variants.stream().map(VariantContextTestUtils::keyForVariant).collect(Collectors.toList()); 552 553 final List<String> expectedKeys = Arrays.asList( 554 "chrM:152-152 T*, [C]", 555 "chrM:263-263 A*, [G]", 556 "chrM:302-302 A*, [AC, ACC, ACCCCCCCCCCCCC, C]", 557 "chrM:310-310 T*, [C, TC]", 558 "chrM:750-750 A*, [G]"); 559 Assert.assertTrue(variantKeys.containsAll(expectedKeys)); 560 561 Assert.assertEquals(variants.get(0).getAttributeAsInt(GATKVCFConstants.ORIGINAL_CONTIG_MISMATCH_KEY, 0), 1741); 562 } 563 564 @DataProvider(name = "vcfsForFiltering") vcfsForFiltering()565 public Object[][] vcfsForFiltering() { 566 return new Object[][]{ 567 {NA12878_MITO_VCF, 0.5, Collections.emptyList(), Arrays.asList( 568 ImmutableSet.of(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME, GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME), 569 Collections.emptySet(), 570 ImmutableSet.of( GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME, 571 GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME), 572 Collections.emptySet(), 573 Collections.emptySet(), 574 ImmutableSet.of(GATKVCFConstants.DUPLICATED_EVIDENCE_FILTER_NAME), 575 ImmutableSet.of(GATKVCFConstants.FAIL)), 576 Arrays.asList( 577 Arrays.asList(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME), // strand_bias, strict_stand 578 Arrays.asList(GATKVCFConstants.SITE_LEVEL_FILTERS), // SITE 579 Arrays.asList(GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME), // weak_evidence, low_allele_frac 580 Arrays.asList(GATKVCFConstants.SITE_LEVEL_FILTERS, GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME, GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME + ", " + GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME), // SITE|weak_evidence, strand_bias, low_allele_frac|strand_bias, strict_strand, low_allele_frac 581 Arrays.asList(GATKVCFConstants.SITE_LEVEL_FILTERS), // SITE 582 Arrays.asList(GATKVCFConstants.DUPLICATED_EVIDENCE_FILTER_NAME), // duplicate 583 Arrays.asList(GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME, GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME) // weak_evidence, strand_bias, strict_stand|low_allele_frac 584 585 )}, 586 {NA12878_MITO_GVCF, .0009, Arrays.asList("MT:1", "MT:37", "MT:40", "MT:152", "MT:157"), Arrays.asList( 587 Collections.emptySet(), 588 ImmutableSet.of(GATKVCFConstants.MEDIAN_BASE_QUALITY_FILTER_NAME, GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME, GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME), 589 ImmutableSet.of(GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME,GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME, GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME), 590 Collections.emptySet(), 591 ImmutableSet.of(GATKVCFConstants.MEDIAN_BASE_QUALITY_FILTER_NAME, GATKVCFConstants.CONTAMINATION_FILTER_NAME, GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME, 592 GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME, GATKVCFConstants.READ_POSITION_FILTER_NAME, GATKVCFConstants.MEDIAN_MAPPING_QUALITY_FILTER_NAME, GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME)), 593 Arrays.asList( 594 Arrays.asList(GATKVCFConstants.SITE_LEVEL_FILTERS), // SITE, 595 Arrays.asList(GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.MEDIAN_BASE_QUALITY_FILTER_NAME + ", " + GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME, GATKVCFConstants.SITE_LEVEL_FILTERS), //"weak_evidence, base_qual, strand_bias|SITE", 596 Arrays.asList(GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME, GATKVCFConstants.SITE_LEVEL_FILTERS), // "weak_evidence, strict_strand, strand_bias|SITE", 597 Arrays.asList(GATKVCFConstants.SITE_LEVEL_FILTERS, GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.MEDIAN_BASE_QUALITY_FILTER_NAME + ", " + GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME, GATKVCFConstants.SITE_LEVEL_FILTERS), //".|weak_evidence, base_qual, strand_bias, low_allele_frac|SITE", 598 Arrays.asList(GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.MEDIAN_BASE_QUALITY_FILTER_NAME + ", " + GATKVCFConstants.MEDIAN_MAPPING_QUALITY_FILTER_NAME + ", " + GATKVCFConstants.CONTAMINATION_FILTER_NAME + ", " + GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME + ", " + GATKVCFConstants.READ_POSITION_FILTER_NAME + ", " + GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME, GATKVCFConstants.SITE_LEVEL_FILTERS) // "weak_evidence, base_qual, map_qual, contamination, strand_artifact, position, low_allele_frac|SITE" 599 )} 600 }; 601 } 602 603 @Test(dataProvider = "vcfsForFiltering") testFilterMitochondria(File unfiltered, final double minAlleleFraction, final List<String> intervals, List<Set<String>> expectedFilters, List<List<String>> expectedASFilters)604 public void testFilterMitochondria(File unfiltered, final double minAlleleFraction, final List<String> intervals, List<Set<String>> expectedFilters, List<List<String>> expectedASFilters) { 605 final File filteredVcf = createTempFile("filtered", ".vcf"); 606 607 // vcf sequence dicts don't match ref 608 runFilterMutectCalls(unfiltered, filteredVcf, MITO_REF.getAbsolutePath(), 609 args -> args.add(M2ArgumentCollection.MITOCHONDRIA_MODE_LONG_NAME, true), 610 args -> args.add(StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, true), 611 args -> args.add(M2FiltersArgumentCollection.MIN_AF_LONG_NAME, minAlleleFraction), 612 args -> args.add(M2FiltersArgumentCollection.MIN_READS_ON_EACH_STRAND_LONG_NAME, 1), 613 args -> args.add(M2FiltersArgumentCollection.UNIQUE_ALT_READ_COUNT_LONG_NAME, 2), 614 args -> { 615 intervals.stream().map(SimpleInterval::new).forEach(args::addInterval); 616 return args; 617 }); 618 619 final List<Set<String>> actualFilters = VariantContextTestUtils.streamVcf(filteredVcf) 620 .map(VariantContext::getFilters).collect(Collectors.toList()); 621 622 final List<List<String>> actualASFilters = VariantContextTestUtils.streamVcf(filteredVcf) 623 .map(vc -> AnnotationUtils.decodeAnyASListWithRawDelim(vc.getCommonInfo().getAttributeAsString(GATKVCFConstants.AS_FILTER_STATUS_KEY, ""))).collect(Collectors.toList()); 624 Assert.assertEquals(actualASFilters, expectedASFilters); 625 626 Assert.assertEquals(actualFilters.size(), expectedFilters.size()); 627 for (int n = 0; n < actualFilters.size(); n++) { 628 Assert.assertTrue(actualFilters.get(n).containsAll(expectedFilters.get(n)), "Actual filters missing some expected filters: " + SetUtils.difference(expectedFilters.get(n), actualFilters.get(n))); 629 Assert.assertTrue(expectedFilters.get(n).containsAll(actualFilters.get(n)), "Expected filters missing some actual filters: " + SetUtils.difference(actualFilters.get(n), expectedFilters.get(n))); 630 } 631 632 Assert.assertEquals(actualFilters, expectedFilters); 633 } 634 635 @DataProvider(name = "vcfsForNuMTFiltering") vcfsForNuMTFiltering()636 public Object[][] vcfsForNuMTFiltering() { 637 return new Object[][]{ 638 {NA12878_MITO_INITIAL_FILTERED_VCF, 30, Collections.emptyList(), Arrays.asList( 639 ImmutableSet.of(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME, GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME), 640 Collections.emptySet(), 641 ImmutableSet.of( GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME, 642 GATKVCFConstants.POSSIBLE_NUMT_FILTER_NAME, 643 GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME), 644 Collections.emptySet(), 645 Collections.emptySet(), 646 ImmutableSet.of(GATKVCFConstants.DUPLICATED_EVIDENCE_FILTER_NAME), 647 ImmutableSet.of(GATKVCFConstants.FAIL)), 648 Arrays.asList( 649 Arrays.asList(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME), // strand_bias, strict_stand 650 Arrays.asList(GATKVCFConstants.SITE_LEVEL_FILTERS), // SITE 651 Arrays.asList(GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME + ", " + GATKVCFConstants.POSSIBLE_NUMT_FILTER_NAME), // weak_evidence, low_allele_frac, possible_numt 652 Arrays.asList(GATKVCFConstants.SITE_LEVEL_FILTERS, GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME + ", " + GATKVCFConstants.POSSIBLE_NUMT_FILTER_NAME, GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME + ", " + GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME + ", " + GATKVCFConstants.POSSIBLE_NUMT_FILTER_NAME), // SITE|weak_evidence, strand_bias, low_allele_frac, possible_numt|strand_bias, strict_strand, low_allele_frac, possible_numt 653 Arrays.asList(GATKVCFConstants.SITE_LEVEL_FILTERS), // SITE 654 Arrays.asList(GATKVCFConstants.DUPLICATED_EVIDENCE_FILTER_NAME), // duplicate 655 Arrays.asList(GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME + ", " + GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME + ", " + GATKVCFConstants.STRICT_STRAND_BIAS_FILTER_NAME, GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME + ", " + GATKVCFConstants.POSSIBLE_NUMT_FILTER_NAME) // weak_evidence, strand_bias, strict_stand|low_allele_frac, possible_numt 656 657 )} 658 }; 659 } 660 661 @Test(dataProvider = "vcfsForNuMTFiltering") testNuMTFilterMitochondria(File initialFilters, final double autosomalCoverage, final List<String> intervals, List<Set<String>> expectedFilters, List<List<String>> expectedASFilters)662 public void testNuMTFilterMitochondria(File initialFilters, final double autosomalCoverage, final List<String> intervals, List<Set<String>> expectedFilters, List<List<String>> expectedASFilters) { 663 final File filteredVcf = createTempFile("filtered", ".vcf"); 664 665 final ArgumentsBuilder args = new ArgumentsBuilder() 666 .addVCF(initialFilters) 667 .addOutput(filteredVcf) 668 .addReference(MITO_REF.getAbsolutePath()) 669 .add(NuMTFilterTool.MEDIAN_AUTOSOMAL_COVERAGE_LONG_NAME, autosomalCoverage) 670 .add(NuMTFilterTool.MAX_NUMT_COPIES_IN_AUTOSOME_LONG_NAME, 4.0); 671 672 intervals.stream().map(SimpleInterval::new).forEach(args::addInterval); 673 674 // vcf sequence dicts don't match ref 675 runCommandLine(args, NuMTFilterTool.class.getSimpleName()); 676 677 final List<Set<String>> actualFilters = VariantContextTestUtils.streamVcf(filteredVcf) 678 .map(VariantContext::getFilters).collect(Collectors.toList()); 679 680 final List<List<String>> actualASFilters = VariantContextTestUtils.streamVcf(filteredVcf) 681 .map(vc -> AnnotationUtils.decodeAnyASListWithRawDelim(vc.getCommonInfo().getAttributeAsString(GATKVCFConstants.AS_FILTER_STATUS_KEY, ""))).collect(Collectors.toList()); 682 Assert.assertEquals(actualASFilters, expectedASFilters); 683 684 Assert.assertEquals(actualFilters.size(), expectedFilters.size()); 685 for (int n = 0; n < actualFilters.size(); n++) { 686 Assert.assertTrue(actualFilters.get(n).containsAll(expectedFilters.get(n)), "Actual filters missing some expected filters: " + SetUtils.difference(expectedFilters.get(n), actualFilters.get(n))); 687 Assert.assertTrue(expectedFilters.get(n).containsAll(actualFilters.get(n)), "Expected filters missing some actual filters: " + SetUtils.difference(actualFilters.get(n), expectedFilters.get(n))); 688 } 689 690 Assert.assertEquals(actualFilters, expectedFilters); 691 } 692 693 @Test testMitochondrialRefConf()694 public void testMitochondrialRefConf() { 695 Utils.resetRandomGenerator(); 696 final File standardVcf = createTempFile("standard", ".vcf"); 697 final File unthresholded = createTempFile("unthresholded", ".vcf"); 698 final double minAF = 0.01; 699 700 runMutect2(NA12878_MITO_BAM, standardVcf, "chrM:1-1000", MITO_REF.getAbsolutePath(), Optional.empty(), 701 args -> args.add(AssemblyBasedCallerArgumentCollection.EMIT_REF_CONFIDENCE_LONG_NAME, ReferenceConfidenceMode.GVCF.toString()), 702 args -> args.add(M2ArgumentCollection.MINIMUM_ALLELE_FRACTION_LONG_NAME, minAF), 703 args -> args.add(M2ArgumentCollection.LOD_BAND_LONG_NAME, -2.0), 704 args -> args.add(M2ArgumentCollection.LOD_BAND_LONG_NAME, 0.0)); 705 706 //check ref conf-specific headers are output 707 final Pair<VCFHeader, List<VariantContext>> result = VariantContextTestUtils.readEntireVCFIntoMemory(standardVcf.getAbsolutePath()); 708 Assert.assertTrue(result.getLeft().hasFormatLine(GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY)); 709 Assert.assertTrue(result.getLeft().getMetaDataLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG) != null); 710 711 final List<VariantContext> variants = result.getRight(); 712 final Map<String, VariantContext> variantMap = variants.stream().collect(Collectors.toMap(VariantContextTestUtils::keyForVariant, Function.identity())); 713 final List<String> variantKeys = new ArrayList<>(variantMap.keySet()); 714 715 final List<String> expectedKeys = Arrays.asList( 716 "chrM:152-152 T*, [<NON_REF>, C]", 717 "chrM:263-263 A*, [<NON_REF>, G]", 718 //"chrM:297-297 A*, [<NON_REF>, AC, C]", 719 //"chrM:301-301 A*, [<NON_REF>, AC, ACC, ACCC]", 720 "chrM:302-302 A*, [<NON_REF>, AC, ACC, ACCC, C]", //one of these commented out variants has an allele that only appears in debug mode 721 "chrM:310-310 T*, [<NON_REF>, TC]", 722 "chrM:750-750 A*, [<NON_REF>, G]"); 723 Assert.assertTrue(variantKeys.containsAll(expectedKeys)); 724 //First entry should be a homRef block 725 Assert.assertTrue(VariantContextTestUtils.keyForVariant(variants.get(0)).contains("*, [<NON_REF>]")); 726 727 final ArgumentsBuilder validateVariantsArgs = new ArgumentsBuilder() 728 .add("R", MITO_REF.getAbsolutePath()) 729 .add("V", standardVcf.getAbsolutePath()) 730 .add("L", IntervalUtils.locatableToString(new SimpleInterval("chrM:1-1000"))) 731 .addRaw("-gvcf"); 732 runCommandLine(validateVariantsArgs, ValidateVariants.class.getSimpleName()); 733 734 runMutect2(NA12878_MITO_BAM, unthresholded, "chrM:1-1000", MITO_REF.getAbsolutePath(), Optional.empty(), 735 args -> args.add(AssemblyBasedCallerArgumentCollection.EMIT_REF_CONFIDENCE_LONG_NAME, ReferenceConfidenceMode.GVCF.toString()), 736 args -> args.add(M2ArgumentCollection.MINIMUM_ALLELE_FRACTION_LONG_NAME, 0.00), 737 args -> args.add(M2ArgumentCollection.LOD_BAND_LONG_NAME, -2.0), 738 args -> args.add(M2ArgumentCollection.LOD_BAND_LONG_NAME, 0.0)); 739 740 final Pair<VCFHeader, List<VariantContext>> result_noThreshold = VariantContextTestUtils.readEntireVCFIntoMemory(unthresholded.getAbsolutePath()); 741 742 final Map<String, VariantContext> variantMap2 = result_noThreshold.getRight().stream().collect(Collectors.toMap(VariantContextTestUtils::keyForVariant, Function.identity())); 743 744 //TLODs for variants should not change too much for variant allele, should change significantly for non-ref 745 // however, there are edge cases where this need not be true (this might indicate the need to fix our 746 // LOD calculation for the NON-REF allele), so we allow one anomalous site 747 final long changedRegularAlleleLodCount = expectedKeys.stream() 748 .filter(key -> !onlyNonRefTlodsChange(variantMap.get(key), variantMap2.get(key), minAF)) 749 .count(); 750 751 Assert.assertTrue(changedRegularAlleleLodCount <= 1); 752 753 final List<String> expectedRefKeys = Arrays.asList( 754 //ref blocks will be dependent on TLOD band values 755 "chrM:218-218 A*, [<NON_REF>]", 756 "chrM:264-266 C*, [<NON_REF>]", 757 "chrM:475-483 A*, [<NON_REF>]", 758 "chrM:488-492 T*, [<NON_REF>]"); 759 760 //ref block boundaries aren't particularly stable, so try a few and make sure we check at least one 761 final List<String> refBlockKeys = expectedRefKeys.stream() 762 .filter(key -> variantMap.containsKey(key) && variantMap2.containsKey(key)) 763 .collect(Collectors.toList()); 764 Assert.assertFalse(refBlockKeys.isEmpty()); 765 766 refBlockKeys.forEach(key -> Assert.assertTrue(onlyNonRefTlodsChange(variantMap.get(key), variantMap2.get(key), minAF))); 767 } 768 onlyNonRefTlodsChange(final VariantContext v1, final VariantContext v2, final double minAF)769 private boolean onlyNonRefTlodsChange(final VariantContext v1, final VariantContext v2, final double minAF) { 770 if (v1 == null || v2 == null || !v1.getReference().equals(v2.getReference()) || 771 !(v1.getAlternateAlleles().size() == v2.getAlternateAlleles().size())) { 772 return false; 773 } 774 775 //ref blocks have TLOD in format field 776 final boolean isHomRef = v1.getGenotype(0).isHomRef(); 777 final double[] tlods1 = !isHomRef ? VariantContextGetters.getAttributeAsDoubleArray(v1, GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY) 778 : new double[]{VariantContextGetters.getAttributeAsDouble(v1.getGenotype(0), GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, 0)}; 779 final double[] tlods2 = !isHomRef ? VariantContextGetters.getAttributeAsDoubleArray(v2, GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY) 780 : new double[]{VariantContextGetters.getAttributeAsDouble(v2.getGenotype(0), GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, 0)}; 781 782 final double[] af1 = VariantContextGetters.getAttributeAsDoubleArray(v1, GATKVCFConstants.ALLELE_FRACTION_KEY, () -> null, 0); 783 final double[] af2 = VariantContextGetters.getAttributeAsDoubleArray(v2, GATKVCFConstants.ALLELE_FRACTION_KEY, () -> null, 0); 784 785 for (int i = 0; i < v1.getAlternateAlleles().size(); i++) { 786 if (!v1.getAlternateAllele(i).equals(v2.getAlternateAllele(i))) { 787 return false; 788 } 789 //we expect the AF threshold to have a significant effect on the NON_REF TLOD, but only for that allele 790 if (!v1.getAlternateAllele(i).equals(Allele.NON_REF_ALLELE)) { 791 if (tlods1[i] > 0) { 792 if (Math.abs(tlods1[i] - tlods2[i]) / tlods1[i] > VARIANT_TLOD_MATCH_PCT) { 793 return false; 794 } 795 } else if (af2 != null && af2[i] > minAF && Math.abs(tlods1[i] - tlods2[i]) > TLOD_MATCH_EPSILON) { 796 return false; 797 } 798 } else { 799 if (Math.abs(tlods1[i] - tlods2[i]) < TLOD_MATCH_EPSILON) { 800 return false; 801 } 802 } 803 } 804 return true; 805 } 806 807 @Test 808 @SuppressWarnings("deprecation") testAFAtHighDP()809 public void testAFAtHighDP() { 810 Utils.resetRandomGenerator(); 811 final File unfilteredVcf = createTempFile("unfiltered", ".vcf"); 812 813 runMutect2(DEEP_MITO_BAM, unfilteredVcf, "chrM:1-1018", MITO_REF.getAbsolutePath(), Optional.empty(), 814 args -> args.add(IntervalArgumentCollection.INTERVAL_PADDING_LONG_NAME, 300)); 815 816 final List<VariantContext> variants = VariantContextTestUtils.streamVcf(unfilteredVcf).collect(Collectors.toList()); 817 818 for (final VariantContext vc : variants) { 819 Assert.assertTrue(vc.isBiallelic()); //I do some lazy parsing below that won't hold for multiple alternate alleles 820 final Genotype g = vc.getGenotype(DEEP_MITO_SAMPLE_NAME); 821 Assert.assertTrue(g.hasAD()); 822 final int[] ADs = g.getAD(); 823 Assert.assertTrue(g.hasExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY)); 824 //Assert.assertEquals(Double.parseDouble(String.valueOf(vc.getGenotype(DEEP_MITO_SAMPLE_NAME).getExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY,"0"))), (double)ADs[1]/(ADs[0]+ADs[1]), 1e-6); 825 Assert.assertEquals(Double.parseDouble(String.valueOf(vc.getGenotype(DEEP_MITO_SAMPLE_NAME).getAttributeAsString(GATKVCFConstants.ALLELE_FRACTION_KEY, "0"))), (double) ADs[1] / (ADs[0] + ADs[1]), 2e-3); 826 } 827 } 828 829 // check that the somatic clustering model works with high-depth, low-AF cfDNA clustering 830 @Test testBloodBiopsyFiltering()831 public void testBloodBiopsyFiltering() { 832 final File unfiltered = new File(toolsTestDir, "mutect/cfdna/cfdna-unfiltered.vcf"); 833 final File filtered = createTempFile("filtered", ".vcf"); 834 835 runFilterMutectCalls(unfiltered, filtered, b37Reference); 836 837 final Map<Integer, Set<String>> filtersBySite = VariantContextTestUtils.streamVcf(filtered).collect(Collectors.toMap(VariantContext::getStart, VariantContext::getFilters)); 838 839 // these are sites that caused trouble for a previous version of the somatic clustering model 840 final List<Integer> lowAFSitesWeShouldNotCall = Arrays.asList(25963056, 47162531, 142178205, 151841902, 31325209, 41521982); 841 for (final int site : lowAFSitesWeShouldNotCall) { 842 Assert.assertTrue(filtersBySite.get(site).contains(GATKVCFConstants.TUMOR_EVIDENCE_FILTER_NAME)); 843 } 844 } 845 846 @Test testBamout()847 public void testBamout() { 848 final File outputVcf = createTempFile("output", ".vcf"); 849 final File bamout = createTempFile("bamout", ".bam"); 850 851 runMutect2(DREAM_1_TUMOR, outputVcf, "20:10000000-13000000", b37Reference, Optional.empty(), 852 args -> args.add(AssemblyBasedCallerArgumentCollection.BAM_OUTPUT_LONG_NAME, bamout)); 853 Assert.assertTrue(bamout.exists()); 854 } 855 856 @Test testFilteringHeaders()857 public void testFilteringHeaders() { 858 Utils.resetRandomGenerator(); 859 final File tumor = new File(NA12878_20_21_WGS_bam); 860 final File unfilteredVcf = createTempFile("unfiltered", ".vcf"); 861 final File filteredVcf = createTempFile("filtered", ".vcf"); 862 863 runMutect2(tumor, unfilteredVcf, "20:10000000-10010000", b37Reference, Optional.of(GNOMAD)); 864 runFilterMutectCalls(unfilteredVcf, filteredVcf, b37Reference); 865 866 final VCFHeader filteredHeader = VariantContextTestUtils.getVCFHeader(filteredVcf.getAbsolutePath()); 867 //explicit check for PASS header 868 Assert.assertTrue(filteredHeader.hasFilterLine(VCFConstants.PASSES_FILTERS_v4)); 869 for (final String header : GATKVCFConstants.MUTECT_FILTER_NAMES){ 870 Assert.assertTrue(filteredHeader.hasFilterLine(header)); 871 } 872 } 873 874 @SafeVarargs runMutect2(final List<File> tumors, final List<File> normals, final File output, final String interval, final String reference, final Optional<File> gnomad, final Function<ArgumentsBuilder, ArgumentsBuilder>... appendExtraArguments)875 final private void runMutect2(final List<File> tumors, final List<File> normals, final File output, 876 final String interval, final String reference, 877 final Optional<File> gnomad, final Function<ArgumentsBuilder, ArgumentsBuilder>... appendExtraArguments) { 878 final ArgumentsBuilder args = new ArgumentsBuilder() 879 .addOutput(output) 880 .addReference(reference); 881 882 tumors.forEach(args::addInput); 883 884 normals.forEach(normal -> { 885 args.addInput(normal); 886 args.add(M2ArgumentCollection.NORMAL_SAMPLE_LONG_NAME, getSampleName(normal)); 887 }); 888 889 gnomad.ifPresent(g -> args.add(M2ArgumentCollection.GERMLINE_RESOURCE_LONG_NAME, g)); 890 891 args.addInterval(new SimpleInterval(interval)); 892 893 ArgumentsBuilder argsWithAdditions = args; 894 895 for (final Function<ArgumentsBuilder, ArgumentsBuilder> extraArgument : appendExtraArguments) { 896 argsWithAdditions = extraArgument.apply(args); 897 } 898 899 runCommandLine(argsWithAdditions); 900 } 901 902 @SafeVarargs runMutect2(final File tumor, final File normal, final File output, final String interval, final String reference, final Optional<File> gnomad, final Function<ArgumentsBuilder, ArgumentsBuilder>... appendExtraArguments)903 final private void runMutect2(final File tumor, final File normal, final File output, final String interval, final String reference, 904 final Optional<File> gnomad, final Function<ArgumentsBuilder, ArgumentsBuilder>... appendExtraArguments) { 905 runMutect2(Collections.singletonList(tumor), Collections.singletonList(normal), output, interval, reference, gnomad, appendExtraArguments); 906 } 907 908 @SafeVarargs runMutect2(final File tumor, final File output, final String interval, final String reference, final Optional<File> gnomad, final Function<ArgumentsBuilder, ArgumentsBuilder>... appendExtraArguments)909 final private void runMutect2(final File tumor, final File output, final String interval, final String reference, 910 final Optional<File> gnomad, final Function<ArgumentsBuilder, ArgumentsBuilder>... appendExtraArguments) { 911 runMutect2(Collections.singletonList(tumor), Collections.emptyList(), output, interval, reference, gnomad, appendExtraArguments); 912 } 913 914 @SafeVarargs runFilterMutectCalls(final File unfilteredVcf, final File filteredVcf, final String reference, final Function<ArgumentsBuilder, ArgumentsBuilder>... appendExtraArguments)915 final private void runFilterMutectCalls(final File unfilteredVcf, final File filteredVcf, final String reference, 916 final Function<ArgumentsBuilder, ArgumentsBuilder>... appendExtraArguments) { 917 final ArgumentsBuilder args = new ArgumentsBuilder() 918 .addVCF(unfilteredVcf) 919 .addOutput(filteredVcf) 920 .addReference(reference); 921 922 ArgumentsBuilder argsWithAdditions = args; 923 924 for (final Function<ArgumentsBuilder, ArgumentsBuilder> extraArgument : appendExtraArguments) { 925 argsWithAdditions = extraArgument.apply(args); 926 } 927 928 runCommandLine(argsWithAdditions, FilterMutectCalls.class.getSimpleName()); 929 } 930 runConcordance(final File truth, final File eval, final File summary, final String interval, final File mask, final Optional<File> truePositivesFalseNegatives)931 private void runConcordance(final File truth, final File eval, final File summary, final String interval, final File mask, final Optional<File> truePositivesFalseNegatives) { 932 final ArgumentsBuilder concordanceArgs = new ArgumentsBuilder() 933 .add(Concordance.TRUTH_VARIANTS_LONG_NAME, truth) 934 .add(Concordance.EVAL_VARIANTS_LONG_NAME, eval) 935 .addInterval(new SimpleInterval(interval)) 936 .add(IntervalArgumentCollection.EXCLUDE_INTERVALS_LONG_NAME, mask) 937 .add(Concordance.SUMMARY_LONG_NAME, summary); 938 939 truePositivesFalseNegatives.ifPresent(file -> concordanceArgs.add(Concordance.TRUE_POSITIVES_AND_FALSE_NEGATIVES_SHORT_NAME, file)); 940 runCommandLine(concordanceArgs, Concordance.class.getSimpleName()); 941 } 942 getSampleName(final File bam)943 private String getSampleName(final File bam) { 944 try { 945 final File nameFile = createTempFile("sample_name", ".txt"); 946 new Main().instanceMain(makeCommandLineArgs(Arrays.asList("-I", bam.getAbsolutePath(), "-O", nameFile.getAbsolutePath(), "-encode"), "GetSampleName")); 947 return Files.readAllLines(nameFile.toPath()).get(0); 948 } catch (final IOException ex) { 949 throw new IllegalArgumentException(ex); 950 } 951 } 952 } 953