1 package org.broadinstitute.hellbender.tools.spark.pipelines; 2 3 import htsjdk.samtools.ValidationStringency; 4 import org.broadinstitute.hellbender.CommandLineProgramTest; 5 import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection; 6 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.HaplotypeCallerIntegrationTest; 7 import org.broadinstitute.hellbender.testutils.BaseTest; 8 import org.broadinstitute.hellbender.GATKBaseTest; 9 import org.broadinstitute.hellbender.testutils.SamAssertionUtils; 10 import org.testng.Assert; 11 import org.testng.annotations.DataProvider; 12 import org.testng.annotations.Test; 13 14 import java.io.File; 15 import java.io.IOException; 16 import java.util.ArrayList; 17 import java.util.stream.Stream; 18 19 public class ReadsPipelineSparkIntegrationTest extends CommandLineProgramTest { 20 21 private static final class PipelineTest { 22 final String referenceURL; 23 final String bam; 24 final String outputExtension; 25 final String knownSites; 26 final String args; 27 final String expectedBamFileName; 28 final String expectedVcfFileName; 29 PipelineTest(String referenceURL, String bam, String outputExtension, String knownSites, String args, String expectedBamFileName, String expectedVcfFileName)30 private PipelineTest(String referenceURL, String bam, String outputExtension, String knownSites, String args, String expectedBamFileName, String expectedVcfFileName) { 31 this.referenceURL = referenceURL; 32 this.bam = bam; 33 this.outputExtension = outputExtension; 34 this.knownSites = knownSites; 35 this.args = args; 36 this.expectedBamFileName = expectedBamFileName; 37 this.expectedVcfFileName = expectedVcfFileName; 38 } 39 40 @Override toString()41 public String toString() { 42 return String.format("ReadsPipeline(bam='%s', args='%s')", bam, args); 43 } 44 } 45 getResourceDir()46 private String getResourceDir(){ 47 return getTestDataDir() + "/" + "BQSR" + "/"; 48 } 49 50 @DataProvider(name = "ReadsPipeline") createReadsPipelineSparkTestData()51 public Object[][] createReadsPipelineSparkTestData() { 52 final String GRCh37Ref_2021 = GATKBaseTest.b37_reference_20_21; 53 final String GRCh37Ref_2021_img = GATKBaseTest.b37_reference_20_21_img; 54 final String hiSeqBam_chr20 = getResourceDir() + "CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.noMD.noBQSR.bam"; 55 final String hiSeqBam_chr20_queryNameSorted = getResourceDir() + "CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.noMD.noBQSR.queryNameSorted.bam"; 56 final String hiSeqCram_chr20 = getResourceDir() + "CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.noMD.noBQSR.cram"; 57 final String unalignedBam = largeFileTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.tiny.unaligned.bam"; 58 final String dbSNPb37_20 = getResourceDir() + DBSNP_138_B37_CH20_1M_1M1K_VCF; 59 final String more20Sites = getResourceDir() + "dbsnp_138.b37.20.10m-10m100.vcf"; //for testing 2 input files 60 61 final String expectedSingleKnownSites = "expected.CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.noMD.noBQSR.md.bqsr.bam"; 62 final String expectedSingleKnownSitesVcf = "expected.CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.noMD.noBQSR.md.bqsr.vcf"; 63 final String expectedMultipleKnownSites = "expected.MultiSite.reads.pipeline.bam"; 64 final String expectedMultipleKnownSitesCram = "expected.MultiSite.reads.pipeline.cram"; 65 final String expectedMultipleKnownSitesVcf = "expected.MultiSite.reads.pipeline.vcf"; 66 final String expectedMultipleKnownSitesFromUnalignedVcf = "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.tiny.vcf"; 67 68 return new Object[][]{ 69 // input local, computation local. 70 // Note: these output files were created by running Picard 1.130 and GATK3.46. 71 // VCF files were generated using GATK4 with a command like: 72 // gatk HaplotypeCaller \ 73 // -I src/test/resources/org/broadinstitute/hellbender/tools/BQSR/expected.MultiSite.reads.pipeline.bam \ 74 // -R src/test/resources/large/human_g1k_v37.20.21.fasta \ 75 // -O src/test/resources/org/broadinstitute/hellbender/tools/BQSR/expected.MultiSite.reads.pipeline.vcf \ 76 // -pairHMM AVX_LOGLESS_CACHING \ 77 // -stand_call_conf 30.0 78 {new PipelineTest(GRCh37Ref_2021, hiSeqBam_chr20, ".bam", dbSNPb37_20, null, null, getResourceDir() + expectedSingleKnownSitesVcf)}, // don't write intermediate BAM 79 {new PipelineTest(GRCh37Ref_2021, hiSeqBam_chr20, ".bam", dbSNPb37_20, null, getResourceDir() + expectedSingleKnownSites, getResourceDir() + expectedSingleKnownSitesVcf)}, 80 {new PipelineTest(GRCh37Ref_2021, hiSeqBam_chr20_queryNameSorted, ".bam", dbSNPb37_20, null, getResourceDir() + expectedMultipleKnownSites, getResourceDir() + expectedSingleKnownSitesVcf)}, 81 82 // Output generated with GATK4 83 {new PipelineTest(GRCh37Ref_2021, hiSeqCram_chr20, ".cram", dbSNPb37_20, "--known-sites " + more20Sites, getResourceDir() + expectedMultipleKnownSitesCram, getResourceDir() + expectedMultipleKnownSitesVcf)}, 84 {new PipelineTest(GRCh37Ref_2021, hiSeqBam_chr20, ".bam", dbSNPb37_20, "--known-sites " + more20Sites, getResourceDir() + expectedMultipleKnownSites, getResourceDir() + expectedMultipleKnownSitesVcf)}, 85 86 // BWA-MEM 87 {new PipelineTest(GRCh37Ref_2021, unalignedBam, ".bam", dbSNPb37_20, "--align --bwa-mem-index-image " + GRCh37Ref_2021_img + " --known-sites " + more20Sites, null, largeFileTestDir + expectedMultipleKnownSitesFromUnalignedVcf)}, 88 }; 89 } 90 91 // disabled pending https://github.com/broadinstitute/gatk/issues/5680 92 @Test(enabled = false, dataProvider = "ReadsPipeline", groups = "spark") testReadsPipelineSpark(PipelineTest params)93 public void testReadsPipelineSpark(PipelineTest params) throws IOException { 94 File outFile = BaseTest.createTempFile("readSparkPipelineTest", ".vcf"); 95 File outFileBam = BaseTest.createTempFile("readSparkPipelineTest", params.outputExtension); 96 final ArrayList<String> args = new ArrayList<>(); 97 98 args.add("-I"); 99 args.add(new File(params.bam).getAbsolutePath()); 100 args.add("-O"); 101 args.add(outFile.getAbsolutePath()); 102 if (params.expectedBamFileName != null) { 103 args.add("--output-bam"); 104 args.add(outFileBam.getAbsolutePath()); 105 } 106 107 File referenceFile = null; 108 if (params.referenceURL != null) { 109 referenceFile = new File(params.referenceURL); 110 args.add("-R"); 111 args.add(referenceFile.getAbsolutePath()); 112 } 113 args.add("--"+ MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME); 114 args.add("-indels"); 115 args.add("--enable-baq"); 116 args.add("-pairHMM"); 117 args.add("AVX_LOGLESS_CACHING"); 118 args.add("-stand-call-conf"); 119 args.add("30.0"); 120 args.add("--known-sites"); 121 args.add(params.knownSites); 122 if (params.args != null) { 123 Stream.of(params.args.trim().split(" ")).forEach(args::add); 124 } 125 126 runCommandLine(args); 127 128 if (params.expectedBamFileName != null) { 129 SamAssertionUtils.assertEqualBamFiles(outFileBam, new File(params.expectedBamFileName), referenceFile, true, ValidationStringency.SILENT); 130 } 131 132 final double concordance = HaplotypeCallerIntegrationTest.calculateConcordance(outFile, new File(params.expectedVcfFileName)); 133 Assert.assertTrue(concordance >= 0.99, "Concordance with GATK 4 in VCF mode is < 99% (" + concordance + ")"); 134 } 135 } 136