1 package org.broadinstitute.hellbender.tools.spark.pipelines;
2 
3 import htsjdk.samtools.ValidationStringency;
4 import org.broadinstitute.hellbender.CommandLineProgramTest;
5 import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection;
6 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.HaplotypeCallerIntegrationTest;
7 import org.broadinstitute.hellbender.testutils.BaseTest;
8 import org.broadinstitute.hellbender.GATKBaseTest;
9 import org.broadinstitute.hellbender.testutils.SamAssertionUtils;
10 import org.testng.Assert;
11 import org.testng.annotations.DataProvider;
12 import org.testng.annotations.Test;
13 
14 import java.io.File;
15 import java.io.IOException;
16 import java.util.ArrayList;
17 import java.util.stream.Stream;
18 
19 public class ReadsPipelineSparkIntegrationTest extends CommandLineProgramTest {
20 
21     private static final class PipelineTest {
22         final String referenceURL;
23         final String bam;
24         final String outputExtension;
25         final String knownSites;
26         final String args;
27         final String expectedBamFileName;
28         final String expectedVcfFileName;
29 
PipelineTest(String referenceURL, String bam, String outputExtension, String knownSites, String args, String expectedBamFileName, String expectedVcfFileName)30         private PipelineTest(String referenceURL, String bam, String outputExtension, String knownSites, String args, String expectedBamFileName, String expectedVcfFileName) {
31             this.referenceURL = referenceURL;
32             this.bam = bam;
33             this.outputExtension = outputExtension;
34             this.knownSites = knownSites;
35             this.args = args;
36             this.expectedBamFileName = expectedBamFileName;
37             this.expectedVcfFileName = expectedVcfFileName;
38         }
39 
40         @Override
toString()41         public String toString() {
42             return String.format("ReadsPipeline(bam='%s', args='%s')", bam, args);
43         }
44     }
45 
getResourceDir()46     private String getResourceDir(){
47         return getTestDataDir() + "/" + "BQSR" + "/";
48     }
49 
50     @DataProvider(name = "ReadsPipeline")
createReadsPipelineSparkTestData()51     public Object[][] createReadsPipelineSparkTestData() {
52         final String GRCh37Ref_2021 = GATKBaseTest.b37_reference_20_21;
53         final String GRCh37Ref_2021_img = GATKBaseTest.b37_reference_20_21_img;
54         final String hiSeqBam_chr20 = getResourceDir() + "CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.noMD.noBQSR.bam";
55         final String hiSeqBam_chr20_queryNameSorted = getResourceDir() + "CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.noMD.noBQSR.queryNameSorted.bam";
56         final String hiSeqCram_chr20 = getResourceDir() + "CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.noMD.noBQSR.cram";
57         final String unalignedBam = largeFileTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.tiny.unaligned.bam";
58         final String dbSNPb37_20 = getResourceDir() + DBSNP_138_B37_CH20_1M_1M1K_VCF;
59         final String more20Sites = getResourceDir() + "dbsnp_138.b37.20.10m-10m100.vcf"; //for testing 2 input files
60 
61         final String expectedSingleKnownSites = "expected.CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.noMD.noBQSR.md.bqsr.bam";
62         final String expectedSingleKnownSitesVcf = "expected.CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.noMD.noBQSR.md.bqsr.vcf";
63         final String expectedMultipleKnownSites = "expected.MultiSite.reads.pipeline.bam";
64         final String expectedMultipleKnownSitesCram = "expected.MultiSite.reads.pipeline.cram";
65         final String expectedMultipleKnownSitesVcf = "expected.MultiSite.reads.pipeline.vcf";
66         final String expectedMultipleKnownSitesFromUnalignedVcf = "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.tiny.vcf";
67 
68         return new Object[][]{
69                 // input local, computation local.
70                 // Note: these output files were created by running Picard 1.130 and GATK3.46.
71                 // VCF files were generated using GATK4 with a command like:
72                 // gatk HaplotypeCaller \
73                 //        -I src/test/resources/org/broadinstitute/hellbender/tools/BQSR/expected.MultiSite.reads.pipeline.bam \
74                 //        -R src/test/resources/large/human_g1k_v37.20.21.fasta \
75                 //        -O src/test/resources/org/broadinstitute/hellbender/tools/BQSR/expected.MultiSite.reads.pipeline.vcf \
76                 //        -pairHMM AVX_LOGLESS_CACHING \
77                 //        -stand_call_conf 30.0
78                 {new PipelineTest(GRCh37Ref_2021, hiSeqBam_chr20, ".bam", dbSNPb37_20, null, null, getResourceDir() + expectedSingleKnownSitesVcf)}, // don't write intermediate BAM
79                 {new PipelineTest(GRCh37Ref_2021, hiSeqBam_chr20, ".bam", dbSNPb37_20, null, getResourceDir() + expectedSingleKnownSites, getResourceDir() + expectedSingleKnownSitesVcf)},
80                 {new PipelineTest(GRCh37Ref_2021, hiSeqBam_chr20_queryNameSorted, ".bam", dbSNPb37_20, null, getResourceDir() + expectedMultipleKnownSites, getResourceDir() + expectedSingleKnownSitesVcf)},
81 
82                 // Output generated with GATK4
83                 {new PipelineTest(GRCh37Ref_2021, hiSeqCram_chr20, ".cram", dbSNPb37_20, "--known-sites " + more20Sites, getResourceDir() + expectedMultipleKnownSitesCram, getResourceDir() + expectedMultipleKnownSitesVcf)},
84                 {new PipelineTest(GRCh37Ref_2021, hiSeqBam_chr20, ".bam", dbSNPb37_20, "--known-sites " + more20Sites, getResourceDir() + expectedMultipleKnownSites, getResourceDir() + expectedMultipleKnownSitesVcf)},
85 
86                 // BWA-MEM
87                 {new PipelineTest(GRCh37Ref_2021, unalignedBam, ".bam", dbSNPb37_20, "--align --bwa-mem-index-image " + GRCh37Ref_2021_img + " --known-sites " + more20Sites, null, largeFileTestDir + expectedMultipleKnownSitesFromUnalignedVcf)},
88         };
89     }
90 
91     // disabled pending https://github.com/broadinstitute/gatk/issues/5680
92     @Test(enabled = false, dataProvider = "ReadsPipeline", groups = "spark")
testReadsPipelineSpark(PipelineTest params)93     public void testReadsPipelineSpark(PipelineTest params) throws IOException {
94         File outFile = BaseTest.createTempFile("readSparkPipelineTest", ".vcf");
95         File outFileBam = BaseTest.createTempFile("readSparkPipelineTest", params.outputExtension);
96         final ArrayList<String> args = new ArrayList<>();
97 
98         args.add("-I");
99         args.add(new File(params.bam).getAbsolutePath());
100         args.add("-O");
101         args.add(outFile.getAbsolutePath());
102         if (params.expectedBamFileName != null) {
103             args.add("--output-bam");
104             args.add(outFileBam.getAbsolutePath());
105         }
106 
107         File referenceFile = null;
108         if (params.referenceURL != null) {
109             referenceFile = new File(params.referenceURL);
110             args.add("-R");
111             args.add(referenceFile.getAbsolutePath());
112         }
113         args.add("--"+ MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME);
114         args.add("-indels");
115         args.add("--enable-baq");
116         args.add("-pairHMM");
117         args.add("AVX_LOGLESS_CACHING");
118         args.add("-stand-call-conf");
119         args.add("30.0");
120         args.add("--known-sites");
121         args.add(params.knownSites);
122         if (params.args != null) {
123             Stream.of(params.args.trim().split(" ")).forEach(args::add);
124         }
125 
126         runCommandLine(args);
127 
128         if (params.expectedBamFileName != null) {
129             SamAssertionUtils.assertEqualBamFiles(outFileBam, new File(params.expectedBamFileName), referenceFile, true, ValidationStringency.SILENT);
130         }
131 
132         final double concordance = HaplotypeCallerIntegrationTest.calculateConcordance(outFile, new File(params.expectedVcfFileName));
133         Assert.assertTrue(concordance >= 0.99, "Concordance with GATK 4 in VCF mode is < 99% (" +  concordance + ")");
134     }
135 }
136