1 package org.broadinstitute.hellbender.tools.spark; 2 3 import htsjdk.samtools.ValidationStringency; 4 import org.broadinstitute.barclay.argparser.CommandLineException; 5 import org.broadinstitute.hellbender.CommandLineProgramTest; 6 import org.broadinstitute.hellbender.GATKBaseTest; 7 import org.broadinstitute.hellbender.exceptions.UserException; 8 import org.broadinstitute.hellbender.tools.walkers.bqsr.BQSRTestData; 9 import org.broadinstitute.hellbender.testutils.IntegrationTestSpec; 10 import org.broadinstitute.hellbender.utils.Utils; 11 import org.broadinstitute.hellbender.testutils.SamAssertionUtils; 12 import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; 13 import org.testng.annotations.DataProvider; 14 import org.testng.annotations.Test; 15 16 import java.io.File; 17 import java.io.IOException; 18 import java.util.Arrays; 19 20 public final class BaseRecalibratorSparkIntegrationTest extends CommandLineProgramTest { 21 22 private final static String THIS_TEST_FOLDER = toolsTestDir + "BQSR/"; 23 24 private static final class BQSRTest { 25 final String referenceURL; 26 final String bam; 27 final String knownSites; 28 final String args; 29 final String expectedFileName; 30 BQSRTest(String referenceURL, String bam, String knownSites, String args, String expectedFileName)31 private BQSRTest(String referenceURL, String bam, String knownSites, String args, String expectedFileName) { 32 this.referenceURL = referenceURL; 33 this.bam = bam; 34 this.knownSites = knownSites; 35 this.args = args; 36 this.expectedFileName = expectedFileName; 37 } 38 getCommandLine()39 public String getCommandLine() { 40 return " -R " + referenceURL + 41 " -I " + bam + 42 " " + args + 43 (knownSites.isEmpty() ? "": " --known-sites " + knownSites) + 44 " -O %s"; 45 } 46 47 @Override toString()48 public String toString() { 49 return String.format("BQSR(bam='%s', args='%s')", bam, args); 50 } 51 } 52 getResourceDir()53 private String getResourceDir(){ 54 return getTestDataDir() + "/" + "BQSR" + "/"; 55 } 56 getCloudInputs()57 private String getCloudInputs() { 58 return getGCPTestInputPath() + THIS_TEST_FOLDER; 59 } 60 61 @DataProvider(name = "BQSRTest") createBQSRTestData()62 public Object[][] createBQSRTestData() { 63 final String localResources = getResourceDir(); 64 65 final String GRCh37Ref_chr2021 = b37_reference_20_21; 66 final String GRCh37Ref_chr2021_gz = b37_reference_20_21_gz; 67 final String hiSeqBam_chr20 = localResources + WGS_B37_CH20_1M_1M1K_BAM; 68 final String hiSeqBam_1read = localResources + "overlappingRead.bam"; 69 final String dbSNPb37_chr20 = localResources + DBSNP_138_B37_CH20_1M_1M1K_VCF; 70 final String dbSNPb37_chr2021 = dbsnp_138_b37_20_21_vcf; 71 72 final String hg19Chr171Mb = publicTestDir + "human_g1k_v37.chr17_1Mb.fasta"; 73 final String HiSeqBam_chr17 = localResources + "NA12878.chr17_69k_70k.dictFix.bam"; 74 final String dbSNPb37_chr17 = localResources + "dbsnp_132.b37.excluding_sites_after_129.chr17_69k_70k.vcf"; 75 final String more17Sites = localResources + "bqsr.fakeSitesForTesting.b37.chr17.vcf"; 76 77 final String hiSeqBam_20_21_100000 = localResources + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.10m-10m100.bam"; 78 final String hiSeqCram_20_21_100000 = localResources + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.10m-10m100.cram"; 79 final String more20Sites = localResources + "dbsnp_138.b37.20.10m-10m100.vcf"; //for testing 2 input files 80 final String more21Sites = localResources + "dbsnp_138.b37.21.10m-10m100.vcf"; //for testing 2 input files 81 82 return new Object[][]{ 83 //Note: recal tables were created using GATK3.4 with one change from 2.87 to 2.88 in expected.CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.recal.txt 84 // The reason is that GATK4 uses a multiplier in summing doubles in RecalDatum. 85 // See MathUtilsUniTest.testAddDoubles for a demonstration how that can change the results. 86 // See RecalDatum for explanation of why the multiplier is needed. 87 88 // local input/computation/reference 89 {new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_1read, dbSNPb37_chr2021, "-indels --enable-baq", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1READ_RECAL)}, 90 {new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL)}, 91 {new BQSRTest(GRCh37Ref_chr2021_gz, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL)}, 92 {new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_NOINDEL_NOBAQ_RECAL)}, 93 {new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq --indels-context-size 4", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_INDELS_CONTEXT_SIZE_4_RECAL)}, 94 {new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq --low-quality-tail 5", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_LOW_QUALITY_TAIL_5_RECAL)}, 95 {new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq --quantizing-levels 6", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_QUANTIZING_LEVELS_6_RECAL)}, 96 {new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq --mismatches-context-size 4", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_MISMATCHES_CONTEXT_SIZE_4_RECAL)}, 97 // multiple known sites; output generated with GATK4 walker 98 {new BQSRTest(GRCh37Ref_chr2021 , hiSeqBam_20_21_100000, more20Sites, "-indels --enable-baq --known-sites " + more21Sites, getResourceDir() + "expected.CEUTrio.HiSeq.WGS.b37.ch20.ch21.10m-10m100.recal.txt")}, 99 {new BQSRTest(GRCh37Ref_chr2021 , hiSeqCram_20_21_100000, more20Sites, "-indels --enable-baq --known-sites " + more21Sites, getResourceDir() + "expected.CEUTrio.HiSeq.WGS.b37.ch20.ch21.10m-10m100.recal.txt")}, 100 // multiple known sites; entire test case shared with walker version 101 {new BQSRTest(hg19Chr171Mb, HiSeqBam_chr17, dbSNPb37_chr17, "-indels --enable-baq --known-sites " + more17Sites, getResourceDir() + "expected.NA12878.chr17_69k_70k.2inputs.txt")}, 102 }; 103 } 104 105 @Test(dataProvider = "BQSRTest", groups = "spark") testBQSRSpark(BQSRTest params)106 public void testBQSRSpark(BQSRTest params) throws IOException { 107 ArgumentsBuilder ab = new ArgumentsBuilder().addRaw(params.getCommandLine()); 108 IntegrationTestSpec spec = new IntegrationTestSpec( 109 ab.getString(), 110 Arrays.asList(params.expectedFileName)); 111 spec.executeTest("testBQSRSpark-" + params.args, this); 112 } 113 114 //This data provider is for tests that use reference (but not BAM) files stored in buckets 115 @DataProvider(name = "BQSRCloudTest") createBQSRCloudTestData()116 public Object[][] createBQSRCloudTestData() { 117 final String localResources = getResourceDir(); 118 119 final String GRCh37RefCloud = GCS_b37_CHR20_21_REFERENCE; 120 final String hiSeqBam_chr20 = localResources + WGS_B37_CH20_1M_1M1K_BAM; 121 final String hiSeqBam_1read = localResources + "overlappingRead.bam"; 122 final String dbSNPb37_chr20 = localResources + DBSNP_138_B37_CH20_1M_1M1K_VCF; 123 final String dbSNPb37_chr2021 = dbsnp_138_b37_20_21_vcf; 124 125 return new Object[][]{ 126 //Note: recal tables were created using GATK3.4 with one change from 2.87 to 2.88 in expected.CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.recal.txt 127 // The reason is that GATK4 uses a multiplier in summing doubles in RecalDatum. 128 // See MathUtilsUniTest.testAddDoubles for a demonstration how that can change the results. 129 // See RecalDatum for explanation of why the multiplier is needed. 130 131 // local input/computation, using a gs:// reference 132 {new BQSRTest(GRCh37RefCloud, hiSeqBam_1read, dbSNPb37_chr2021, "-indels --enable-baq", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1READ_RECAL)}, 133 {new BQSRTest(GRCh37RefCloud, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL)}, 134 {new BQSRTest(GRCh37RefCloud, hiSeqBam_chr20, dbSNPb37_chr20, "", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_NOINDEL_NOBAQ_RECAL)}, 135 {new BQSRTest(GRCh37RefCloud, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq --indels-context-size 4", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_INDELS_CONTEXT_SIZE_4_RECAL)}, 136 {new BQSRTest(GRCh37RefCloud, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq --low-quality-tail 5", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_LOW_QUALITY_TAIL_5_RECAL)}, 137 {new BQSRTest(GRCh37RefCloud, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq --quantizing-levels 6", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_QUANTIZING_LEVELS_6_RECAL)}, 138 {new BQSRTest(GRCh37RefCloud, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq --mismatches-context-size 4", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_MISMATCHES_CONTEXT_SIZE_4_RECAL)}, 139 }; 140 } 141 142 @Test(dataProvider = "BQSRCloudTest", groups = {"bucket", "spark"}) testBQSRSparkCloud(final BQSRTest params)143 public void testBQSRSparkCloud(final BQSRTest params) throws IOException { 144 ArgumentsBuilder ab = new ArgumentsBuilder().addRaw(params.getCommandLine()); 145 IntegrationTestSpec spec = new IntegrationTestSpec( 146 ab.getString(), 147 Arrays.asList(params.expectedFileName)); 148 spec.executeTest("testBQSRSparkCloud-" + params.args, this); 149 } 150 151 //This data provider is for tests that use BAM files stored in buckets 152 @DataProvider(name = "BQSRTestBucket") createBQSRTestDataBucket()153 public Object[][] createBQSRTestDataBucket() { 154 final String GRCh37RefCloud = GCS_b37_CHR20_21_REFERENCE; 155 final String chr2021Reference2bit = GCS_b37_CHR20_21_REFERENCE_2BIT; 156 final String localResources = getResourceDir(); 157 final String HiSeqBamCloud_chr20 = getCloudInputs() + WGS_B37_CH20_1M_1M1K_BAM; 158 final String dbSNPb37_chr20 = localResources + DBSNP_138_B37_CH20_1M_1M1K_VCF; 159 160 return new Object[][]{ 161 // input in cloud, computation local. 162 {new BQSRTest(GRCh37RefCloud, HiSeqBamCloud_chr20, dbSNPb37_chr20, "-indels --enable-baq "+" --join-strategy SHUFFLE", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL)}, 163 {new BQSRTest(GRCh37RefCloud, HiSeqBamCloud_chr20, dbSNPb37_chr20, " --join-strategy SHUFFLE", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_NOINDEL_NOBAQ_RECAL)}, 164 {new BQSRTest(chr2021Reference2bit, HiSeqBamCloud_chr20, dbSNPb37_chr20, "-indels --enable-baq "+" --join-strategy BROADCAST", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL)}, 165 {new BQSRTest(chr2021Reference2bit, HiSeqBamCloud_chr20, dbSNPb37_chr20, " --join-strategy BROADCAST", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_NOINDEL_NOBAQ_RECAL)}, 166 }; 167 } 168 169 // TODO: re-enable once ReadsSparkSource natively supports files in GCS buckets 170 @Test(dataProvider = "BQSRTestBucket", groups = {"spark", "bucket"}, enabled = false) testBQSRBucket(BQSRTest params)171 public void testBQSRBucket(BQSRTest params) throws IOException { 172 ArgumentsBuilder ab = new ArgumentsBuilder().addRaw(params.getCommandLine()); 173 IntegrationTestSpec spec = new IntegrationTestSpec( 174 ab.getString(), 175 Arrays.asList(params.expectedFileName)); 176 spec.executeTest("testBQSRBucket-" + params.args, this); 177 } 178 179 // TODO: This test is disabled because a new expected result needs to be created. 180 @Test(description = "This is to test https://github.com/broadinstitute/hellbender/issues/322", groups = {"cloud", "spark"}, enabled = false) testPlottingWorkflow()181 public void testPlottingWorkflow() throws IOException { 182 final String resourceDir = getTestDataDir() + "/" + "BQSR" + "/"; 183 final String chr2021Reference2bit = GCS_b37_CHR20_21_REFERENCE_2BIT; 184 final String dbSNPb37_chr2021 = resourceDir + DBSNP_138_B37_CH20_1M_1M1K_VCF; 185 final String HiSeqBam_chr20 = getResourceDir() + WGS_B37_CH20_1M_1M1K_BAM; 186 187 final File actualHiSeqBam_recalibrated = createTempFile("actual.recalibrated", ".bam"); 188 189 final String tablePre = createTempFile("gatk4.pre.cols", ".table").getAbsolutePath(); 190 final String argPre = " -R " + chr2021Reference2bit + "-indels --enable-baq " +" --known-sites " + dbSNPb37_chr2021 + " -I " + HiSeqBam_chr20 191 + " -O " + tablePre; 192 new BaseRecalibratorSpark().instanceMain(Utils.escapeExpressions(argPre)); 193 194 final String argApply = "-I " + HiSeqBam_chr20 + " --bqsr-recal-file " + tablePre + " -O " + actualHiSeqBam_recalibrated.getAbsolutePath(); 195 new ApplyBQSRSpark().instanceMain(Utils.escapeExpressions(argApply)); 196 197 final File actualTablePost = createTempFile("gatk4.post.cols", ".table"); 198 final String argsPost = " -R " + chr2021Reference2bit + "-indels --enable-baq " +" --known-sites " + dbSNPb37_chr2021 + " -I " + actualHiSeqBam_recalibrated.getAbsolutePath() 199 + " -O " + actualTablePost.getAbsolutePath(); 200 new BaseRecalibratorSpark().instanceMain(Utils.escapeExpressions(argsPost)); 201 202 final File expectedHiSeqBam_recalibrated = new File(resourceDir + "expected.NA12878.chr17_69k_70k.dictFix.recalibrated.DIQ.bam"); 203 204 SamAssertionUtils.assertSamsEqual(actualHiSeqBam_recalibrated, expectedHiSeqBam_recalibrated, ValidationStringency.LENIENT); 205 206 final File expectedTablePost = new File(getResourceDir() + "expected.NA12878.chr17_69k_70k.postRecalibrated.txt"); 207 IntegrationTestSpec.assertEqualTextFiles(actualTablePost, expectedTablePost); 208 } 209 210 @Test(groups = {"spark", "bucket"}) testBQSRFailWithoutDBSNP()211 public void testBQSRFailWithoutDBSNP() throws IOException { 212 final String resourceDir = getTestDataDir() + "/" + "BQSR" + "/"; 213 final String localResources = getResourceDir(); 214 215 final String chr2021Reference2bit = GCS_b37_CHR20_21_REFERENCE_2BIT; 216 final String HiSeqBam_chr17 = resourceDir + "NA12878.chr17_69k_70k.dictFix.bam"; 217 218 final String NO_DBSNP = ""; 219 final BQSRTest params = new BQSRTest(chr2021Reference2bit, HiSeqBam_chr17, NO_DBSNP, "", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL); 220 IntegrationTestSpec spec = new IntegrationTestSpec( 221 params.getCommandLine(), 222 1, 223 CommandLineException.class); 224 spec.executeTest("testBQSRFailWithoutDBSNP", this); 225 } 226 227 @Test(groups = {"spark", "bucket"}) testBQSRFailWithIncompatibleReference()228 public void testBQSRFailWithIncompatibleReference() throws IOException { 229 final String resourceDir = getTestDataDir() + "/" + "BQSR" + "/"; 230 final String localResources = getResourceDir(); 231 232 final String HiSeqBam_chr17 = resourceDir + "NA12878.chr17_69k_70k.dictFix.bam"; 233 234 final String dbSNPb37_chr2021 = resourceDir + DBSNP_138_B37_CH20_1M_1M1K_VCF; 235 final BQSRTest params = new BQSRTest(GATKBaseTest.hg19MiniReference, HiSeqBam_chr17, dbSNPb37_chr2021, "", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL); 236 IntegrationTestSpec spec = new IntegrationTestSpec( 237 params.getCommandLine(), 238 1, 239 UserException.IncompatibleSequenceDictionaries.class); 240 spec.executeTest("testBQSRFailWithIncompatibleReference", this); 241 } 242 } 243