1 package org.broadinstitute.hellbender.tools.spark;
2 
3 import htsjdk.samtools.ValidationStringency;
4 import org.broadinstitute.barclay.argparser.CommandLineException;
5 import org.broadinstitute.hellbender.CommandLineProgramTest;
6 import org.broadinstitute.hellbender.GATKBaseTest;
7 import org.broadinstitute.hellbender.exceptions.UserException;
8 import org.broadinstitute.hellbender.tools.walkers.bqsr.BQSRTestData;
9 import org.broadinstitute.hellbender.testutils.IntegrationTestSpec;
10 import org.broadinstitute.hellbender.utils.Utils;
11 import org.broadinstitute.hellbender.testutils.SamAssertionUtils;
12 import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
13 import org.testng.annotations.DataProvider;
14 import org.testng.annotations.Test;
15 
16 import java.io.File;
17 import java.io.IOException;
18 import java.util.Arrays;
19 
20 public final class BaseRecalibratorSparkIntegrationTest extends CommandLineProgramTest {
21 
22     private final static String THIS_TEST_FOLDER = toolsTestDir + "BQSR/";
23 
24     private static final class BQSRTest {
25         final String referenceURL;
26         final String bam;
27         final String knownSites;
28         final String args;
29         final String expectedFileName;
30 
BQSRTest(String referenceURL, String bam, String knownSites, String args, String expectedFileName)31         private BQSRTest(String referenceURL, String bam, String knownSites, String args, String expectedFileName) {
32             this.referenceURL = referenceURL;
33             this.bam = bam;
34             this.knownSites = knownSites;
35             this.args = args;
36             this.expectedFileName = expectedFileName;
37         }
38 
getCommandLine()39         public String getCommandLine() {
40             return  " -R " + referenceURL +
41                     " -I " + bam +
42                     " " + args +
43                     (knownSites.isEmpty() ? "": " --known-sites " + knownSites) +
44                     " -O %s";
45         }
46 
47         @Override
toString()48         public String toString() {
49             return String.format("BQSR(bam='%s', args='%s')", bam, args);
50         }
51     }
52 
getResourceDir()53     private String getResourceDir(){
54         return getTestDataDir() + "/" + "BQSR" + "/";
55     }
56 
getCloudInputs()57     private String getCloudInputs() {
58         return getGCPTestInputPath() + THIS_TEST_FOLDER;
59     }
60 
61     @DataProvider(name = "BQSRTest")
createBQSRTestData()62     public Object[][] createBQSRTestData() {
63         final String localResources =  getResourceDir();
64 
65         final String GRCh37Ref_chr2021 = b37_reference_20_21;
66         final String GRCh37Ref_chr2021_gz = b37_reference_20_21_gz;
67         final String hiSeqBam_chr20 = localResources + WGS_B37_CH20_1M_1M1K_BAM;
68         final String hiSeqBam_1read = localResources + "overlappingRead.bam";
69         final String dbSNPb37_chr20 = localResources + DBSNP_138_B37_CH20_1M_1M1K_VCF;
70         final String dbSNPb37_chr2021 = dbsnp_138_b37_20_21_vcf;
71 
72         final String hg19Chr171Mb = publicTestDir + "human_g1k_v37.chr17_1Mb.fasta";
73         final String HiSeqBam_chr17 = localResources + "NA12878.chr17_69k_70k.dictFix.bam";
74         final String dbSNPb37_chr17 =  localResources + "dbsnp_132.b37.excluding_sites_after_129.chr17_69k_70k.vcf";
75         final String more17Sites = localResources + "bqsr.fakeSitesForTesting.b37.chr17.vcf";
76 
77         final String hiSeqBam_20_21_100000 = localResources + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.10m-10m100.bam";
78         final String hiSeqCram_20_21_100000 = localResources + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.10m-10m100.cram";
79         final String more20Sites = localResources + "dbsnp_138.b37.20.10m-10m100.vcf"; //for testing 2 input files
80         final String more21Sites = localResources + "dbsnp_138.b37.21.10m-10m100.vcf"; //for testing 2 input files
81 
82         return new Object[][]{
83                 //Note: recal tables were created using GATK3.4 with one change from 2.87 to 2.88 in expected.CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.recal.txt
84                 // The reason is that GATK4 uses a multiplier in summing doubles in RecalDatum.
85                 // See MathUtilsUniTest.testAddDoubles for a demonstration how that can change the results.
86                 // See RecalDatum for explanation of why the multiplier is needed.
87 
88                 // local input/computation/reference
89                 {new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_1read, dbSNPb37_chr2021, "-indels --enable-baq", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1READ_RECAL)},
90                 {new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL)},
91                 {new BQSRTest(GRCh37Ref_chr2021_gz, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL)},
92                 {new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_NOINDEL_NOBAQ_RECAL)},
93                 {new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq --indels-context-size 4", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_INDELS_CONTEXT_SIZE_4_RECAL)},
94                 {new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq --low-quality-tail 5", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_LOW_QUALITY_TAIL_5_RECAL)},
95                 {new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq --quantizing-levels 6", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_QUANTIZING_LEVELS_6_RECAL)},
96                 {new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq --mismatches-context-size 4", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_MISMATCHES_CONTEXT_SIZE_4_RECAL)},
97                 // multiple known sites; output generated with GATK4 walker
98                 {new BQSRTest(GRCh37Ref_chr2021 , hiSeqBam_20_21_100000, more20Sites, "-indels --enable-baq --known-sites " + more21Sites, getResourceDir() + "expected.CEUTrio.HiSeq.WGS.b37.ch20.ch21.10m-10m100.recal.txt")},
99                 {new BQSRTest(GRCh37Ref_chr2021 , hiSeqCram_20_21_100000, more20Sites, "-indels --enable-baq --known-sites " + more21Sites, getResourceDir() + "expected.CEUTrio.HiSeq.WGS.b37.ch20.ch21.10m-10m100.recal.txt")},
100                 // multiple known sites; entire test case shared with walker version
101                 {new BQSRTest(hg19Chr171Mb, HiSeqBam_chr17, dbSNPb37_chr17, "-indels --enable-baq --known-sites " + more17Sites, getResourceDir() + "expected.NA12878.chr17_69k_70k.2inputs.txt")},
102         };
103     }
104 
105     @Test(dataProvider = "BQSRTest", groups = "spark")
testBQSRSpark(BQSRTest params)106     public void testBQSRSpark(BQSRTest params) throws IOException {
107         ArgumentsBuilder ab = new ArgumentsBuilder().addRaw(params.getCommandLine());
108         IntegrationTestSpec spec = new IntegrationTestSpec(
109                 ab.getString(),
110                 Arrays.asList(params.expectedFileName));
111         spec.executeTest("testBQSRSpark-" + params.args, this);
112     }
113 
114     //This data provider is for tests that use reference (but not BAM) files stored in buckets
115     @DataProvider(name = "BQSRCloudTest")
createBQSRCloudTestData()116     public Object[][] createBQSRCloudTestData() {
117         final String localResources =  getResourceDir();
118 
119         final String GRCh37RefCloud = GCS_b37_CHR20_21_REFERENCE;
120         final String hiSeqBam_chr20 = localResources + WGS_B37_CH20_1M_1M1K_BAM;
121         final String hiSeqBam_1read = localResources + "overlappingRead.bam";
122         final String dbSNPb37_chr20 = localResources + DBSNP_138_B37_CH20_1M_1M1K_VCF;
123         final String dbSNPb37_chr2021 = dbsnp_138_b37_20_21_vcf;
124 
125         return new Object[][]{
126                 //Note: recal tables were created using GATK3.4 with one change from 2.87 to 2.88 in expected.CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.recal.txt
127                 // The reason is that GATK4 uses a multiplier in summing doubles in RecalDatum.
128                 // See MathUtilsUniTest.testAddDoubles for a demonstration how that can change the results.
129                 // See RecalDatum for explanation of why the multiplier is needed.
130 
131                 // local input/computation, using a gs:// reference
132                 {new BQSRTest(GRCh37RefCloud, hiSeqBam_1read, dbSNPb37_chr2021, "-indels --enable-baq", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1READ_RECAL)},
133                 {new BQSRTest(GRCh37RefCloud, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL)},
134                 {new BQSRTest(GRCh37RefCloud, hiSeqBam_chr20, dbSNPb37_chr20, "", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_NOINDEL_NOBAQ_RECAL)},
135                 {new BQSRTest(GRCh37RefCloud, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq --indels-context-size 4",  localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_INDELS_CONTEXT_SIZE_4_RECAL)},
136                 {new BQSRTest(GRCh37RefCloud, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq --low-quality-tail 5",     localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_LOW_QUALITY_TAIL_5_RECAL)},
137                 {new BQSRTest(GRCh37RefCloud, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq --quantizing-levels 6",    localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_QUANTIZING_LEVELS_6_RECAL)},
138                 {new BQSRTest(GRCh37RefCloud, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq --mismatches-context-size 4", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_MISMATCHES_CONTEXT_SIZE_4_RECAL)},
139         };
140     }
141 
142     @Test(dataProvider = "BQSRCloudTest", groups = {"bucket", "spark"})
testBQSRSparkCloud(final BQSRTest params)143     public void testBQSRSparkCloud(final BQSRTest params) throws IOException {
144         ArgumentsBuilder ab = new ArgumentsBuilder().addRaw(params.getCommandLine());
145         IntegrationTestSpec spec = new IntegrationTestSpec(
146                 ab.getString(),
147                 Arrays.asList(params.expectedFileName));
148         spec.executeTest("testBQSRSparkCloud-" + params.args, this);
149     }
150 
151     //This data provider is for tests that use BAM files stored in buckets
152     @DataProvider(name = "BQSRTestBucket")
createBQSRTestDataBucket()153     public Object[][] createBQSRTestDataBucket() {
154         final String GRCh37RefCloud = GCS_b37_CHR20_21_REFERENCE;
155         final String chr2021Reference2bit = GCS_b37_CHR20_21_REFERENCE_2BIT;
156         final String localResources = getResourceDir();
157         final String HiSeqBamCloud_chr20 = getCloudInputs() + WGS_B37_CH20_1M_1M1K_BAM;
158         final String dbSNPb37_chr20 = localResources + DBSNP_138_B37_CH20_1M_1M1K_VCF;
159 
160         return new Object[][]{
161                 // input in cloud, computation local.
162                 {new BQSRTest(GRCh37RefCloud, HiSeqBamCloud_chr20, dbSNPb37_chr20, "-indels --enable-baq "+" --join-strategy SHUFFLE", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL)},
163                 {new BQSRTest(GRCh37RefCloud, HiSeqBamCloud_chr20, dbSNPb37_chr20, " --join-strategy SHUFFLE", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_NOINDEL_NOBAQ_RECAL)},
164                 {new BQSRTest(chr2021Reference2bit, HiSeqBamCloud_chr20, dbSNPb37_chr20, "-indels --enable-baq "+" --join-strategy BROADCAST", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL)},
165                 {new BQSRTest(chr2021Reference2bit, HiSeqBamCloud_chr20, dbSNPb37_chr20, " --join-strategy BROADCAST", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_NOINDEL_NOBAQ_RECAL)},
166         };
167     }
168 
169     // TODO: re-enable once ReadsSparkSource natively supports files in GCS buckets
170     @Test(dataProvider = "BQSRTestBucket", groups = {"spark", "bucket"}, enabled = false)
testBQSRBucket(BQSRTest params)171     public void testBQSRBucket(BQSRTest params) throws IOException {
172         ArgumentsBuilder ab = new ArgumentsBuilder().addRaw(params.getCommandLine());
173         IntegrationTestSpec spec = new IntegrationTestSpec(
174                 ab.getString(),
175                 Arrays.asList(params.expectedFileName));
176         spec.executeTest("testBQSRBucket-" + params.args, this);
177     }
178 
179     // TODO: This test is disabled because a new expected result needs to be created.
180     @Test(description = "This is to test https://github.com/broadinstitute/hellbender/issues/322", groups = {"cloud", "spark"}, enabled = false)
testPlottingWorkflow()181     public void testPlottingWorkflow() throws IOException {
182         final String resourceDir = getTestDataDir() + "/" + "BQSR" + "/";
183         final String chr2021Reference2bit = GCS_b37_CHR20_21_REFERENCE_2BIT;
184         final String dbSNPb37_chr2021 = resourceDir + DBSNP_138_B37_CH20_1M_1M1K_VCF;
185         final String HiSeqBam_chr20 = getResourceDir() + WGS_B37_CH20_1M_1M1K_BAM;
186 
187         final File actualHiSeqBam_recalibrated = createTempFile("actual.recalibrated", ".bam");
188 
189         final String tablePre = createTempFile("gatk4.pre.cols", ".table").getAbsolutePath();
190         final String argPre = " -R " + chr2021Reference2bit + "-indels --enable-baq " +" --known-sites " + dbSNPb37_chr2021 + " -I " + HiSeqBam_chr20
191                 + " -O " + tablePre;
192         new BaseRecalibratorSpark().instanceMain(Utils.escapeExpressions(argPre));
193 
194         final String argApply = "-I " + HiSeqBam_chr20 + " --bqsr-recal-file " + tablePre + " -O " + actualHiSeqBam_recalibrated.getAbsolutePath();
195         new ApplyBQSRSpark().instanceMain(Utils.escapeExpressions(argApply));
196 
197         final File actualTablePost = createTempFile("gatk4.post.cols", ".table");
198         final String argsPost = " -R " + chr2021Reference2bit + "-indels --enable-baq " +" --known-sites " + dbSNPb37_chr2021 + " -I " + actualHiSeqBam_recalibrated.getAbsolutePath()
199                 + " -O " + actualTablePost.getAbsolutePath();
200         new BaseRecalibratorSpark().instanceMain(Utils.escapeExpressions(argsPost));
201 
202         final File expectedHiSeqBam_recalibrated = new File(resourceDir + "expected.NA12878.chr17_69k_70k.dictFix.recalibrated.DIQ.bam");
203 
204         SamAssertionUtils.assertSamsEqual(actualHiSeqBam_recalibrated, expectedHiSeqBam_recalibrated, ValidationStringency.LENIENT);
205 
206         final File expectedTablePost = new File(getResourceDir() + "expected.NA12878.chr17_69k_70k.postRecalibrated.txt");
207         IntegrationTestSpec.assertEqualTextFiles(actualTablePost, expectedTablePost);
208     }
209 
210     @Test(groups = {"spark", "bucket"})
testBQSRFailWithoutDBSNP()211     public void testBQSRFailWithoutDBSNP() throws IOException {
212         final String resourceDir =  getTestDataDir() + "/" + "BQSR" + "/";
213         final String localResources =  getResourceDir();
214 
215         final String chr2021Reference2bit = GCS_b37_CHR20_21_REFERENCE_2BIT;
216         final String HiSeqBam_chr17 = resourceDir + "NA12878.chr17_69k_70k.dictFix.bam";
217 
218         final String  NO_DBSNP = "";
219         final BQSRTest params = new BQSRTest(chr2021Reference2bit, HiSeqBam_chr17, NO_DBSNP, "", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL);
220         IntegrationTestSpec spec = new IntegrationTestSpec(
221                 params.getCommandLine(),
222                 1,
223                 CommandLineException.class);
224         spec.executeTest("testBQSRFailWithoutDBSNP", this);
225     }
226 
227     @Test(groups = {"spark", "bucket"})
testBQSRFailWithIncompatibleReference()228     public void testBQSRFailWithIncompatibleReference() throws IOException {
229         final String resourceDir =  getTestDataDir() + "/" + "BQSR" + "/";
230         final String localResources =  getResourceDir();
231 
232         final String HiSeqBam_chr17 = resourceDir + "NA12878.chr17_69k_70k.dictFix.bam";
233 
234         final String dbSNPb37_chr2021 = resourceDir + DBSNP_138_B37_CH20_1M_1M1K_VCF;
235         final BQSRTest params = new BQSRTest(GATKBaseTest.hg19MiniReference, HiSeqBam_chr17, dbSNPb37_chr2021, "", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL);
236         IntegrationTestSpec spec = new IntegrationTestSpec(
237                 params.getCommandLine(),
238                 1,
239                 UserException.IncompatibleSequenceDictionaries.class);
240         spec.executeTest("testBQSRFailWithIncompatibleReference", this);
241     }
242 }
243