1 package org.broadinstitute.hellbender.tools.genomicsdb;
2 
3 import htsjdk.samtools.SAMSequenceDictionary;
4 import htsjdk.samtools.SAMSequenceRecord;
5 import htsjdk.samtools.util.IntervalList;
6 import htsjdk.tribble.AbstractFeatureReader;
7 import htsjdk.tribble.CloseableTribbleIterator;
8 import htsjdk.tribble.FeatureReader;
9 import htsjdk.tribble.readers.LineIterator;
10 import htsjdk.variant.bcf2.BCF2Codec;
11 import htsjdk.variant.variantcontext.Allele;
12 import htsjdk.variant.variantcontext.Genotype;
13 import htsjdk.variant.variantcontext.GenotypeBuilder;
14 import htsjdk.variant.variantcontext.VariantContext;
15 import htsjdk.variant.variantcontext.VariantContextBuilder;
16 import htsjdk.variant.variantcontext.writer.Options;
17 import htsjdk.variant.variantcontext.writer.VariantContextWriter;
18 import htsjdk.variant.vcf.VCFCodec;
19 import htsjdk.variant.vcf.VCFFormatHeaderLine;
20 import htsjdk.variant.vcf.VCFHeader;
21 import htsjdk.variant.vcf.VCFHeaderLine;
22 import htsjdk.variant.vcf.VCFHeaderLineType;
23 import htsjdk.variant.vcf.VCFStandardHeaderLines;
24 import java.io.File;
25 import java.io.IOException;
26 import java.nio.file.Files;
27 import java.nio.file.Paths;
28 import java.nio.file.StandardCopyOption;
29 import java.util.ArrayList;
30 import java.util.Arrays;
31 import java.util.Collections;
32 import java.util.HashMap;
33 import java.util.HashSet;
34 import java.util.Iterator;
35 import java.util.LinkedHashMap;
36 import java.util.LinkedList;
37 import java.util.List;
38 import java.util.Map;
39 import java.util.Optional;
40 import java.util.Set;
41 import java.util.stream.Collectors;
42 import org.broadinstitute.barclay.argparser.CommandLineException;
43 import org.broadinstitute.hellbender.CommandLineProgramTest;
44 import org.broadinstitute.hellbender.Main;
45 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
46 import org.broadinstitute.hellbender.exceptions.UserException;
47 import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
48 import org.broadinstitute.hellbender.testutils.BaseTest;
49 import org.broadinstitute.hellbender.testutils.VariantContextTestUtils;
50 import org.broadinstitute.hellbender.utils.SimpleInterval;
51 import org.broadinstitute.hellbender.utils.Utils;
52 import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
53 import org.broadinstitute.hellbender.utils.io.IOUtils;
54 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
55 import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
56 import org.broadinstitute.hellbender.utils.variant.VariantContextGetters;
57 import org.genomicsdb.GenomicsDBUtils;
58 import org.genomicsdb.model.GenomicsDBExportConfiguration;
59 import org.genomicsdb.model.GenomicsDBVidMapProto;
60 import org.genomicsdb.reader.GenomicsDBFeatureReader;
61 import org.testng.Assert;
62 import org.testng.annotations.DataProvider;
63 import org.testng.annotations.Test;
64 
65 @Test(groups = {"variantcalling"})
66 public final class GenomicsDBImportIntegrationTest extends CommandLineProgramTest {
67     private static final String HG_00096 = largeFileTestDir + "gvcfs/HG00096.g.vcf.gz";
68     private static final String HG_00268 = largeFileTestDir + "gvcfs/HG00268.g.vcf.gz";
69     private static final String NA_19625 = largeFileTestDir + "gvcfs/NA19625.g.vcf.gz";
70     //The following 3 files were obtained by running CombineGVCFs on the above 3 files (separately). This introduces spanning
71     //deletions in the files. Hence, these files can be used to test for spanning deletions in the input VCF.
72     private static final String HG_00096_after_combine_gvcfs = largeFileTestDir + "gvcfs/HG00096_after_combine_gvcfs.g.vcf.gz";
73     private static final String HG_00268_after_combine_gvcfs = largeFileTestDir + "gvcfs/HG00268_after_combine_gvcfs.g.vcf.gz";
74     private static final String NA_19625_after_combine_gvcfs = largeFileTestDir + "gvcfs/NA19625_after_combine_gvcfs.g.vcf.gz";
75     private static final String NA_24385 = largeFileTestDir + "NA24385.vcf.gz";
76     private static final String NA_12878_PHASED = largeFileTestDir + "NA12878.phasedData.Chr20.vcf"; //NOTE: this is not phased according to the vcf spec but it reflects phasing currently produced by haplotype caller
77     private static final String MULTIPLOID_DATA_HG37 = largeFileTestDir + "gvcfs/HapMap5plex.ploidy10.b37.g.vcf";
78     private static final String NA12878_HG37 = toolsTestDir + "GenomicsDBImport/expected.testGVCFMode.gatk4.g.vcf";
79     //This file was generated by running CombineGVCFs on the input files
80     //./gatk CombineGVCFs -V src/test/resources/org/broadinstitute/hellbender/tools/GenomicsDBImport/expected.testGVCFMode.gatk4.g.vcf -V src/test/resources/large/gvcfs/HapMap5plex.ploidy10.b37.g.vcf -R src/test/resources/large/human_g1k_v37.20.21.fasta -L 20:10000000-10100000 -O src/test/resources/org/broadinstitute/hellbender/tools/GenomicsDBImport/expected.testGenomicsDBImportWithNonDiploidData.vcf
81     private static final String MULTIPLOID_EXPECTED_RESULT = toolsTestDir + "GenomicsDBImport/expected.testGenomicsDBImportWithNonDiploidData.vcf";
82     private static final String MNP_GVCF = toolsTestDir + "GenomicsDBImport/mnp.input.g.vcf";
83     private static final String ARTIFICIAL_PHASED = getTestDataDir() + "/ArtificalPhasedData.1.g.vcf";
84     private static final String HG_00268_WITH_SPACES = largeFileTestDir + "gvcfs/HG00268.spaceInSampleName.g.vcf";
85     private static final List<String> LOCAL_GVCFS = Arrays.asList(HG_00096, HG_00268, NA_19625);
86     private static final List<String> LOCAL_GVCFS_AFTER_COMBINE_GVCFS = Arrays.asList(HG_00096_after_combine_gvcfs,
87             HG_00268_after_combine_gvcfs,
88             NA_19625_after_combine_gvcfs);
89     private static final String GENOMICSDB_TEST_DIR = toolsTestDir + "GenomicsDBImport/";
90     private static final String COMBINEGVCFS_TEST_DIR = toolsTestDir + "walkers/CombineGVCFs/";
91     private static final String COMBINED = largeFileTestDir + "gvcfs/combined.gatk3.7.g.vcf.gz";
92     private static final String COMBINED_WITH_GENOTYPES = largeFileTestDir + "gvcfs/combined_with_genotypes.g.vcf.gz";
93     //This file was obtained from combined.gatk3.7.g.vcf.gz by dropping all the samples
94     private static final String COMBINED_SITES_ONLY = largeFileTestDir + "gvcfs/combined.gatk3.7_sites_only.g.vcf.gz";
95     private static final String INTERVAL_PICARD_STYLE_EXPECTED = toolsTestDir + "GenomicsDBImport/interval_expected.interval_list";
96     private static final String MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS_PICARD_STYLE_EXPECTED =
97             toolsTestDir + "GenomicsDBImport/multiple_non_adjacent_intervals_combine_gvcfs_expected.interval_list";
98     private static final String MERGED_CONTIGS_INTERVAL_PICARD_STYLE_EXPECTED =
99             toolsTestDir + "GenomicsDBImport/chr20_chr21_merged_contigs_expected.interval_list";
100     private static final String TEST_INT64_SUPPORT_GENOMICSDB_BUNDLE = GENOMICSDB_TEST_DIR + "/int64_test.tar.gz";
101     //Consider a gVCF with a REF block chr20:50-150. Importing this data into GenomicsDB using multiple intervals
102     //-L chr20:1-100 and -L chr20:101-200 will cause the REF block to be imported into both the arrays
103     //Now, when reading data from the workspace (assume full scan) - the data is split into 2 REF block intervals chr20:50-100
104     //and chr20:101-150 one from each array
105     //The following COMBINED_MULTI_INTERVAL gvcf is identical to the gVCF in the previous line except at the partition break
106     //position
107     //The previous file has the following line:
108     //chr20   17970000        .       G       <NON_REF>       .       .       END=17970001
109     //
110     //while this file has:
111     //chr20   17970000        .       G       <NON_REF>       .       .       .
112     //chr20   17970001        .       G       <NON_REF>       .       .       .
113     //
114     private static final String COMBINED_MULTI_INTERVAL = largeFileTestDir + "gvcfs/combined_multi_interval.gatk3.7.g.vcf.gz";
115     private static final String COMBINED_WITHSPACES = largeFileTestDir + "gvcfs/combined.gatk3.7.smaller_interval.g.vcf";
116     private static final ArrayList<SimpleInterval> INTERVAL =
117             new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("chr20", 17960187, 17981445)));
118     private static final ArrayList<SimpleInterval> INTERVAL_NOTFULL =
119             new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("chr20", 1, 17960187)));
120     private static final ArrayList<SimpleInterval> INTERVAL_20_21 =
121             new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("chr20"),new SimpleInterval("chr21")));
122     private static final ArrayList<SimpleInterval> MULTIPLE_INTERVALS = new ArrayList<SimpleInterval>(Arrays.asList(
123         new SimpleInterval("chr20", 17960187, 17970000),
124         new SimpleInterval("chr20", 17970001, 17980000),
125         new SimpleInterval("chr20", 17980001, 17981445)
126     ));
127     private static final ArrayList<SimpleInterval> MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS =
128         new ArrayList<SimpleInterval>(Arrays.asList(
129             new SimpleInterval("chr20", 17960187, 17969999),
130             new SimpleInterval("chr20", 17970000, 17980000),
131             new SimpleInterval("chr20", 17980001, 17981445)
132     ));
133     private static final ArrayList<SimpleInterval> MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS =
134         new ArrayList<SimpleInterval>(Arrays.asList(
135             new SimpleInterval("chr20", 17960187, 17969999),
136             new SimpleInterval("chr20", 17980001, 17981445),
137             new SimpleInterval("chr21", 29477554, 29486255)
138     ));
139     private static final ArrayList<SimpleInterval> INTERVAL_3736 =
140             new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("chr6",130365070,146544250)));
141     private static final ArrayList<SimpleInterval> INTERVAL_NONDIPLOID =
142             new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("20", 10000000, 10100000)));
143     private static final ArrayList<SimpleInterval> SMALLER_INTERVAL =
144             new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("chr20", 17960187, 17961973)));
145     private static final VCFHeader VCF_HEADER = VariantContextTestUtils.getCompleteHeader();
146     private static final String SAMPLE_NAME_KEY = "SN";
147     private static final String ANOTHER_ATTRIBUTE_KEY = "AA";
148 
149     private static final List<String> GVCFS_WITH_NEW_MQ = Arrays.asList(NA12878_HG37, getTestDataDir() + "/walkers/CombineGVCFs/YRIoffspring.chr20snippet.g.vcf");
150     private static final String COMBINED_WITH_NEW_MQ = toolsTestDir + "/walkers/GenomicsDBImport/newMQcalc.combined.g.vcf";
151     private static final List<SimpleInterval> INTERVAL2 = Arrays.asList(new SimpleInterval("20", 1, 11_000_000));
152     private static final List<String> ATTRIBUTES_TO_IGNORE = Arrays.asList("RAW_MQ","RAW_MQandDP");  //CombineGVCFs doesn't support the old RAW_MQ anymore
153     // we're using vcfs instead of gvcfs for many contigs test, and these attributes don't have default combine operations in GenomicsDB
154     private static final List<String> MANY_CONTIGS_ATTRIBUTES_TO_IGNORE = Arrays.asList("HaplotypeScore","MLEAC", "MLEAF");
155     private static final String P717 = largeFileTestDir + "Ptrichocarpa.v3.sorted.p717.vcf";
156     private static final String P717_2 = largeFileTestDir + "Ptrichocarpa.v3.sorted.p717_2.vcf";
157     private static final List<String> MANY_CONTIGS_VCF = Arrays.asList(P717, P717_2);
158     private static final String EXPECTED_SEVERAL_CONTIGS_VCF = largeFileTestDir + "Ptrichocarpa.v3.p717.p717_2.combined.final.expected.vcf";
159     private static final String MANY_CONTIGS_REF = largeFileTestDir + "Populus_trichocarpa.Pop_tri_v3.dna.nonchromosomal_subset_renamed.fa";
160     // scaffold_3123 has been removed to test non adjacent interval list works (after scaffold_3381 in header)
161     private static final List<String> MANY_CONTIGS_NON_ADJACENT_INTERVALS = Arrays.asList("scaffold_3121", "scaffold_3427", "scaffold_3213", "scaffold_3050", "scaffold_3381",
162         "scaffold_3472", "scaffold_2907", "scaffold_3046", "scaffold_3412", "scaffold_3304", "scaffold_3332", "scaffold_3326", "scaffold_3230",
163         "scaffold_3160", "scaffold_3403", "scaffold_2851", "scaffold_3416", "scaffold_3340", "scaffold_2911", "scaffold_3442", "scaffold_3681", "scaffold_2889",
164         "scaffold_3305", "scaffold_3335", "scaffold_3316", "scaffold_3126", "scaffold_3363", "scaffold_2844", "scaffold_3388", "scaffold_3285", "scaffold_2968",
165         "scaffold_3074", "scaffold_3436", "scaffold_3289", "scaffold_3264", "scaffold_2919", "scaffold_3422", "scaffold_3393", "scaffold_3387", "scaffold_3453",
166         "scaffold_3171", "scaffold_3372", "scaffold_3389", "scaffold_3259", "scaffold_2930", "scaffold_3129", "scaffold_3044", "scaffold_3147", "scaffold_2885",
167         "scaffold_3452", "scaffold_3202", "scaffold_3263", "scaffold_3354", "scaffold_3134", "scaffold_3255", "scaffold_3320", "scaffold_3523", "scaffold_3432",
168         "scaffold_3239", "scaffold_3206", "scaffold_3437", "scaffold_2922", "scaffold_3136", "scaffold_3292", "scaffold_3391", "scaffold_3061", "scaffold_3250",
169         "scaffold_3226", "scaffold_2857", "scaffold_3528", "scaffold_3325", "scaffold_3296", "scaffold_3298", "scaffold_2924", "scaffold_3157", "scaffold_2855",
170         "scaffold_3275", "scaffold_3007", "scaffold_3306", "scaffold_3179", "scaffold_3060", "scaffold_3222", "scaffold_3648", "scaffold_3005", "scaffold_3020",
171         "scaffold_3194", "scaffold_3328", "scaffold_3251", "scaffold_3547", "scaffold_3342", "scaffold_3139", "scaffold_3262", "scaffold_3210", "scaffold_2981",
172         "scaffold_2933", "scaffold_3056", "scaffold_3413", "scaffold_3064", "scaffold_3353", "scaffold_2913", "scaffold_3445", "scaffold_3374", "scaffold_3214",
173         "scaffold_3423", "scaffold_3095", "scaffold_2965", "scaffold_3357", "scaffold_3021", "scaffold_3228", "scaffold_3300", "scaffold_3042", "scaffold_3312",
174         "scaffold_3537", "scaffold_3058", "scaffold_3425", "scaffold_3431", "scaffold_3368", "scaffold_2951", "scaffold_3356", "scaffold_3116", "scaffold_3257",
175         "scaffold_3478", "scaffold_3068", "scaffold_3008", "scaffold_2893", "scaffold_3088", "scaffold_3269", "scaffold_3245", "scaffold_3190", "scaffold_3054",
176         "scaffold_3383", "scaffold_3346", "scaffold_3223", "scaffold_3446", "scaffold_3370", "scaffold_3252", "scaffold_3053", "scaffold_3100", "scaffold_2838",
177         "scaffold_3272", "scaffold_3384", "scaffold_2868", "scaffold_3398", "scaffold_3107", "scaffold_3014", "scaffold_3364", "scaffold_2987", "scaffold_3191",
178         "scaffold_3076", "scaffold_3246", "scaffold_3011", "scaffold_3348", "scaffold_3231", "scaffold_3448", "scaffold_3360", "scaffold_3352", "scaffold_3294",
179         "scaffold_2853", "scaffold_3024", "scaffold_3426", "scaffold_3379", "scaffold_3440", "scaffold_3550", "scaffold_2879", "scaffold_3362", "scaffold_3236");
180     private static final int SEVERAL_CONTIGS = 7;
181     private static final String MANY_CONTIGS_INTERVAL_PICARD_STYLE_EXPECTED =
182             toolsTestDir + "GenomicsDBImport/Ptrichocarpa.v3.expected.interval_list";
183 
184     @Override
getTestedClassName()185     public String getTestedClassName() {
186         return GenomicsDBImport.class.getSimpleName();
187     }
188 
189     @DataProvider(name="batchSizes")
batchSizes()190     public Object[][] batchSizes() {
191         return new Object[][] {
192                 new Object[]{1},
193                 new Object[]{2},
194                 new Object[]{3},
195                 new Object[]{4},
196                 new Object[]{100},
197         };
198     }
199 
200     @Test
testGenomicsDBImportFileInputs()201     public void testGenomicsDBImportFileInputs() throws IOException {
202         testGenomicsDBImporter(LOCAL_GVCFS, INTERVAL, COMBINED, b38_reference_20_21, true, 1);
203     }
204 
205     @Test
testGenomicsDBImportFileInputs_newMQ()206     public void testGenomicsDBImportFileInputs_newMQ() throws IOException {
207         testGenomicsDBImporter_newMQ(GVCFS_WITH_NEW_MQ, INTERVAL2, COMBINED_WITH_NEW_MQ, b37_reference_20_21, true, Collections.emptyList());
208     }
209 
210     @Test
testGenomicsDBImportFileInputsWithMultipleIntervals()211     public void testGenomicsDBImportFileInputsWithMultipleIntervals() throws IOException {
212         testGenomicsDBImporter(LOCAL_GVCFS, MULTIPLE_INTERVALS, COMBINED_MULTI_INTERVAL, b38_reference_20_21, true, 1);
213     }
214 
215     @Test(timeOut = 1000000)
testGenomicsDBImportWith1000IntervalsToBeMerged()216     public void testGenomicsDBImportWith1000IntervalsToBeMerged() throws IOException {
217         final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace";
218         LinkedList<SimpleInterval> intervals = new LinkedList<SimpleInterval>();
219         //[ 17960187, 17981445 ]
220         int base = 17960187;
221         for (int i = 0; i < 1000; ++i)
222             intervals.add(new SimpleInterval("chr20", base + 20 * i, base + 20 * i + 10)); //intervals of size 10 separated by 10
223         writeToGenomicsDB(new ArrayList<String>(Arrays.asList(LOCAL_GVCFS.get(0))), intervals, workspace, 0,
224                 false, 0, 1, true);
225     }
226 
227     @Test
testGenomicsDBImportFileInputsAgainstCombineGVCF()228     public void testGenomicsDBImportFileInputsAgainstCombineGVCF() throws IOException {
229         testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, INTERVAL, b38_reference_20_21, new String[0]);
230     }
231 
232     @Test
testGenomicsDBImportFileInputsAgainstCombineGVCFMergeContigsToSinglePartition()233     public void testGenomicsDBImportFileInputsAgainstCombineGVCFMergeContigsToSinglePartition() throws IOException {
234         testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, INTERVAL_20_21, b38_reference_20_21, new String[0], 1, 1, false);
235     }
236 
237     @Test
testGenomicsDBImportMergeContigsManyNonAdjacentContigsToSeveralContigs()238     public void testGenomicsDBImportMergeContigsManyNonAdjacentContigsToSeveralContigs() throws IOException {
239         List<SimpleInterval> manyContigs = MANY_CONTIGS_NON_ADJACENT_INTERVALS.stream().map(SimpleInterval::new).collect(Collectors.toList());
240         final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace";
241 
242         writeToGenomicsDB(MANY_CONTIGS_VCF, manyContigs, workspace, 0, false, 0, 1, false, false, false, SEVERAL_CONTIGS, false);
243         checkJSONFilesAreWritten(workspace);
244         checkGenomicsDBAgainstExpected(workspace, manyContigs, EXPECTED_SEVERAL_CONTIGS_VCF, MANY_CONTIGS_REF, true,
245                 MANY_CONTIGS_ATTRIBUTES_TO_IGNORE, true, false);
246     }
247 
248     @Test(expectedExceptions = {UserException.class}, expectedExceptionsMessageRegExp=".*entire contigs be specified.*")
testGenomicsDBMergeContigsThrowsOnNotInputIntervalLessThanContigLength()249     public void testGenomicsDBMergeContigsThrowsOnNotInputIntervalLessThanContigLength() throws IOException {
250         testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, INTERVAL_NOTFULL, b38_reference_20_21, new String[0], 1, 1, false);
251     }
252 
253     @Test
testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleIntervals()254     public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleIntervals() throws IOException {
255         testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, b38_reference_20_21, new String[0]);
256     }
257 
258     @Test
testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleIntervalsWithMultipleThreads()259     public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleIntervalsWithMultipleThreads() throws IOException {
260         testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, b38_reference_20_21,
261                 new String[0], 4);
262     }
263 
264     @Test
testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleNonAdjacentIntervals()265     public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleNonAdjacentIntervals() throws IOException {
266         testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS,
267             b38_reference_20_21, new String[0]);
268     }
269 
270     @Test
testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleNonAdjacentIntervalsForFilesProducedAfterCombineGVCFs()271     public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleNonAdjacentIntervalsForFilesProducedAfterCombineGVCFs()
272         throws IOException {
273         //this test covers the scenario where the input vcfs have spanning deletions
274         testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS_AFTER_COMBINE_GVCFS, MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS,
275             b38_reference_20_21, new String[0]);
276     }
277 
278     @Test
testGenomicsDBImportFileInputsAgainstCombineGVCFWithNonDiploidData()279     public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithNonDiploidData() throws IOException {
280         testGenomicsDBImporterWithGenotypes(Arrays.asList(NA12878_HG37, MULTIPLOID_DATA_HG37), INTERVAL_NONDIPLOID,
281                 MULTIPLOID_EXPECTED_RESULT, b37_reference_20_21,
282                 true,
283                 false,
284                 false);
285     }
286 
287     @Test
testGenomicsDBImportPhasedData()288     public void testGenomicsDBImportPhasedData() throws IOException {
289         testGenomicsDBImporterWithGenotypes(Arrays.asList(NA_12878_PHASED), INTERVAL, NA_12878_PHASED, b37_reference_20_21);
290     }
291 
292     @Test
testGenomicsDBImportPhasedDataWithMultipleIntervals()293     public void testGenomicsDBImportPhasedDataWithMultipleIntervals() throws IOException {
294         testGenomicsDBImporterWithGenotypes(Arrays.asList(NA_12878_PHASED), MULTIPLE_INTERVALS, NA_12878_PHASED, b37_reference_20_21);
295     }
296 
297     @Test
testGenomicsDBImportArtificialPhasedData()298     public void testGenomicsDBImportArtificialPhasedData() throws IOException {
299         ArrayList<SimpleInterval> intervals = new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("1", 10109, 10297)));
300         testGenomicsDBImporterWithGenotypes(Arrays.asList(ARTIFICIAL_PHASED), intervals, ARTIFICIAL_PHASED, b37_reference_20_21);
301     }
302 
303     @Test
testGenomicsDBThreeLargeSamplesWithGenotypes()304     public void testGenomicsDBThreeLargeSamplesWithGenotypes() throws IOException {
305         ArrayList<SimpleInterval> intervals = new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("chr20", 1, 64444167)));
306         testGenomicsDBImporterWithGenotypes(LOCAL_GVCFS, intervals, COMBINED_WITH_GENOTYPES, b38_reference_20_21, true, true, false);
307     }
308 
309     @Test
testGenomicsDBThreeLargeSamplesSitesOnlyQuery()310     public void testGenomicsDBThreeLargeSamplesSitesOnlyQuery() throws IOException {
311         ArrayList<SimpleInterval> intervals = new ArrayList<SimpleInterval>(Arrays.asList(
312                     new SimpleInterval("chr20", 1, 64444167),
313                     new SimpleInterval("chr21", 1, 46709983)));
314         testGenomicsDBImporterWithGenotypes(LOCAL_GVCFS, intervals, COMBINED_SITES_ONLY, b38_reference_20_21, true, true, true);
315     }
316 
317     @Test(expectedExceptions={UserException.BadInput.class}, expectedExceptionsMessageRegExp=".*GenomicsDBImport does not support GVCFs.*")
testGenomicsDbImportThrowsOnMnp()318     public void testGenomicsDbImportThrowsOnMnp() throws IOException {
319         for (int threads = 1; threads <= 2; ++threads) {
320             testGenomicsDBImporter(
321                     Collections.singletonList(MNP_GVCF),
322                     Collections.singletonList(new SimpleInterval("20", 69700, 69900)),
323                     null, // Should never produce a VCF
324                     b38_reference_20_21,
325                     true,
326                     threads
327             );
328         }
329     }
330 
testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile)331     private void testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, final List<SimpleInterval> intervals,
332                                                      final String expectedCombinedVCF,
333                                                       final String referenceFile) throws IOException {
334         testGenomicsDBImporterWithGenotypes(vcfInputs, intervals,
335                 expectedCombinedVCF, referenceFile,
336                 false,
337                 true,
338                 false);
339     }
340 
testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile, final boolean testAll)341     private void testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, final List<SimpleInterval> intervals,
342                                                       final String expectedCombinedVCF, final String referenceFile,
343                                                      final boolean testAll) throws IOException {
344         testGenomicsDBImporterWithGenotypes(vcfInputs, intervals,
345                 expectedCombinedVCF, referenceFile,
346                 testAll,
347                 false,
348                 false);
349     }
350 
testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile, final boolean testAll, final boolean produceGTField, final boolean sitesOnlyQuery)351     private void testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, final List<SimpleInterval> intervals,
352                                                       final String expectedCombinedVCF, final String referenceFile,
353                                                      final boolean testAll,
354                                                      final boolean produceGTField,
355                                                      final boolean sitesOnlyQuery) throws IOException {
356          testGenomicsDBImporterWithGenotypes(vcfInputs, intervals, expectedCombinedVCF, referenceFile, testAll, produceGTField,
357                  sitesOnlyQuery, false);
358     }
359 
testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile, final boolean testAll, final boolean produceGTField, final boolean sitesOnlyQuery, final boolean useNativeReader)360     private void testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, final List<SimpleInterval> intervals,
361                                                       final String expectedCombinedVCF, final String referenceFile,
362                                                      final boolean testAll,
363                                                      final boolean produceGTField,
364                                                      final boolean sitesOnlyQuery,
365                                                      final boolean useNativeReader) throws IOException {
366         final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace";
367 
368         writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, 1, false, false, false, 0, useNativeReader);
369         checkJSONFilesAreWritten(workspace);
370         checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, referenceFile, testAll, ATTRIBUTES_TO_IGNORE, produceGTField, sitesOnlyQuery);
371         checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, referenceFile, testAll, ATTRIBUTES_TO_IGNORE, produceGTField, sitesOnlyQuery, true);
372     }
373 
runCombineGVCFs(final List<String> inputs, final List<SimpleInterval> intervals, final String reference, final String[] extraArgs)374     private File runCombineGVCFs(final List<String> inputs, final List<SimpleInterval> intervals, final String reference, final String[] extraArgs) {
375         final File output = createTempFile("genotypegvcf", ".vcf");
376 
377         final ArgumentsBuilder args = new ArgumentsBuilder();
378         args.addReference(new File(reference))
379                 .addOutput(output);
380         for (String input: inputs) {
381             args.add("V", input);
382         }
383         intervals.forEach(args::addInterval);
384         Arrays.stream(extraArgs).forEach(args::addRaw);
385 
386         Utils.resetRandomGenerator();
387         new Main().instanceMain(makeCommandLineArgs(args.getArgsList(), "CombineGVCFs"));
388         return output;
389     }
390 
testGenomicsDBAgainstCombineGVCFs(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String referenceFile, final String[] CombineGVCFArgs)391     private void testGenomicsDBAgainstCombineGVCFs(final List<String> vcfInputs, final List<SimpleInterval> intervals,
392                                                    final String referenceFile, final String[] CombineGVCFArgs) throws IOException {
393         testGenomicsDBAgainstCombineGVCFs(vcfInputs, intervals, referenceFile, CombineGVCFArgs, 1);
394     }
395 
testGenomicsDBAgainstCombineGVCFs(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String referenceFile, final String[] CombineGVCFArgs, final int numVCFReaderThreadsInImporter)396     private void testGenomicsDBAgainstCombineGVCFs(final List<String> vcfInputs, final List<SimpleInterval> intervals,
397                                                    final String referenceFile, final String[] CombineGVCFArgs,
398                                                    final int numVCFReaderThreadsInImporter) throws IOException {
399         testGenomicsDBAgainstCombineGVCFs(vcfInputs, intervals, referenceFile, CombineGVCFArgs, numVCFReaderThreadsInImporter, 0, false);
400     }
401 
testGenomicsDBAgainstCombineGVCFs(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String referenceFile, final String[] CombineGVCFArgs, final int numVCFReaderThreadsInImporter, final int chrsToPartitions, final boolean useNativeReader)402     private void testGenomicsDBAgainstCombineGVCFs(final List<String> vcfInputs, final List<SimpleInterval> intervals,
403                                                    final String referenceFile, final String[] CombineGVCFArgs,
404                                                    final int numVCFReaderThreadsInImporter, final int chrsToPartitions,
405                                                    final boolean useNativeReader) throws IOException {
406         final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace";
407 
408         writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, numVCFReaderThreadsInImporter, false, false, false,
409                           chrsToPartitions, useNativeReader);
410         checkJSONFilesAreWritten(workspace);
411         for(SimpleInterval currInterval : intervals) {
412             List<SimpleInterval> tmpList = new ArrayList<SimpleInterval>(Arrays.asList(currInterval));
413             File expectedCombinedVCF = runCombineGVCFs(vcfInputs, tmpList, referenceFile, CombineGVCFArgs);
414             checkGenomicsDBAgainstExpected(workspace, tmpList, expectedCombinedVCF.getAbsolutePath(), referenceFile, true, ATTRIBUTES_TO_IGNORE);
415         }
416     }
417 
418     @Test(groups = {"bucket"})
testGenomicsDBImportGCSInputs()419     public void testGenomicsDBImportGCSInputs() throws IOException {
420         testGenomicsDBImporter(resolveLargeFilesAsCloudURIs(LOCAL_GVCFS), INTERVAL, COMBINED, b38_reference_20_21, true, 1);
421     }
422 
423     @Test
testGenomicsDBAbsolutePathDependency()424     public void testGenomicsDBAbsolutePathDependency() throws IOException {
425         final File workspace = createTempDir("genomicsdb-tests-");
426         final File workspace2 = createTempDir("genomicsdb-secondary-tests-");
427 
428         writeToGenomicsDB(LOCAL_GVCFS, INTERVAL, workspace.getAbsolutePath() + "/workspace", 0, false, 0, 1);
429         checkJSONFilesAreWritten(workspace.getAbsolutePath() + "/workspace");
430         Files.move(workspace.toPath(), workspace2.toPath(), StandardCopyOption.REPLACE_EXISTING);
431         checkGenomicsDBAgainstExpected(workspace2.getAbsolutePath() + "/workspace", INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE);
432     }
433 
434     @Test (enabled = true)
testGenomicsDBAlleleSpecificAnnotations()435     public void testGenomicsDBAlleleSpecificAnnotations() throws IOException {
436         testGenomicsDBAgainstCombineGVCFs(Arrays.asList(COMBINEGVCFS_TEST_DIR+"NA12878.AS.chr20snippet.g.vcf", COMBINEGVCFS_TEST_DIR+"NA12892.AS.chr20snippet.g.vcf"),
437                 new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("20", 10433000, 10700000))),
438                 b37_reference_20_21,
439                 new String[]{"-G", "StandardAnnotation", "-G", "AS_StandardAnnotation"});
440     }
441 
442     @Test (enabled = true)
testGenomicsDBAlleleSpecificAnnotationsInTheMiddleOfSpanningDeletion()443     public void testGenomicsDBAlleleSpecificAnnotationsInTheMiddleOfSpanningDeletion() throws IOException {
444         testGenomicsDBAgainstCombineGVCFs(Arrays.asList(COMBINEGVCFS_TEST_DIR+"NA12878.AS.chr20snippet.g.vcf", COMBINEGVCFS_TEST_DIR+"NA12892.AS.chr20snippet.g.vcf"),
445                 new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("20", 10433313, 10700000))),
446                 b37_reference_20_21,
447                 new String[]{"-G", "StandardAnnotation", "-G", "AS_StandardAnnotation"});
448     }
449 
450     /**
451      * Converts a list of large file paths into equivalent cloud paths
452      * This must be done non-statically because any failure during static initialization results in hard to understand
453      * TestNG errors and it is possible for {@link BaseTest#getGCPTestInputPath()} to fail if the environment isn't
454      * fully set up.
455      *
456      * The cloud bucket must be organized the same way as the local test files in order to resolve correctly.
457      */
resolveLargeFilesAsCloudURIs(final List<String> filenames)458     private static List<String> resolveLargeFilesAsCloudURIs(final List<String> filenames){
459         return filenames.stream()
460                 .map( filename -> filename.replace(publicTestDir, getGCPTestInputPath()))
461                 .peek( filename -> Assert.assertTrue(BucketUtils.isGcsUrl(filename)))
462                 .collect(Collectors.toList());
463     }
464 
465     @Test(dataProvider = "batchSizes")
testGenomicsDBImportFileInputsInBatches(final int batchSize)466     public void testGenomicsDBImportFileInputsInBatches(final int batchSize) throws IOException {
467         testGenomicsDBImporterWithBatchSize(LOCAL_GVCFS, INTERVAL, COMBINED, batchSize);
468     }
469 
470     @Test(dataProvider = "batchSizes")
testGenomicsDBImportFileInputsInBatchesWithMultipleIntervals(final int batchSize)471     public void testGenomicsDBImportFileInputsInBatchesWithMultipleIntervals(final int batchSize) throws IOException {
472         testGenomicsDBImporterWithBatchSize(LOCAL_GVCFS, MULTIPLE_INTERVALS, COMBINED_MULTI_INTERVAL, batchSize);
473     }
474 
475     @Test(groups = {"bucket"}, dataProvider = "batchSizes")
testGenomicsDBImportGCSInputsInBatches(final int batchSize)476     public void testGenomicsDBImportGCSInputsInBatches(final int batchSize) throws IOException {
477         testGenomicsDBImporterWithBatchSize(resolveLargeFilesAsCloudURIs(LOCAL_GVCFS), INTERVAL, COMBINED, batchSize);
478     }
479 
480     @DataProvider
getThreads()481     public Object[][] getThreads(){
482         return new Object[][] {
483                 {1}, {2}, {5}
484         };
485     }
486 
487     @Test(groups = {"bucket"}, dataProvider = "getThreads")
testDifferentThreadValuesFromABucket(final int threads)488     public void testDifferentThreadValuesFromABucket(final int threads) throws IOException {
489         final List<String> vcfInputs = resolveLargeFilesAsCloudURIs(LOCAL_GVCFS);
490         final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace";
491 
492         writeToGenomicsDB(vcfInputs, INTERVAL, workspace, 0, false, 0, threads);
493         checkJSONFilesAreWritten(workspace);
494         checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE);
495     }
496 
497     @Test(dataProvider = "getThreads")
testDifferentThreadValuesLocally(final int threads)498     public void testDifferentThreadValuesLocally(final int threads) throws IOException {
499         final List<String> vcfInputs = LOCAL_GVCFS;
500         final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace";
501 
502         writeToGenomicsDB(vcfInputs, INTERVAL, workspace, 0, false, 0, threads);
503         checkJSONFilesAreWritten(workspace);
504         checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE);
505     }
506     /**
507      *
508      * @throws CommandLineException.OutOfRangeArgumentValue  Value must be >= 1024 bytes
509      */
510     @Test(expectedExceptions = CommandLineException.OutOfRangeArgumentValue.class)
testZeroVCFBufferSize()511     public void testZeroVCFBufferSize() throws IOException {
512         testGenomicsDBImportWithZeroBufferSize(LOCAL_GVCFS, INTERVAL, COMBINED);
513     }
514 
515 
testGenomicsDBImporter(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile, final boolean testAll, final int threads)516     private void testGenomicsDBImporter(final List<String> vcfInputs, final List<SimpleInterval> intervals,
517                                         final String expectedCombinedVCF, final String referenceFile,
518                                         final boolean testAll, final int threads) throws IOException {
519         testGenomicsDBImporter(vcfInputs, intervals, expectedCombinedVCF, referenceFile, testAll, threads, false);
520     }
521 
testGenomicsDBImporter(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile, final boolean testAll, final int threads, final boolean useNativeReader)522     private void testGenomicsDBImporter(final List<String> vcfInputs, final List<SimpleInterval> intervals,
523                                         final String expectedCombinedVCF, final String referenceFile,
524                                         final boolean testAll, final int threads, final boolean useNativeReader) throws IOException {
525         final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace";
526         writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, 1, false, false, false, 0, useNativeReader);
527 
528         checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, referenceFile, testAll, ATTRIBUTES_TO_IGNORE);
529     }
530 
testGenomicsDBImporter_newMQ(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile, final boolean testAll, final List<String> attributesToIgnore)531     private void testGenomicsDBImporter_newMQ(final List<String> vcfInputs, final List<SimpleInterval> intervals,
532                                         final String expectedCombinedVCF, final String referenceFile,
533                                         final boolean testAll, final List<String> attributesToIgnore) throws IOException {
534         final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace";
535 
536         writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, 1);
537         checkJSONFilesAreWritten(workspace);
538 
539         checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, referenceFile, testAll, attributesToIgnore);
540     }
541 
testGenomicsDBImporterWithBatchSize(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final int batchSize)542     private void testGenomicsDBImporterWithBatchSize(final List<String> vcfInputs, final List<SimpleInterval> intervals,
543                                                      final String expectedCombinedVCF, final int batchSize) throws IOException {
544         testGenomicsDBImporterWithBatchSize(vcfInputs, intervals, expectedCombinedVCF, batchSize, false);
545     }
546 
testGenomicsDBImporterWithBatchSize(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final int batchSize, final boolean useNativeReader)547     private void testGenomicsDBImporterWithBatchSize(final List<String> vcfInputs, final List<SimpleInterval> intervals,
548                                                      final String expectedCombinedVCF, final int batchSize,
549                                                      final boolean useNativeReader) throws IOException {
550         final String workspace = createTempDir("genomicsdb-batchsize-tests-").getAbsolutePath() + "/workspace-" + batchSize;
551 
552         writeToGenomicsDB(vcfInputs, intervals, workspace, batchSize, false, 0, 1, false, false, false, 0, true);
553         checkJSONFilesAreWritten(workspace);
554         checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE);
555     }
556 
testGenomicsDBImportWithZeroBufferSize(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF)557     private void testGenomicsDBImportWithZeroBufferSize(final List<String> vcfInputs, final List<SimpleInterval> intervals,
558                                                         final String expectedCombinedVCF) throws IOException {
559         final String workspace = createTempDir("genomicsdb-buffersize-tests-").getAbsolutePath() + "/workspace";
560 
561         writeToGenomicsDB(vcfInputs, intervals, workspace, 0, true, 0, 1);
562         checkJSONFilesAreWritten(workspace);
563         checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE);
564 
565     }
566 
writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String workspace, final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads)567     private void writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String workspace,
568                                    final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads) {
569         writeToGenomicsDB(vcfInputs, intervals, workspace, batchSize, useBufferSize, bufferSizePerSample, threads, false);
570     }
571 
writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String workspace, final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads, final boolean mergeIntervals)572     private void writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String workspace,
573                                    final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads, final boolean mergeIntervals) {
574         writeToGenomicsDB(vcfInputs, intervals, workspace, batchSize, useBufferSize, bufferSizePerSample, threads, mergeIntervals, false, false);
575     }
576 
writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String workspace, final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads, final boolean mergeIntervals, final boolean overwriteWorkspace, final boolean incremental)577     private void writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String workspace,
578                                    final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads,
579                                    final boolean mergeIntervals, final boolean overwriteWorkspace, final boolean incremental) {
580         writeToGenomicsDB(vcfInputs, intervals, workspace, batchSize, useBufferSize, bufferSizePerSample, threads, mergeIntervals,
581                           overwriteWorkspace, incremental, 0, false);
582     }
583 
writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String workspace, final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads, final boolean mergeIntervals, final boolean overwriteWorkspace, final boolean incremental, final int chrsToPartitions, final boolean useNativeReader)584     private void writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String workspace,
585                                    final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads,
586                                    final boolean mergeIntervals, final boolean overwriteWorkspace, final boolean incremental,
587                                    final int chrsToPartitions, final boolean useNativeReader) {
588         final ArgumentsBuilder args = new ArgumentsBuilder();
589         if (incremental) {
590             args.add(GenomicsDBImport.INCREMENTAL_WORKSPACE_ARG_LONG_NAME, workspace);
591         } else {
592             args.add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, workspace);
593         }
594         intervals.forEach(args::addInterval);
595         vcfInputs.forEach(vcf -> args.add("V", vcf));
596         args.add("batch-size", String.valueOf(batchSize));
597         args.add(GenomicsDBImport.VCF_INITIALIZER_THREADS_LONG_NAME, String.valueOf(threads));
598         args.add(GenomicsDBImport.MERGE_INPUT_INTERVALS_LONG_NAME, mergeIntervals);
599         args.add(GenomicsDBImport.OVERWRITE_WORKSPACE_LONG_NAME, overwriteWorkspace);
600         if (chrsToPartitions != 0) {
601             args.add(GenomicsDBImport.MERGE_CONTIGS_INTO_NUM_PARTITIONS, String.valueOf(chrsToPartitions));
602         }
603         if (useBufferSize) {
604             args.add("genomicsdb-vcf-buffer-size", String.valueOf(bufferSizePerSample));
605         }
606 
607         runCommandLine(args);
608         if (chrsToPartitions != 0) {
609             String[] partitions = GenomicsDBUtils.listGenomicsDBArrays(workspace);
610             // it may not always be the case that the number of partitions created matches
611             // the number we specified, but will be true for our tests
612             Assert.assertTrue(partitions.length == chrsToPartitions);
613         }
614     }
615 
checkJSONFilesAreWritten(final String workspace)616     private static void checkJSONFilesAreWritten(final String workspace) {
617         Assert.assertTrue(BucketUtils.fileExists(IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME)));
618         Assert.assertTrue(BucketUtils.fileExists(IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME)));
619         Assert.assertTrue(BucketUtils.fileExists(IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME)));
620     }
621 
checkGenomicsDBAgainstExpected(final String workspace, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile, final boolean testAll, final List<String> attributesToIgnore)622     private static void checkGenomicsDBAgainstExpected(final String workspace, final List<SimpleInterval> intervals,
623                                                        final String expectedCombinedVCF, final String referenceFile,
624                                                        final boolean testAll, final List<String> attributesToIgnore) throws IOException {
625         checkGenomicsDBAgainstExpected(workspace, intervals,
626                 expectedCombinedVCF, referenceFile,
627                 testAll,
628                 attributesToIgnore,
629                 false,
630                 false,
631                 false);
632     }
633 
checkGenomicsDBAgainstExpected(final String workspace, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile, final boolean testAll, final List<String> attributesToIgnore, final boolean produceGTfield, final boolean sitesOnlyQuery)634     private static void checkGenomicsDBAgainstExpected(final String workspace, final List<SimpleInterval> intervals,
635                                                        final String expectedCombinedVCF, final String referenceFile,
636                                                        final boolean testAll, final List<String> attributesToIgnore,
637                                                        final boolean produceGTfield, final boolean sitesOnlyQuery) throws IOException {
638         checkGenomicsDBAgainstExpected(workspace, intervals,
639                 expectedCombinedVCF, referenceFile,
640                 testAll,
641                 attributesToIgnore,
642                 produceGTfield,
643                 sitesOnlyQuery,
644                 false);
645     }
646 
checkGenomicsDBAgainstExpected(final String workspace, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile, final boolean testAll, final List<String> attributesToIgnore, final boolean produceGTField, final boolean sitesOnlyQuery, final boolean useVCFCodec)647     private static void checkGenomicsDBAgainstExpected(final String workspace, final List<SimpleInterval> intervals,
648                                                        final String expectedCombinedVCF, final String referenceFile,
649                                                        final boolean testAll,
650                                                        final List<String> attributesToIgnore,
651                                                        final boolean produceGTField,
652                                                        final boolean sitesOnlyQuery,
653                                                        final boolean useVCFCodec) throws IOException {
654         final FeatureReader<VariantContext> genomicsDBFeatureReader =
655                 getGenomicsDBFeatureReader(workspace, referenceFile, produceGTField, sitesOnlyQuery, useVCFCodec);
656 
657         final AbstractFeatureReader<VariantContext, LineIterator> combinedVCFReader =
658                 AbstractFeatureReader.getFeatureReader(expectedCombinedVCF, new VCFCodec(), true);
659 
660 
661         intervals.forEach(interval -> {
662             try (CloseableTribbleIterator<VariantContext> actualVcs =
663                          genomicsDBFeatureReader.query(interval.getContig(), interval.getStart(), interval.getEnd());
664 
665                  CloseableTribbleIterator<VariantContext> expectedVcs =
666                          combinedVCFReader.query(interval.getContig(), interval.getStart(), interval.getEnd())) {
667 
668                 BaseTest.assertCondition(actualVcs, expectedVcs, (a, e) -> {
669                         // Test that the VCs match
670                     if (testAll) {
671                         // To correct a discrepancy between genotypeGVCFs which outputs empty genotypes as "./." and GenomicsDB
672                         // which returns them as "." we simply remap the empty ones to be consistent for comparison
673                         List<Genotype> genotypes = a.getGenotypes().stream()
674                                 .map(g -> g.getGenotypeString().equals(".")?new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(2)).make():g)
675                                 .collect(Collectors.toList());
676                         a = new VariantContextBuilder(a).genotypes(genotypes).make();
677                         VariantContextTestUtils.assertVariantContextsAreEqualAlleleOrderIndependent(a, e, attributesToIgnore, Collections.emptyList(), VCF_HEADER);
678 
679                         // Test only that the genotypes match
680                     } else {
681                         List<Genotype> genotypes = e.getGenotypes().stream()
682                                 .map(g -> g.getGenotypeString().equals(".")?new GenotypeBuilder(g).alleles(Collections.emptyList()).make():g)
683                                 .collect(Collectors.toList());
684                         e = new VariantContextBuilder(e).genotypes(genotypes).make();
685                         VariantContextTestUtils.assertVariantContextsHaveSameGenotypes(a, e);
686                     }
687                 });
688             } catch (IOException e) {
689                 Assert.fail(e.getMessage(), e);
690             }
691         });
692     }
693 
694     @DataProvider
getOrderingTests()695     public Iterator<Object[]> getOrderingTests(){
696         final File outOfOrderSampleMap = getSampleMapFile(
697                         "HG00268\t" + HG_00268 + "\n" +
698                         "NA19625\t" + NA_19625 + "\n" +
699                         "HG00096\t" + HG_00096);
700 
701         final List<Integer> batchSizes = Arrays.asList(0, 1, 2, 3, 4);
702         final List<Object[]> results = new ArrayList<>();
703         for( final Integer batchSize: batchSizes){
704             // -V in order
705             results.add(new Object[] {new ArgumentsBuilder()
706                     .add(GenomicsDBImport.BATCHSIZE_ARG_LONG_NAME, String.valueOf(batchSize))
707                     .addVCF(new File(HG_00096))
708                     .addVCF(new File(HG_00268))
709                     .addVCF(new File(NA_19625))});
710 
711             // -V out of order
712             results.add(new Object[] {new ArgumentsBuilder()
713                     .add(GenomicsDBImport.BATCHSIZE_ARG_LONG_NAME, String.valueOf(batchSize))
714                     .addVCF(new File(HG_00268))
715                     .addVCF(new File(NA_19625))
716                     .addVCF(new File(HG_00096))});
717 
718             //in order sample map
719             results.add(new Object[] {new ArgumentsBuilder()
720                     .add(GenomicsDBImport.BATCHSIZE_ARG_LONG_NAME, String.valueOf(batchSize))
721                     .add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, createInOrderSampleMap())});
722 
723             //out of order sample map
724             results.add(new Object[] {new ArgumentsBuilder()
725                     .add(GenomicsDBImport.BATCHSIZE_ARG_LONG_NAME, String.valueOf(batchSize))
726                     .add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, outOfOrderSampleMap)});
727 
728             //out of order sample map with multiple threads
729             results.add(new Object[] {new ArgumentsBuilder()
730                     .add(GenomicsDBImport.BATCHSIZE_ARG_LONG_NAME, String.valueOf(batchSize))
731                     .add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, outOfOrderSampleMap)
732                     .add(GenomicsDBImport.VCF_INITIALIZER_THREADS_LONG_NAME, "2")});
733         }
734         return results.iterator();
735     }
736 
737     @Test
testSampleNameWithSpaces()738     public void testSampleNameWithSpaces() throws IOException {
739         final File outOfOrderSampleMap = getSampleMapFile(
740                 "HG00268 withSpaces\t" + HG_00268_WITH_SPACES + "\n" +
741                         "NA19625\t" + NA_19625 + "\n" +
742                         "HG00096\t" + HG_00096 );
743 
744         final String workspace = createTempDir("gendbtest").getAbsolutePath() + "/workspace";
745 
746         ArgumentsBuilder args = new ArgumentsBuilder()
747                 .add(GenomicsDBImport.BATCHSIZE_ARG_LONG_NAME, String.valueOf(2))
748                 .add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, outOfOrderSampleMap)
749                 .addInterval(SMALLER_INTERVAL.get(0))
750                 .add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, workspace);
751 
752         runCommandLine(args);
753         checkJSONFilesAreWritten(workspace);
754         checkGenomicsDBAgainstExpected(workspace, SMALLER_INTERVAL, COMBINED_WITHSPACES, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE);
755         checkGenomicsDBAgainstExpected(workspace, SMALLER_INTERVAL, COMBINED_WITHSPACES, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE, false, false, true);
756     }
757 
758     @Test(dataProvider = "getOrderingTests")
testSampleNameOrdering(final ArgumentsBuilder args)759     public void testSampleNameOrdering(final ArgumentsBuilder args) throws IOException {
760         final String workspace = createTempDir("gendbtest").getAbsolutePath() + "/workspace";
761 
762         args.addInterval(INTERVAL.get(0))
763             .add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, workspace);
764 
765         runCommandLine(args);
766         checkJSONFilesAreWritten(workspace);
767         checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE);
768         checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE, false, false, true);
769     }
770 
createInOrderSampleMap()771     private static File createInOrderSampleMap() {
772         final String sampleFileContents =
773                 "HG00096\t" +HG_00096 +"\n" +
774                 "HG00268\t"+ HG_00268 + "\n" +
775                 "NA19625\t"+ NA_19625;
776 
777         return getSampleMapFile(sampleFileContents);
778     }
779 
getSampleMapFile(final String sampleFileContents)780     private static File getSampleMapFile(final String sampleFileContents) {
781         final File sampleNameMap = IOUtils.writeTempFile(sampleFileContents, "sampleNameMap", ".txt");
782         sampleNameMap.deleteOnExit();
783         return sampleNameMap;
784     }
785 
getSampleMapFile(final Map<String, String> mapping)786     private static File getSampleMapFile(final Map<String, String> mapping){
787         return getSampleMapFile(mapping.entrySet()
788                 .stream()
789                 .map( pair -> pair.getKey() + "\t" + pair.getValue())
790                 .collect(Collectors.joining("\n")));
791     }
792 
793     @DataProvider
getRenameCombinations()794     public static Iterator<Object[]> getRenameCombinations() {
795         final Map<String,String> noRemapping = new LinkedHashMap<>();
796         noRemapping.put("s1", "s1");
797         noRemapping.put("s2", "s2");
798         noRemapping.put("s3", "s3");
799 
800         final Map<String,String> sameInput = new LinkedHashMap<>();
801         sameInput.put("s1", "s1");
802         sameInput.put("s2", "s1");
803         sameInput.put("s3", "s1");
804 
805 
806         final Map<String,String> sameInputWeirdOrder = new LinkedHashMap<>();
807         sameInputWeirdOrder.put("s3", "s1");
808         sameInputWeirdOrder.put("s1", "s1");
809         sameInputWeirdOrder.put("s2", "s1");
810 
811         final Map<String,String> swizzled = new LinkedHashMap<>();
812         swizzled.put("s2","s1");
813         swizzled.put("s3","s2");
814         swizzled.put("s1","s3");
815 
816         final Map<String,String> multipleOutOfOrderRenamingsAcrossBatches = new LinkedHashMap<>();
817         multipleOutOfOrderRenamingsAcrossBatches.put("s1", "s1");
818         multipleOutOfOrderRenamingsAcrossBatches.put("s2", "s2");
819         multipleOutOfOrderRenamingsAcrossBatches.put("s1_Renamed", "s1");
820         multipleOutOfOrderRenamingsAcrossBatches.put("Renamed_s2", "s2");
821         multipleOutOfOrderRenamingsAcrossBatches.put("s4", "s3");
822         multipleOutOfOrderRenamingsAcrossBatches.put("s3", "s3");
823         multipleOutOfOrderRenamingsAcrossBatches.put("someOtherSample", "s4");
824 
825 
826         final List<Integer> batchSizes = Arrays.asList(0, 1, 4);
827         final List<Integer> threads = Arrays.asList(1, 2);
828         final List<Map<String, String>> mappings = Arrays.asList(noRemapping, sameInput, sameInputWeirdOrder, swizzled, multipleOutOfOrderRenamingsAcrossBatches);
829         final List<Object[]> out = new ArrayList<>();
830         for(final Map<String,String> mapping : mappings){
831             for(final int batchSize :batchSizes){
832                 for(final int threading : threads){
833                     out.add( new Object[]{mapping, threading, batchSize});
834                 }
835             }
836         }
837         return out.iterator();
838     }
839 
840     @Test(dataProvider = "getRenameCombinations")
testRenamingSamples(final Map<String, String> renamingMap, final int threads, final int batchSize)841     public void testRenamingSamples(final Map<String, String> renamingMap, final int threads, final int batchSize) throws IOException {
842         final LinkedHashMap<String, String> sampleMap = new LinkedHashMap<>(renamingMap);
843         sampleMap.replaceAll( (newSampleName, originalSampleName)-> createInputVCF(originalSampleName).getAbsolutePath());
844 
845         final File sampleMapFile = getSampleMapFile(sampleMap);
846 
847         final String workspace = createTempDir("workspace").getAbsolutePath();
848         Files.delete(Paths.get(workspace));
849         final ArgumentsBuilder args = new ArgumentsBuilder()
850                 .add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, sampleMapFile.getAbsolutePath())
851                 .add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, new File(workspace).getAbsolutePath())
852                 .add(GenomicsDBImport.VCF_INITIALIZER_THREADS_LONG_NAME, String.valueOf(threads))
853                 .add(GenomicsDBImport.BATCHSIZE_ARG_LONG_NAME, String.valueOf(batchSize))
854                 .addInterval(INTERVAL.get(0));
855 
856         runCommandLine(args);
857         final Set<String> expectedSampleNames = sampleMap.keySet();
858         try(final FeatureReader<VariantContext> reader = getGenomicsDBFeatureReader(workspace, b37_reference_20_21)) {
859             final CloseableTribbleIterator<VariantContext> iterator = reader.iterator();
860             Assert.assertTrue(iterator.hasNext(), "expected to see a variant");
861             Assert.assertTrue(expectedSampleNames.size() > 0);
862             Assert.assertEquals(expectedSampleNames.size(), renamingMap.size());
863             iterator.forEachRemaining(vc -> {
864                 Assert.assertEquals(vc.getSampleNames().size(), expectedSampleNames.size());
865                 Assert.assertEqualsNoOrder(vc.getSampleNames().toArray(), expectedSampleNames.toArray());
866                 expectedSampleNames.forEach( sample -> {
867                     Assert.assertEquals(vc.getGenotype(sample).getAnyAttribute(SAMPLE_NAME_KEY), renamingMap.get(sample));
868                     //check another attribute just to make sure we're not mangling things
869                     Assert.assertEquals(VariantContextGetters.getAttributeAsInt(vc.getGenotype(sample), ANOTHER_ATTRIBUTE_KEY, -1), 10);
870                 });
871             });
872         }
873 
874     }
875 
createInputVCF(final String sampleName)876     private static File createInputVCF(final String sampleName) {
877         final String contig = "chr20";
878         final SAMSequenceDictionary dict = new SAMSequenceDictionary(
879                 Collections.singletonList(new SAMSequenceRecord(contig, 64444167)));
880 
881         final VCFFormatHeaderLine formatField = new VCFFormatHeaderLine(SAMPLE_NAME_KEY, 1, VCFHeaderLineType.String,
882                                                                         "the name of the sample this genotype came from");
883         final Set<VCFHeaderLine> headerLines = new HashSet<>();
884         headerLines.add(formatField);
885         headerLines.add(new VCFFormatHeaderLine(ANOTHER_ATTRIBUTE_KEY, 1, VCFHeaderLineType.Integer, "Another value"));
886         headerLines.add(VCFStandardHeaderLines.getFormatLine("GT"));
887 
888         final File out = createTempFile(sampleName +"_", ".vcf");
889         try (final VariantContextWriter writer = GATKVariantContextUtils.createVCFWriter(out.toPath(), dict, false,
890                                                                                          Options.INDEX_ON_THE_FLY)) {
891             final VCFHeader vcfHeader = new VCFHeader(headerLines, Collections.singleton(sampleName));
892             vcfHeader.setSequenceDictionary(dict);
893             writer.writeHeader(vcfHeader);
894             final Allele Aref = Allele.create("A", true);
895             final Allele C = Allele.create("C");
896             final List<Allele> alleles = Arrays.asList(Aref, C);
897             final VariantContext variant = new VariantContextBuilder("invented", contig, INTERVAL.get(0).getStart(), INTERVAL.get(0).getStart(), alleles)
898                     .genotypes(new GenotypeBuilder(sampleName, alleles).attribute(SAMPLE_NAME_KEY, sampleName)
899                                        .attribute(ANOTHER_ATTRIBUTE_KEY, 10).make())
900                     .make();
901             writer.add(variant);
902             return out;
903         }
904     }
905 
906     @Test(expectedExceptions = CommandLineException.class)
testCantSpecifyVCFAndSampleNameFile()907     public void testCantSpecifyVCFAndSampleNameFile(){
908         final ArgumentsBuilder args = new ArgumentsBuilder()
909                 .add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, createInOrderSampleMap().getAbsolutePath())
910                 .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, HG_00096)
911                 .add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, createTempDir("workspace").getAbsolutePath())
912                 .addInterval(INTERVAL.get(0));
913         runCommandLine(args);
914     }
915 
916     @Test(expectedExceptions = CommandLineException.MissingArgument.class)
testRequireOneOfVCFOrSampleNameFile()917     public void testRequireOneOfVCFOrSampleNameFile(){
918         final ArgumentsBuilder args = new ArgumentsBuilder()
919                 .add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, createTempDir("workspace").getAbsolutePath())
920                 .add("L", "1:1-10");
921 
922         runCommandLine(args);
923     }
924 
925     @Test
testGenomicsDBImportWithoutDBField()926     public void testGenomicsDBImportWithoutDBField() throws IOException {
927         //Test for https://github.com/broadinstitute/gatk/issues/3736
928         final List<String> vcfInputs = Arrays.asList(NA_24385);
929         final String workspace = createTempDir("genomicsdb-tests").getAbsolutePath() + "/workspace";
930 	writeToGenomicsDB(vcfInputs, INTERVAL_3736, workspace, 0, false, 0, 1);
931     }
932 
933     @Test
testLongWorkspacePath()934     public void testLongWorkspacePath() throws IOException {
935         //Test for https://github.com/broadinstitute/gatk/issues/4160
936         final List<String> vcfInputs = LOCAL_GVCFS;
937         final String workspace = createTempDir("long_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_genomicsdb").getAbsolutePath() + "/should_not_fail_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
938         writeToGenomicsDB(vcfInputs, INTERVAL, workspace, 0, false, 0, 1);
939     }
940 
941     @Test
testCommandIncludedInOutputHeader()942     public void testCommandIncludedInOutputHeader() throws IOException {
943         final List<String> vcfInputs = LOCAL_GVCFS;
944         final String workspace = createTempDir("genomicsdb-tests").getAbsolutePath() + "/workspace";
945 
946         writeToGenomicsDB(vcfInputs, INTERVAL, workspace, 0, false, 0, 1);
947         try(final FeatureReader<VariantContext> genomicsDBFeatureReader =
948                     getGenomicsDBFeatureReader(workspace, b38_reference_20_21))
949         {
950             final VCFHeader header = (VCFHeader) genomicsDBFeatureReader.getHeader();
951             final Optional<VCFHeaderLine> commandLineHeaderLine = header.getMetaDataInSortedOrder().stream()
952                     .filter(line -> line.getValue().contains(GenomicsDBImport.class.getSimpleName()))
953                     .findAny();
954 
955             Assert.assertTrue(commandLineHeaderLine.isPresent(), "no headerline was present containing information about the GenomicsDBImport command");
956         }
957 
958 
959     }
960 
961     @Test
testPreserveContigOrderingInHeader()962     public void testPreserveContigOrderingInHeader() throws IOException {
963         final String workspace = createTempDir("testPreserveContigOrderingInHeader-").getAbsolutePath() + "/workspace";
964         ArrayList<SimpleInterval> intervals = new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("chr20", 17959479, 17959479)));
965         writeToGenomicsDB(Arrays.asList(GENOMICSDB_TEST_DIR + "testHeaderContigLineSorting1.g.vcf",
966                 GENOMICSDB_TEST_DIR + "testHeaderContigLineSorting2.g.vcf"), intervals, workspace, 0, false, 0, 1);
967 
968         try ( final FeatureReader<VariantContext> genomicsDBFeatureReader =
969                       getGenomicsDBFeatureReader(workspace, b38_reference_20_21);
970 
971              final AbstractFeatureReader<VariantContext, LineIterator> inputGVCFReader =
972                       AbstractFeatureReader.getFeatureReader(GENOMICSDB_TEST_DIR + "testHeaderContigLineSorting1.g.vcf", new VCFCodec(), true);
973         ) {
974             final SAMSequenceDictionary dictionaryFromGenomicsDB = ((VCFHeader)genomicsDBFeatureReader.getHeader()).getSequenceDictionary();
975             final SAMSequenceDictionary dictionaryFromInputGVCF =  ((VCFHeader)inputGVCFReader.getHeader()).getSequenceDictionary();
976 
977             Assert.assertEquals(dictionaryFromGenomicsDB, dictionaryFromInputGVCF, "Sequence dictionary from GenomicsDB does not match original sequence dictionary from input GVCF");
978         }
979 
980     }
getGenomicsDBFeatureReader( final String workspace, final String reference, final boolean produceGTField)981     private static FeatureReader<VariantContext> getGenomicsDBFeatureReader(
982             final String workspace, final String reference,
983             final boolean produceGTField) throws IOException {
984         return getGenomicsDBFeatureReader(workspace, reference,
985                 produceGTField, false);
986     }
987 
getGenomicsDBFeatureReader( final String workspace, final String reference, final boolean produceGTField, final boolean sitesOnlyQuery)988     private static FeatureReader<VariantContext> getGenomicsDBFeatureReader(
989             final String workspace, final String reference,
990             final boolean produceGTField,
991             final boolean sitesOnlyQuery) throws IOException {
992         return getGenomicsDBFeatureReader(workspace, reference,
993                 produceGTField, sitesOnlyQuery, false);
994     }
995 
getGenomicsDBFeatureReader( final String workspace, final String reference, final boolean produceGTField, final boolean sitesOnlyQuery, final boolean useVCFCodec)996     private static FeatureReader<VariantContext> getGenomicsDBFeatureReader(
997             final String workspace, final String reference,
998             final boolean produceGTField,
999             final boolean sitesOnlyQuery,
1000             final boolean useVCFCodec) throws IOException {
1001        String workspaceAbsPath = BucketUtils.makeFilePathAbsolute(workspace);
1002        GenomicsDBExportConfiguration.ExportConfiguration.Builder exportConfigurationBuilder = GenomicsDBExportConfiguration.ExportConfiguration.newBuilder()
1003                 .setWorkspace(workspace)
1004                 .setReferenceGenome(reference)
1005                 .setVidMappingFile(IOUtils.appendPathToDir(workspaceAbsPath, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME))
1006                 .setCallsetMappingFile(IOUtils.appendPathToDir(workspaceAbsPath, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME))
1007                 .setVcfHeaderFilename(IOUtils.appendPathToDir(workspaceAbsPath, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME))
1008                 .setProduceGTField(produceGTField)
1009                 .setSitesOnlyQuery(sitesOnlyQuery)
1010                .setGenerateArrayNameFromPartitionBounds(true);
1011         GenomicsDBVidMapProto.VidMappingPB vidMapPB = null;
1012         try {
1013             vidMapPB = org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.getProtobufVidMappingFromJsonFile(IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME));
1014         }
1015         catch (final IOException e) {
1016             throw new UserException("Could not open vid json file "+GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME, e);
1017         }
1018         HashMap<String, Integer> fieldNameToIndexInVidFieldsList =
1019                 org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB);
1020 
1021         vidMapPB = org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
1022                 GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, "element_wise_sum");
1023 
1024         if(vidMapPB != null) {
1025             exportConfigurationBuilder.setVidMapping(vidMapPB);
1026         }
1027 
1028         if (useVCFCodec) {
1029             return new GenomicsDBFeatureReader<>(exportConfigurationBuilder.build(), new VCFCodec(), Optional.empty());
1030         } else {
1031             return new GenomicsDBFeatureReader<>(exportConfigurationBuilder.build(), new BCF2Codec(), Optional.empty());
1032         }
1033     }
1034 
getGenomicsDBFeatureReader( final String workspace, final String reference)1035     private static FeatureReader<VariantContext> getGenomicsDBFeatureReader(
1036             final String workspace, final String reference) throws IOException {
1037         return getGenomicsDBFeatureReader(workspace, reference, false);
1038     }
1039 
1040     @Test(expectedExceptions = GenomicsDBImport.UnableToCreateGenomicsDBWorkspace.class)
testYouCantWriteIntoAnExistingDirectory()1041     public void testYouCantWriteIntoAnExistingDirectory(){
1042         // this actually creates the directory on disk, not just the file name.
1043         final String workspace = createTempDir("workspace").getAbsolutePath();
1044         writeToGenomicsDB(LOCAL_GVCFS, INTERVAL, workspace, 0, false, 0, 1);
1045     }
1046 
1047     @Test(expectedExceptions = CommandLineException.class)
testOverwriteWorkspaceAndIncrementalImportCannotBothBeTrue()1048     public void testOverwriteWorkspaceAndIncrementalImportCannotBothBeTrue() {
1049         final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace";
1050         writeToGenomicsDB(LOCAL_GVCFS, INTERVAL, workspace, 0, false, 0, 1, false, true, true);
1051     }
1052 
1053     @Test(expectedExceptions = UserException.class)
testIncrementalMustHaveExistingWorkspace()1054     public void testIncrementalMustHaveExistingWorkspace() {
1055         final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath();
1056         writeToGenomicsDB(LOCAL_GVCFS, INTERVAL, workspace + "workspace2", 0, false, 0, 1, false, false, true);
1057     }
1058 
testIncrementalImport(final int stepSize, final List<SimpleInterval> intervals, final String workspace, final int batchSize, final boolean produceGTField, final boolean useVCFCodec, final String expected, final int chrsToPartitions, final boolean useNativeReader)1059     private void testIncrementalImport(final int stepSize, final List<SimpleInterval> intervals, final String workspace,
1060                                        final int batchSize, final boolean produceGTField, final boolean useVCFCodec, final String expected,
1061                                        final int chrsToPartitions, final boolean useNativeReader) throws IOException {
1062         for(int i=0; i<LOCAL_GVCFS.size(); i+=stepSize) {
1063             int upper = Math.min(i+stepSize, LOCAL_GVCFS.size());
1064             writeToGenomicsDB(LOCAL_GVCFS.subList(i, upper), intervals, workspace, batchSize, false, 0, 1, false, false, i!=0,
1065                               chrsToPartitions, i!=0 && useNativeReader);
1066             checkJSONFilesAreWritten(workspace);
1067         }
1068         for(SimpleInterval currInterval : intervals) {
1069             List<SimpleInterval> tmpList = new ArrayList<SimpleInterval>(Arrays.asList(currInterval));
1070             String expectedVcf = expected;
1071             if (expected.isEmpty()) {
1072                 File expectedCombinedVCF = runCombineGVCFs(LOCAL_GVCFS, tmpList, b38_reference_20_21, new String[0]);
1073                 expectedVcf = expectedCombinedVCF.getAbsolutePath();
1074             }
1075             checkGenomicsDBAgainstExpected(workspace, tmpList, expectedVcf, b38_reference_20_21, true,
1076                                            ATTRIBUTES_TO_IGNORE, produceGTField, false);
1077             if (useVCFCodec) {
1078                 checkGenomicsDBAgainstExpected(workspace, tmpList, expectedVcf, b38_reference_20_21, true,
1079                                                ATTRIBUTES_TO_IGNORE, produceGTField, false, true);
1080             }
1081         }
1082     }
1083 
1084     @Test
testGenomicsDBBasicIncremental()1085     public void testGenomicsDBBasicIncremental() throws IOException {
1086         final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace";
1087         testIncrementalImport(2, INTERVAL, workspace, 0, true, true, COMBINED_WITH_GENOTYPES, 0, false);
1088         createAndCheckIntervalListFromExistingWorkspace(workspace, INTERVAL_PICARD_STYLE_EXPECTED);
1089     }
1090 
1091     @Test
testGenomicsDBIncrementalAndBatchSize1WithNonAdjacentIntervals()1092     public void testGenomicsDBIncrementalAndBatchSize1WithNonAdjacentIntervals() throws IOException {
1093         final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace";
1094         testIncrementalImport(2, MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, workspace, 1, false, true, "", 0, false);
1095         createAndCheckIntervalListFromExistingWorkspace(workspace, MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS_PICARD_STYLE_EXPECTED);
1096     }
1097 
1098     @Test
testGenomicsDBIncrementalAndBatchSize1WithNonAdjacentIntervalsMergeContigsIntoPartitions()1099     public void testGenomicsDBIncrementalAndBatchSize1WithNonAdjacentIntervalsMergeContigsIntoPartitions() throws IOException {
1100         final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace";
1101         testIncrementalImport(2, INTERVAL_20_21, workspace, 1, false, true, "", 1, false);
1102         createAndCheckIntervalListFromExistingWorkspace(workspace, MERGED_CONTIGS_INTERVAL_PICARD_STYLE_EXPECTED);
1103     }
1104 
1105     @Test
testGenomicsDBIncrementalAndBatchSize2()1106     public void testGenomicsDBIncrementalAndBatchSize2() throws IOException {
1107         final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace";
1108         testIncrementalImport(2, MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, workspace, 2, true, false,
1109                               COMBINED_WITH_GENOTYPES, 0, false);
1110     }
1111 
1112     @Test
testGenomicsDBMultipleIncrementalImports()1113     public void testGenomicsDBMultipleIncrementalImports() throws IOException {
1114         final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace";
1115         testIncrementalImport(1, MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, workspace, 2, true, true,
1116                               COMBINED_WITH_GENOTYPES, 0, false);
1117     }
1118 
1119     @Test
testGenomicsDBIncrementalWithManyNonAdjacentContigsToSeveralPartitions()1120     public void testGenomicsDBIncrementalWithManyNonAdjacentContigsToSeveralPartitions() throws IOException {
1121         List<SimpleInterval> manyContigs = MANY_CONTIGS_NON_ADJACENT_INTERVALS.stream().map(SimpleInterval::new).collect(Collectors.toList());
1122         final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace";
1123 
1124         writeToGenomicsDB(MANY_CONTIGS_VCF.subList(0, 1), manyContigs, workspace, 0, false, 0, 1, false, false, false, SEVERAL_CONTIGS, false);
1125         writeToGenomicsDB(MANY_CONTIGS_VCF.subList(1, 2), manyContigs, workspace, 0, false, 0, 1, false, false, true, SEVERAL_CONTIGS, false);
1126         checkJSONFilesAreWritten(workspace);
1127         checkGenomicsDBAgainstExpected(workspace, manyContigs, EXPECTED_SEVERAL_CONTIGS_VCF, MANY_CONTIGS_REF, true,
1128                 MANY_CONTIGS_ATTRIBUTES_TO_IGNORE, true, false);
1129 
1130         createAndCheckIntervalListFromExistingWorkspace(workspace, MANY_CONTIGS_INTERVAL_PICARD_STYLE_EXPECTED);
1131     }
1132 
createAndCheckIntervalListFromExistingWorkspace(final String workspace, final String expectedOutput)1133     private void createAndCheckIntervalListFromExistingWorkspace(final String workspace, final String expectedOutput) {
1134         final ArgumentsBuilder args = new ArgumentsBuilder();
1135         final String outputIntervalList = workspace + "interval_output";
1136         args.add(GenomicsDBImport.INCREMENTAL_WORKSPACE_ARG_LONG_NAME, workspace);
1137         args.add(GenomicsDBImport.INTERVAL_LIST_LONG_NAME, outputIntervalList);
1138 
1139         runCommandLine(args);
1140 
1141         final IntervalList generatedInterval = IntervalList.fromFile(new File(outputIntervalList));
1142         final IntervalList expectedInterval = IntervalList.fromFile(new File(expectedOutput));
1143         Assert.assertTrue(generatedInterval.sorted().equals(expectedInterval.sorted()));
1144     }
1145 
basicWriteAndQueryWithOptions(String workspace, Map<String, Object> options)1146     void basicWriteAndQueryWithOptions(String workspace, Map<String, Object> options) throws IOException {
1147         final ArgumentsBuilder args = new ArgumentsBuilder();
1148         args.add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, workspace);
1149         INTERVAL.forEach(args::addInterval);
1150         LOCAL_GVCFS.forEach(vcf -> args.add("V", vcf));
1151         for ( String key : options.keySet()) {
1152             if (key.equals(GenomicsDBImport.SHARED_POSIXFS_OPTIMIZATIONS)) {
1153                 Assert.assertTrue(options.get(key) instanceof Boolean);
1154                 args.add(GenomicsDBImport.SHARED_POSIXFS_OPTIMIZATIONS, (Boolean)options.get(key));
1155             }
1156             if (key.equals(GenomicsDBImport.OVERWRITE_WORKSPACE_LONG_NAME)) {
1157                 Assert.assertTrue(options.get(key) instanceof Boolean);
1158                 args.add(GenomicsDBImport.OVERWRITE_WORKSPACE_LONG_NAME, (Boolean)options.get(key));
1159             }
1160         }
1161         runCommandLine(args);
1162         checkJSONFilesAreWritten(workspace);
1163         checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE);
1164     }
1165 
1166     @Test
testWithMiscOptions()1167     public void testWithMiscOptions() throws IOException {
1168         final String workspace = createTempDir("genomicsdb-misc-options").getAbsolutePath() + "/workspace";
1169         IOUtils.deleteOnExit(IOUtils.getPath(workspace));
1170         Map<String, Object> options = new HashMap<String, Object>();
1171 
1172         // Test with shared posixfs optimizations set
1173         options.put(GenomicsDBImport.SHARED_POSIXFS_OPTIMIZATIONS, true);
1174         basicWriteAndQueryWithOptions(workspace, options);
1175 
1176         // Test with shared posixfs optimizations and overwrite workspace set
1177         options.put(GenomicsDBImport.OVERWRITE_WORKSPACE_LONG_NAME, true);
1178         basicWriteAndQueryWithOptions(workspace, options);
1179     }
1180 
1181     @Test(expectedExceptions = GenomicsDBImport.UnableToCreateGenomicsDBWorkspace.class)
testWithMiscOptionsNoOverwrite()1182     public void testWithMiscOptionsNoOverwrite() throws IOException {
1183         final String workspace = createTempDir("genomicsdb-misc-options-nooverwrite").getAbsolutePath() + "/workspace";
1184         IOUtils.deleteOnExit(IOUtils.getPath(workspace));
1185         Map<String, Object> options = new HashMap<String, Object>();
1186         basicWriteAndQueryWithOptions(workspace, options);
1187 
1188         // Test with overwrite workspace set to false - should throw an exception - GenomicsDBImport.UnableToCreateGenomicsDBWorkspace
1189         options.replace(GenomicsDBImport.OVERWRITE_WORKSPACE_LONG_NAME, false);
1190         basicWriteAndQueryWithOptions(workspace, options);
1191     }
1192 
1193     @Test
testQueryWithComputationsExceeding32BitsDefault()1194     public void testQueryWithComputationsExceeding32BitsDefault() throws IOException {
1195         final String folder = createTempDir("computations_exceed_32bits").getAbsolutePath();
1196         IOUtils.extractTarGz(Paths.get(TEST_INT64_SUPPORT_GENOMICSDB_BUNDLE), Paths.get(folder));
1197         IOUtils.deleteOnExit(IOUtils.getPath(folder));
1198         final String workspace = folder + "/bigint_genomicsdb_ws";
1199         checkGenomicsDBAgainstExpected(workspace, new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("1"))), folder+"/expected_combined_bigint.vcf",
1200                 folder+"/reference/chr1_10MB.fasta.gz", true, ATTRIBUTES_TO_IGNORE, false, false, true);
1201     }
1202 
1203     // The following test should fail with a Throwable because of limitations in BCF2Codec - see https://github.com/broadinstitute/gatk/issues/6548
1204     @Test(expectedExceptions = Throwable.class)
testQueryWithComputationsExceeding32BitsBCFCodec()1205     public void testQueryWithComputationsExceeding32BitsBCFCodec() throws IOException {
1206         final String folder = createTempDir("computations_exceed_32bits_bcf2codec").getAbsolutePath() + "/testQueryWithComputationsExceed32Bits";
1207         IOUtils.extractTarGz(Paths.get(TEST_INT64_SUPPORT_GENOMICSDB_BUNDLE), Paths.get(folder));
1208         IOUtils.deleteOnExit(IOUtils.getPath(folder));
1209         final String workspace = folder + "/bigint_genomicsdb_ws";
1210         checkGenomicsDBAgainstExpected(workspace, new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("1"))), folder+"/expected_combined_bigint.vcf",
1211                 folder+"/reference/chr1_10MB.fasta.gz", true, ATTRIBUTES_TO_IGNORE, false, false, false);
1212     }
1213 
1214     @Test(groups = {"bucket"})
testWriteToAndQueryFromGCS()1215     public void testWriteToAndQueryFromGCS() throws IOException {
1216         final String workspace = BucketUtils.randomRemotePath(getGCPTestStaging(), "", "") + "/";
1217         IOUtils.deleteOnExit(IOUtils.getPath(workspace));
1218         writeToGenomicsDB(LOCAL_GVCFS, INTERVAL, workspace, 0, false, 0, 1);
1219         checkJSONFilesAreWritten(workspace);
1220         checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE);
1221     }
1222 
1223     @Test(groups = {"bucket"}, expectedExceptions = GenomicsDBImport.UnableToCreateGenomicsDBWorkspace.class)
testWriteToExistingGCSDirectory()1224     public void testWriteToExistingGCSDirectory() throws IOException {
1225         final String workspace = BucketUtils.randomRemotePath(getGCPTestStaging(), "", "") + "/";
1226         IOUtils.deleteOnExit(IOUtils.getPath(workspace));
1227         int rc = GenomicsDBUtils.createTileDBWorkspace(workspace, false);
1228         Assert.assertEquals(rc, 0);
1229         writeToGenomicsDB(LOCAL_GVCFS, INTERVAL, workspace, 0, false, 0, 1);
1230     }
1231 }
1232