1 package org.broadinstitute.hellbender.tools.genomicsdb; 2 3 import htsjdk.samtools.SAMSequenceDictionary; 4 import htsjdk.samtools.SAMSequenceRecord; 5 import htsjdk.samtools.util.IntervalList; 6 import htsjdk.tribble.AbstractFeatureReader; 7 import htsjdk.tribble.CloseableTribbleIterator; 8 import htsjdk.tribble.FeatureReader; 9 import htsjdk.tribble.readers.LineIterator; 10 import htsjdk.variant.bcf2.BCF2Codec; 11 import htsjdk.variant.variantcontext.Allele; 12 import htsjdk.variant.variantcontext.Genotype; 13 import htsjdk.variant.variantcontext.GenotypeBuilder; 14 import htsjdk.variant.variantcontext.VariantContext; 15 import htsjdk.variant.variantcontext.VariantContextBuilder; 16 import htsjdk.variant.variantcontext.writer.Options; 17 import htsjdk.variant.variantcontext.writer.VariantContextWriter; 18 import htsjdk.variant.vcf.VCFCodec; 19 import htsjdk.variant.vcf.VCFFormatHeaderLine; 20 import htsjdk.variant.vcf.VCFHeader; 21 import htsjdk.variant.vcf.VCFHeaderLine; 22 import htsjdk.variant.vcf.VCFHeaderLineType; 23 import htsjdk.variant.vcf.VCFStandardHeaderLines; 24 import java.io.File; 25 import java.io.IOException; 26 import java.nio.file.Files; 27 import java.nio.file.Paths; 28 import java.nio.file.StandardCopyOption; 29 import java.util.ArrayList; 30 import java.util.Arrays; 31 import java.util.Collections; 32 import java.util.HashMap; 33 import java.util.HashSet; 34 import java.util.Iterator; 35 import java.util.LinkedHashMap; 36 import java.util.LinkedList; 37 import java.util.List; 38 import java.util.Map; 39 import java.util.Optional; 40 import java.util.Set; 41 import java.util.stream.Collectors; 42 import org.broadinstitute.barclay.argparser.CommandLineException; 43 import org.broadinstitute.hellbender.CommandLineProgramTest; 44 import org.broadinstitute.hellbender.Main; 45 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; 46 import org.broadinstitute.hellbender.exceptions.UserException; 47 import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; 48 import org.broadinstitute.hellbender.testutils.BaseTest; 49 import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; 50 import org.broadinstitute.hellbender.utils.SimpleInterval; 51 import org.broadinstitute.hellbender.utils.Utils; 52 import org.broadinstitute.hellbender.utils.gcs.BucketUtils; 53 import org.broadinstitute.hellbender.utils.io.IOUtils; 54 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; 55 import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils; 56 import org.broadinstitute.hellbender.utils.variant.VariantContextGetters; 57 import org.genomicsdb.GenomicsDBUtils; 58 import org.genomicsdb.model.GenomicsDBExportConfiguration; 59 import org.genomicsdb.model.GenomicsDBVidMapProto; 60 import org.genomicsdb.reader.GenomicsDBFeatureReader; 61 import org.testng.Assert; 62 import org.testng.annotations.DataProvider; 63 import org.testng.annotations.Test; 64 65 @Test(groups = {"variantcalling"}) 66 public final class GenomicsDBImportIntegrationTest extends CommandLineProgramTest { 67 private static final String HG_00096 = largeFileTestDir + "gvcfs/HG00096.g.vcf.gz"; 68 private static final String HG_00268 = largeFileTestDir + "gvcfs/HG00268.g.vcf.gz"; 69 private static final String NA_19625 = largeFileTestDir + "gvcfs/NA19625.g.vcf.gz"; 70 //The following 3 files were obtained by running CombineGVCFs on the above 3 files (separately). This introduces spanning 71 //deletions in the files. Hence, these files can be used to test for spanning deletions in the input VCF. 72 private static final String HG_00096_after_combine_gvcfs = largeFileTestDir + "gvcfs/HG00096_after_combine_gvcfs.g.vcf.gz"; 73 private static final String HG_00268_after_combine_gvcfs = largeFileTestDir + "gvcfs/HG00268_after_combine_gvcfs.g.vcf.gz"; 74 private static final String NA_19625_after_combine_gvcfs = largeFileTestDir + "gvcfs/NA19625_after_combine_gvcfs.g.vcf.gz"; 75 private static final String NA_24385 = largeFileTestDir + "NA24385.vcf.gz"; 76 private static final String NA_12878_PHASED = largeFileTestDir + "NA12878.phasedData.Chr20.vcf"; //NOTE: this is not phased according to the vcf spec but it reflects phasing currently produced by haplotype caller 77 private static final String MULTIPLOID_DATA_HG37 = largeFileTestDir + "gvcfs/HapMap5plex.ploidy10.b37.g.vcf"; 78 private static final String NA12878_HG37 = toolsTestDir + "GenomicsDBImport/expected.testGVCFMode.gatk4.g.vcf"; 79 //This file was generated by running CombineGVCFs on the input files 80 //./gatk CombineGVCFs -V src/test/resources/org/broadinstitute/hellbender/tools/GenomicsDBImport/expected.testGVCFMode.gatk4.g.vcf -V src/test/resources/large/gvcfs/HapMap5plex.ploidy10.b37.g.vcf -R src/test/resources/large/human_g1k_v37.20.21.fasta -L 20:10000000-10100000 -O src/test/resources/org/broadinstitute/hellbender/tools/GenomicsDBImport/expected.testGenomicsDBImportWithNonDiploidData.vcf 81 private static final String MULTIPLOID_EXPECTED_RESULT = toolsTestDir + "GenomicsDBImport/expected.testGenomicsDBImportWithNonDiploidData.vcf"; 82 private static final String MNP_GVCF = toolsTestDir + "GenomicsDBImport/mnp.input.g.vcf"; 83 private static final String ARTIFICIAL_PHASED = getTestDataDir() + "/ArtificalPhasedData.1.g.vcf"; 84 private static final String HG_00268_WITH_SPACES = largeFileTestDir + "gvcfs/HG00268.spaceInSampleName.g.vcf"; 85 private static final List<String> LOCAL_GVCFS = Arrays.asList(HG_00096, HG_00268, NA_19625); 86 private static final List<String> LOCAL_GVCFS_AFTER_COMBINE_GVCFS = Arrays.asList(HG_00096_after_combine_gvcfs, 87 HG_00268_after_combine_gvcfs, 88 NA_19625_after_combine_gvcfs); 89 private static final String GENOMICSDB_TEST_DIR = toolsTestDir + "GenomicsDBImport/"; 90 private static final String COMBINEGVCFS_TEST_DIR = toolsTestDir + "walkers/CombineGVCFs/"; 91 private static final String COMBINED = largeFileTestDir + "gvcfs/combined.gatk3.7.g.vcf.gz"; 92 private static final String COMBINED_WITH_GENOTYPES = largeFileTestDir + "gvcfs/combined_with_genotypes.g.vcf.gz"; 93 //This file was obtained from combined.gatk3.7.g.vcf.gz by dropping all the samples 94 private static final String COMBINED_SITES_ONLY = largeFileTestDir + "gvcfs/combined.gatk3.7_sites_only.g.vcf.gz"; 95 private static final String INTERVAL_PICARD_STYLE_EXPECTED = toolsTestDir + "GenomicsDBImport/interval_expected.interval_list"; 96 private static final String MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS_PICARD_STYLE_EXPECTED = 97 toolsTestDir + "GenomicsDBImport/multiple_non_adjacent_intervals_combine_gvcfs_expected.interval_list"; 98 private static final String MERGED_CONTIGS_INTERVAL_PICARD_STYLE_EXPECTED = 99 toolsTestDir + "GenomicsDBImport/chr20_chr21_merged_contigs_expected.interval_list"; 100 private static final String TEST_INT64_SUPPORT_GENOMICSDB_BUNDLE = GENOMICSDB_TEST_DIR + "/int64_test.tar.gz"; 101 //Consider a gVCF with a REF block chr20:50-150. Importing this data into GenomicsDB using multiple intervals 102 //-L chr20:1-100 and -L chr20:101-200 will cause the REF block to be imported into both the arrays 103 //Now, when reading data from the workspace (assume full scan) - the data is split into 2 REF block intervals chr20:50-100 104 //and chr20:101-150 one from each array 105 //The following COMBINED_MULTI_INTERVAL gvcf is identical to the gVCF in the previous line except at the partition break 106 //position 107 //The previous file has the following line: 108 //chr20 17970000 . G <NON_REF> . . END=17970001 109 // 110 //while this file has: 111 //chr20 17970000 . G <NON_REF> . . . 112 //chr20 17970001 . G <NON_REF> . . . 113 // 114 private static final String COMBINED_MULTI_INTERVAL = largeFileTestDir + "gvcfs/combined_multi_interval.gatk3.7.g.vcf.gz"; 115 private static final String COMBINED_WITHSPACES = largeFileTestDir + "gvcfs/combined.gatk3.7.smaller_interval.g.vcf"; 116 private static final ArrayList<SimpleInterval> INTERVAL = 117 new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("chr20", 17960187, 17981445))); 118 private static final ArrayList<SimpleInterval> INTERVAL_NOTFULL = 119 new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("chr20", 1, 17960187))); 120 private static final ArrayList<SimpleInterval> INTERVAL_20_21 = 121 new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("chr20"),new SimpleInterval("chr21"))); 122 private static final ArrayList<SimpleInterval> MULTIPLE_INTERVALS = new ArrayList<SimpleInterval>(Arrays.asList( 123 new SimpleInterval("chr20", 17960187, 17970000), 124 new SimpleInterval("chr20", 17970001, 17980000), 125 new SimpleInterval("chr20", 17980001, 17981445) 126 )); 127 private static final ArrayList<SimpleInterval> MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS = 128 new ArrayList<SimpleInterval>(Arrays.asList( 129 new SimpleInterval("chr20", 17960187, 17969999), 130 new SimpleInterval("chr20", 17970000, 17980000), 131 new SimpleInterval("chr20", 17980001, 17981445) 132 )); 133 private static final ArrayList<SimpleInterval> MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS = 134 new ArrayList<SimpleInterval>(Arrays.asList( 135 new SimpleInterval("chr20", 17960187, 17969999), 136 new SimpleInterval("chr20", 17980001, 17981445), 137 new SimpleInterval("chr21", 29477554, 29486255) 138 )); 139 private static final ArrayList<SimpleInterval> INTERVAL_3736 = 140 new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("chr6",130365070,146544250))); 141 private static final ArrayList<SimpleInterval> INTERVAL_NONDIPLOID = 142 new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("20", 10000000, 10100000))); 143 private static final ArrayList<SimpleInterval> SMALLER_INTERVAL = 144 new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("chr20", 17960187, 17961973))); 145 private static final VCFHeader VCF_HEADER = VariantContextTestUtils.getCompleteHeader(); 146 private static final String SAMPLE_NAME_KEY = "SN"; 147 private static final String ANOTHER_ATTRIBUTE_KEY = "AA"; 148 149 private static final List<String> GVCFS_WITH_NEW_MQ = Arrays.asList(NA12878_HG37, getTestDataDir() + "/walkers/CombineGVCFs/YRIoffspring.chr20snippet.g.vcf"); 150 private static final String COMBINED_WITH_NEW_MQ = toolsTestDir + "/walkers/GenomicsDBImport/newMQcalc.combined.g.vcf"; 151 private static final List<SimpleInterval> INTERVAL2 = Arrays.asList(new SimpleInterval("20", 1, 11_000_000)); 152 private static final List<String> ATTRIBUTES_TO_IGNORE = Arrays.asList("RAW_MQ","RAW_MQandDP"); //CombineGVCFs doesn't support the old RAW_MQ anymore 153 // we're using vcfs instead of gvcfs for many contigs test, and these attributes don't have default combine operations in GenomicsDB 154 private static final List<String> MANY_CONTIGS_ATTRIBUTES_TO_IGNORE = Arrays.asList("HaplotypeScore","MLEAC", "MLEAF"); 155 private static final String P717 = largeFileTestDir + "Ptrichocarpa.v3.sorted.p717.vcf"; 156 private static final String P717_2 = largeFileTestDir + "Ptrichocarpa.v3.sorted.p717_2.vcf"; 157 private static final List<String> MANY_CONTIGS_VCF = Arrays.asList(P717, P717_2); 158 private static final String EXPECTED_SEVERAL_CONTIGS_VCF = largeFileTestDir + "Ptrichocarpa.v3.p717.p717_2.combined.final.expected.vcf"; 159 private static final String MANY_CONTIGS_REF = largeFileTestDir + "Populus_trichocarpa.Pop_tri_v3.dna.nonchromosomal_subset_renamed.fa"; 160 // scaffold_3123 has been removed to test non adjacent interval list works (after scaffold_3381 in header) 161 private static final List<String> MANY_CONTIGS_NON_ADJACENT_INTERVALS = Arrays.asList("scaffold_3121", "scaffold_3427", "scaffold_3213", "scaffold_3050", "scaffold_3381", 162 "scaffold_3472", "scaffold_2907", "scaffold_3046", "scaffold_3412", "scaffold_3304", "scaffold_3332", "scaffold_3326", "scaffold_3230", 163 "scaffold_3160", "scaffold_3403", "scaffold_2851", "scaffold_3416", "scaffold_3340", "scaffold_2911", "scaffold_3442", "scaffold_3681", "scaffold_2889", 164 "scaffold_3305", "scaffold_3335", "scaffold_3316", "scaffold_3126", "scaffold_3363", "scaffold_2844", "scaffold_3388", "scaffold_3285", "scaffold_2968", 165 "scaffold_3074", "scaffold_3436", "scaffold_3289", "scaffold_3264", "scaffold_2919", "scaffold_3422", "scaffold_3393", "scaffold_3387", "scaffold_3453", 166 "scaffold_3171", "scaffold_3372", "scaffold_3389", "scaffold_3259", "scaffold_2930", "scaffold_3129", "scaffold_3044", "scaffold_3147", "scaffold_2885", 167 "scaffold_3452", "scaffold_3202", "scaffold_3263", "scaffold_3354", "scaffold_3134", "scaffold_3255", "scaffold_3320", "scaffold_3523", "scaffold_3432", 168 "scaffold_3239", "scaffold_3206", "scaffold_3437", "scaffold_2922", "scaffold_3136", "scaffold_3292", "scaffold_3391", "scaffold_3061", "scaffold_3250", 169 "scaffold_3226", "scaffold_2857", "scaffold_3528", "scaffold_3325", "scaffold_3296", "scaffold_3298", "scaffold_2924", "scaffold_3157", "scaffold_2855", 170 "scaffold_3275", "scaffold_3007", "scaffold_3306", "scaffold_3179", "scaffold_3060", "scaffold_3222", "scaffold_3648", "scaffold_3005", "scaffold_3020", 171 "scaffold_3194", "scaffold_3328", "scaffold_3251", "scaffold_3547", "scaffold_3342", "scaffold_3139", "scaffold_3262", "scaffold_3210", "scaffold_2981", 172 "scaffold_2933", "scaffold_3056", "scaffold_3413", "scaffold_3064", "scaffold_3353", "scaffold_2913", "scaffold_3445", "scaffold_3374", "scaffold_3214", 173 "scaffold_3423", "scaffold_3095", "scaffold_2965", "scaffold_3357", "scaffold_3021", "scaffold_3228", "scaffold_3300", "scaffold_3042", "scaffold_3312", 174 "scaffold_3537", "scaffold_3058", "scaffold_3425", "scaffold_3431", "scaffold_3368", "scaffold_2951", "scaffold_3356", "scaffold_3116", "scaffold_3257", 175 "scaffold_3478", "scaffold_3068", "scaffold_3008", "scaffold_2893", "scaffold_3088", "scaffold_3269", "scaffold_3245", "scaffold_3190", "scaffold_3054", 176 "scaffold_3383", "scaffold_3346", "scaffold_3223", "scaffold_3446", "scaffold_3370", "scaffold_3252", "scaffold_3053", "scaffold_3100", "scaffold_2838", 177 "scaffold_3272", "scaffold_3384", "scaffold_2868", "scaffold_3398", "scaffold_3107", "scaffold_3014", "scaffold_3364", "scaffold_2987", "scaffold_3191", 178 "scaffold_3076", "scaffold_3246", "scaffold_3011", "scaffold_3348", "scaffold_3231", "scaffold_3448", "scaffold_3360", "scaffold_3352", "scaffold_3294", 179 "scaffold_2853", "scaffold_3024", "scaffold_3426", "scaffold_3379", "scaffold_3440", "scaffold_3550", "scaffold_2879", "scaffold_3362", "scaffold_3236"); 180 private static final int SEVERAL_CONTIGS = 7; 181 private static final String MANY_CONTIGS_INTERVAL_PICARD_STYLE_EXPECTED = 182 toolsTestDir + "GenomicsDBImport/Ptrichocarpa.v3.expected.interval_list"; 183 184 @Override getTestedClassName()185 public String getTestedClassName() { 186 return GenomicsDBImport.class.getSimpleName(); 187 } 188 189 @DataProvider(name="batchSizes") batchSizes()190 public Object[][] batchSizes() { 191 return new Object[][] { 192 new Object[]{1}, 193 new Object[]{2}, 194 new Object[]{3}, 195 new Object[]{4}, 196 new Object[]{100}, 197 }; 198 } 199 200 @Test testGenomicsDBImportFileInputs()201 public void testGenomicsDBImportFileInputs() throws IOException { 202 testGenomicsDBImporter(LOCAL_GVCFS, INTERVAL, COMBINED, b38_reference_20_21, true, 1); 203 } 204 205 @Test testGenomicsDBImportFileInputs_newMQ()206 public void testGenomicsDBImportFileInputs_newMQ() throws IOException { 207 testGenomicsDBImporter_newMQ(GVCFS_WITH_NEW_MQ, INTERVAL2, COMBINED_WITH_NEW_MQ, b37_reference_20_21, true, Collections.emptyList()); 208 } 209 210 @Test testGenomicsDBImportFileInputsWithMultipleIntervals()211 public void testGenomicsDBImportFileInputsWithMultipleIntervals() throws IOException { 212 testGenomicsDBImporter(LOCAL_GVCFS, MULTIPLE_INTERVALS, COMBINED_MULTI_INTERVAL, b38_reference_20_21, true, 1); 213 } 214 215 @Test(timeOut = 1000000) testGenomicsDBImportWith1000IntervalsToBeMerged()216 public void testGenomicsDBImportWith1000IntervalsToBeMerged() throws IOException { 217 final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace"; 218 LinkedList<SimpleInterval> intervals = new LinkedList<SimpleInterval>(); 219 //[ 17960187, 17981445 ] 220 int base = 17960187; 221 for (int i = 0; i < 1000; ++i) 222 intervals.add(new SimpleInterval("chr20", base + 20 * i, base + 20 * i + 10)); //intervals of size 10 separated by 10 223 writeToGenomicsDB(new ArrayList<String>(Arrays.asList(LOCAL_GVCFS.get(0))), intervals, workspace, 0, 224 false, 0, 1, true); 225 } 226 227 @Test testGenomicsDBImportFileInputsAgainstCombineGVCF()228 public void testGenomicsDBImportFileInputsAgainstCombineGVCF() throws IOException { 229 testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, INTERVAL, b38_reference_20_21, new String[0]); 230 } 231 232 @Test testGenomicsDBImportFileInputsAgainstCombineGVCFMergeContigsToSinglePartition()233 public void testGenomicsDBImportFileInputsAgainstCombineGVCFMergeContigsToSinglePartition() throws IOException { 234 testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, INTERVAL_20_21, b38_reference_20_21, new String[0], 1, 1, false); 235 } 236 237 @Test testGenomicsDBImportMergeContigsManyNonAdjacentContigsToSeveralContigs()238 public void testGenomicsDBImportMergeContigsManyNonAdjacentContigsToSeveralContigs() throws IOException { 239 List<SimpleInterval> manyContigs = MANY_CONTIGS_NON_ADJACENT_INTERVALS.stream().map(SimpleInterval::new).collect(Collectors.toList()); 240 final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace"; 241 242 writeToGenomicsDB(MANY_CONTIGS_VCF, manyContigs, workspace, 0, false, 0, 1, false, false, false, SEVERAL_CONTIGS, false); 243 checkJSONFilesAreWritten(workspace); 244 checkGenomicsDBAgainstExpected(workspace, manyContigs, EXPECTED_SEVERAL_CONTIGS_VCF, MANY_CONTIGS_REF, true, 245 MANY_CONTIGS_ATTRIBUTES_TO_IGNORE, true, false); 246 } 247 248 @Test(expectedExceptions = {UserException.class}, expectedExceptionsMessageRegExp=".*entire contigs be specified.*") testGenomicsDBMergeContigsThrowsOnNotInputIntervalLessThanContigLength()249 public void testGenomicsDBMergeContigsThrowsOnNotInputIntervalLessThanContigLength() throws IOException { 250 testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, INTERVAL_NOTFULL, b38_reference_20_21, new String[0], 1, 1, false); 251 } 252 253 @Test testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleIntervals()254 public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleIntervals() throws IOException { 255 testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, b38_reference_20_21, new String[0]); 256 } 257 258 @Test testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleIntervalsWithMultipleThreads()259 public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleIntervalsWithMultipleThreads() throws IOException { 260 testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, b38_reference_20_21, 261 new String[0], 4); 262 } 263 264 @Test testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleNonAdjacentIntervals()265 public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleNonAdjacentIntervals() throws IOException { 266 testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, 267 b38_reference_20_21, new String[0]); 268 } 269 270 @Test testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleNonAdjacentIntervalsForFilesProducedAfterCombineGVCFs()271 public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleNonAdjacentIntervalsForFilesProducedAfterCombineGVCFs() 272 throws IOException { 273 //this test covers the scenario where the input vcfs have spanning deletions 274 testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS_AFTER_COMBINE_GVCFS, MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, 275 b38_reference_20_21, new String[0]); 276 } 277 278 @Test testGenomicsDBImportFileInputsAgainstCombineGVCFWithNonDiploidData()279 public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithNonDiploidData() throws IOException { 280 testGenomicsDBImporterWithGenotypes(Arrays.asList(NA12878_HG37, MULTIPLOID_DATA_HG37), INTERVAL_NONDIPLOID, 281 MULTIPLOID_EXPECTED_RESULT, b37_reference_20_21, 282 true, 283 false, 284 false); 285 } 286 287 @Test testGenomicsDBImportPhasedData()288 public void testGenomicsDBImportPhasedData() throws IOException { 289 testGenomicsDBImporterWithGenotypes(Arrays.asList(NA_12878_PHASED), INTERVAL, NA_12878_PHASED, b37_reference_20_21); 290 } 291 292 @Test testGenomicsDBImportPhasedDataWithMultipleIntervals()293 public void testGenomicsDBImportPhasedDataWithMultipleIntervals() throws IOException { 294 testGenomicsDBImporterWithGenotypes(Arrays.asList(NA_12878_PHASED), MULTIPLE_INTERVALS, NA_12878_PHASED, b37_reference_20_21); 295 } 296 297 @Test testGenomicsDBImportArtificialPhasedData()298 public void testGenomicsDBImportArtificialPhasedData() throws IOException { 299 ArrayList<SimpleInterval> intervals = new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("1", 10109, 10297))); 300 testGenomicsDBImporterWithGenotypes(Arrays.asList(ARTIFICIAL_PHASED), intervals, ARTIFICIAL_PHASED, b37_reference_20_21); 301 } 302 303 @Test testGenomicsDBThreeLargeSamplesWithGenotypes()304 public void testGenomicsDBThreeLargeSamplesWithGenotypes() throws IOException { 305 ArrayList<SimpleInterval> intervals = new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("chr20", 1, 64444167))); 306 testGenomicsDBImporterWithGenotypes(LOCAL_GVCFS, intervals, COMBINED_WITH_GENOTYPES, b38_reference_20_21, true, true, false); 307 } 308 309 @Test testGenomicsDBThreeLargeSamplesSitesOnlyQuery()310 public void testGenomicsDBThreeLargeSamplesSitesOnlyQuery() throws IOException { 311 ArrayList<SimpleInterval> intervals = new ArrayList<SimpleInterval>(Arrays.asList( 312 new SimpleInterval("chr20", 1, 64444167), 313 new SimpleInterval("chr21", 1, 46709983))); 314 testGenomicsDBImporterWithGenotypes(LOCAL_GVCFS, intervals, COMBINED_SITES_ONLY, b38_reference_20_21, true, true, true); 315 } 316 317 @Test(expectedExceptions={UserException.BadInput.class}, expectedExceptionsMessageRegExp=".*GenomicsDBImport does not support GVCFs.*") testGenomicsDbImportThrowsOnMnp()318 public void testGenomicsDbImportThrowsOnMnp() throws IOException { 319 for (int threads = 1; threads <= 2; ++threads) { 320 testGenomicsDBImporter( 321 Collections.singletonList(MNP_GVCF), 322 Collections.singletonList(new SimpleInterval("20", 69700, 69900)), 323 null, // Should never produce a VCF 324 b38_reference_20_21, 325 true, 326 threads 327 ); 328 } 329 } 330 testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile)331 private void testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, final List<SimpleInterval> intervals, 332 final String expectedCombinedVCF, 333 final String referenceFile) throws IOException { 334 testGenomicsDBImporterWithGenotypes(vcfInputs, intervals, 335 expectedCombinedVCF, referenceFile, 336 false, 337 true, 338 false); 339 } 340 testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile, final boolean testAll)341 private void testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, final List<SimpleInterval> intervals, 342 final String expectedCombinedVCF, final String referenceFile, 343 final boolean testAll) throws IOException { 344 testGenomicsDBImporterWithGenotypes(vcfInputs, intervals, 345 expectedCombinedVCF, referenceFile, 346 testAll, 347 false, 348 false); 349 } 350 testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile, final boolean testAll, final boolean produceGTField, final boolean sitesOnlyQuery)351 private void testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, final List<SimpleInterval> intervals, 352 final String expectedCombinedVCF, final String referenceFile, 353 final boolean testAll, 354 final boolean produceGTField, 355 final boolean sitesOnlyQuery) throws IOException { 356 testGenomicsDBImporterWithGenotypes(vcfInputs, intervals, expectedCombinedVCF, referenceFile, testAll, produceGTField, 357 sitesOnlyQuery, false); 358 } 359 testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile, final boolean testAll, final boolean produceGTField, final boolean sitesOnlyQuery, final boolean useNativeReader)360 private void testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, final List<SimpleInterval> intervals, 361 final String expectedCombinedVCF, final String referenceFile, 362 final boolean testAll, 363 final boolean produceGTField, 364 final boolean sitesOnlyQuery, 365 final boolean useNativeReader) throws IOException { 366 final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace"; 367 368 writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, 1, false, false, false, 0, useNativeReader); 369 checkJSONFilesAreWritten(workspace); 370 checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, referenceFile, testAll, ATTRIBUTES_TO_IGNORE, produceGTField, sitesOnlyQuery); 371 checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, referenceFile, testAll, ATTRIBUTES_TO_IGNORE, produceGTField, sitesOnlyQuery, true); 372 } 373 runCombineGVCFs(final List<String> inputs, final List<SimpleInterval> intervals, final String reference, final String[] extraArgs)374 private File runCombineGVCFs(final List<String> inputs, final List<SimpleInterval> intervals, final String reference, final String[] extraArgs) { 375 final File output = createTempFile("genotypegvcf", ".vcf"); 376 377 final ArgumentsBuilder args = new ArgumentsBuilder(); 378 args.addReference(new File(reference)) 379 .addOutput(output); 380 for (String input: inputs) { 381 args.add("V", input); 382 } 383 intervals.forEach(args::addInterval); 384 Arrays.stream(extraArgs).forEach(args::addRaw); 385 386 Utils.resetRandomGenerator(); 387 new Main().instanceMain(makeCommandLineArgs(args.getArgsList(), "CombineGVCFs")); 388 return output; 389 } 390 testGenomicsDBAgainstCombineGVCFs(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String referenceFile, final String[] CombineGVCFArgs)391 private void testGenomicsDBAgainstCombineGVCFs(final List<String> vcfInputs, final List<SimpleInterval> intervals, 392 final String referenceFile, final String[] CombineGVCFArgs) throws IOException { 393 testGenomicsDBAgainstCombineGVCFs(vcfInputs, intervals, referenceFile, CombineGVCFArgs, 1); 394 } 395 testGenomicsDBAgainstCombineGVCFs(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String referenceFile, final String[] CombineGVCFArgs, final int numVCFReaderThreadsInImporter)396 private void testGenomicsDBAgainstCombineGVCFs(final List<String> vcfInputs, final List<SimpleInterval> intervals, 397 final String referenceFile, final String[] CombineGVCFArgs, 398 final int numVCFReaderThreadsInImporter) throws IOException { 399 testGenomicsDBAgainstCombineGVCFs(vcfInputs, intervals, referenceFile, CombineGVCFArgs, numVCFReaderThreadsInImporter, 0, false); 400 } 401 testGenomicsDBAgainstCombineGVCFs(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String referenceFile, final String[] CombineGVCFArgs, final int numVCFReaderThreadsInImporter, final int chrsToPartitions, final boolean useNativeReader)402 private void testGenomicsDBAgainstCombineGVCFs(final List<String> vcfInputs, final List<SimpleInterval> intervals, 403 final String referenceFile, final String[] CombineGVCFArgs, 404 final int numVCFReaderThreadsInImporter, final int chrsToPartitions, 405 final boolean useNativeReader) throws IOException { 406 final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace"; 407 408 writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, numVCFReaderThreadsInImporter, false, false, false, 409 chrsToPartitions, useNativeReader); 410 checkJSONFilesAreWritten(workspace); 411 for(SimpleInterval currInterval : intervals) { 412 List<SimpleInterval> tmpList = new ArrayList<SimpleInterval>(Arrays.asList(currInterval)); 413 File expectedCombinedVCF = runCombineGVCFs(vcfInputs, tmpList, referenceFile, CombineGVCFArgs); 414 checkGenomicsDBAgainstExpected(workspace, tmpList, expectedCombinedVCF.getAbsolutePath(), referenceFile, true, ATTRIBUTES_TO_IGNORE); 415 } 416 } 417 418 @Test(groups = {"bucket"}) testGenomicsDBImportGCSInputs()419 public void testGenomicsDBImportGCSInputs() throws IOException { 420 testGenomicsDBImporter(resolveLargeFilesAsCloudURIs(LOCAL_GVCFS), INTERVAL, COMBINED, b38_reference_20_21, true, 1); 421 } 422 423 @Test testGenomicsDBAbsolutePathDependency()424 public void testGenomicsDBAbsolutePathDependency() throws IOException { 425 final File workspace = createTempDir("genomicsdb-tests-"); 426 final File workspace2 = createTempDir("genomicsdb-secondary-tests-"); 427 428 writeToGenomicsDB(LOCAL_GVCFS, INTERVAL, workspace.getAbsolutePath() + "/workspace", 0, false, 0, 1); 429 checkJSONFilesAreWritten(workspace.getAbsolutePath() + "/workspace"); 430 Files.move(workspace.toPath(), workspace2.toPath(), StandardCopyOption.REPLACE_EXISTING); 431 checkGenomicsDBAgainstExpected(workspace2.getAbsolutePath() + "/workspace", INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE); 432 } 433 434 @Test (enabled = true) testGenomicsDBAlleleSpecificAnnotations()435 public void testGenomicsDBAlleleSpecificAnnotations() throws IOException { 436 testGenomicsDBAgainstCombineGVCFs(Arrays.asList(COMBINEGVCFS_TEST_DIR+"NA12878.AS.chr20snippet.g.vcf", COMBINEGVCFS_TEST_DIR+"NA12892.AS.chr20snippet.g.vcf"), 437 new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("20", 10433000, 10700000))), 438 b37_reference_20_21, 439 new String[]{"-G", "StandardAnnotation", "-G", "AS_StandardAnnotation"}); 440 } 441 442 @Test (enabled = true) testGenomicsDBAlleleSpecificAnnotationsInTheMiddleOfSpanningDeletion()443 public void testGenomicsDBAlleleSpecificAnnotationsInTheMiddleOfSpanningDeletion() throws IOException { 444 testGenomicsDBAgainstCombineGVCFs(Arrays.asList(COMBINEGVCFS_TEST_DIR+"NA12878.AS.chr20snippet.g.vcf", COMBINEGVCFS_TEST_DIR+"NA12892.AS.chr20snippet.g.vcf"), 445 new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("20", 10433313, 10700000))), 446 b37_reference_20_21, 447 new String[]{"-G", "StandardAnnotation", "-G", "AS_StandardAnnotation"}); 448 } 449 450 /** 451 * Converts a list of large file paths into equivalent cloud paths 452 * This must be done non-statically because any failure during static initialization results in hard to understand 453 * TestNG errors and it is possible for {@link BaseTest#getGCPTestInputPath()} to fail if the environment isn't 454 * fully set up. 455 * 456 * The cloud bucket must be organized the same way as the local test files in order to resolve correctly. 457 */ resolveLargeFilesAsCloudURIs(final List<String> filenames)458 private static List<String> resolveLargeFilesAsCloudURIs(final List<String> filenames){ 459 return filenames.stream() 460 .map( filename -> filename.replace(publicTestDir, getGCPTestInputPath())) 461 .peek( filename -> Assert.assertTrue(BucketUtils.isGcsUrl(filename))) 462 .collect(Collectors.toList()); 463 } 464 465 @Test(dataProvider = "batchSizes") testGenomicsDBImportFileInputsInBatches(final int batchSize)466 public void testGenomicsDBImportFileInputsInBatches(final int batchSize) throws IOException { 467 testGenomicsDBImporterWithBatchSize(LOCAL_GVCFS, INTERVAL, COMBINED, batchSize); 468 } 469 470 @Test(dataProvider = "batchSizes") testGenomicsDBImportFileInputsInBatchesWithMultipleIntervals(final int batchSize)471 public void testGenomicsDBImportFileInputsInBatchesWithMultipleIntervals(final int batchSize) throws IOException { 472 testGenomicsDBImporterWithBatchSize(LOCAL_GVCFS, MULTIPLE_INTERVALS, COMBINED_MULTI_INTERVAL, batchSize); 473 } 474 475 @Test(groups = {"bucket"}, dataProvider = "batchSizes") testGenomicsDBImportGCSInputsInBatches(final int batchSize)476 public void testGenomicsDBImportGCSInputsInBatches(final int batchSize) throws IOException { 477 testGenomicsDBImporterWithBatchSize(resolveLargeFilesAsCloudURIs(LOCAL_GVCFS), INTERVAL, COMBINED, batchSize); 478 } 479 480 @DataProvider getThreads()481 public Object[][] getThreads(){ 482 return new Object[][] { 483 {1}, {2}, {5} 484 }; 485 } 486 487 @Test(groups = {"bucket"}, dataProvider = "getThreads") testDifferentThreadValuesFromABucket(final int threads)488 public void testDifferentThreadValuesFromABucket(final int threads) throws IOException { 489 final List<String> vcfInputs = resolveLargeFilesAsCloudURIs(LOCAL_GVCFS); 490 final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace"; 491 492 writeToGenomicsDB(vcfInputs, INTERVAL, workspace, 0, false, 0, threads); 493 checkJSONFilesAreWritten(workspace); 494 checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE); 495 } 496 497 @Test(dataProvider = "getThreads") testDifferentThreadValuesLocally(final int threads)498 public void testDifferentThreadValuesLocally(final int threads) throws IOException { 499 final List<String> vcfInputs = LOCAL_GVCFS; 500 final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace"; 501 502 writeToGenomicsDB(vcfInputs, INTERVAL, workspace, 0, false, 0, threads); 503 checkJSONFilesAreWritten(workspace); 504 checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE); 505 } 506 /** 507 * 508 * @throws CommandLineException.OutOfRangeArgumentValue Value must be >= 1024 bytes 509 */ 510 @Test(expectedExceptions = CommandLineException.OutOfRangeArgumentValue.class) testZeroVCFBufferSize()511 public void testZeroVCFBufferSize() throws IOException { 512 testGenomicsDBImportWithZeroBufferSize(LOCAL_GVCFS, INTERVAL, COMBINED); 513 } 514 515 testGenomicsDBImporter(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile, final boolean testAll, final int threads)516 private void testGenomicsDBImporter(final List<String> vcfInputs, final List<SimpleInterval> intervals, 517 final String expectedCombinedVCF, final String referenceFile, 518 final boolean testAll, final int threads) throws IOException { 519 testGenomicsDBImporter(vcfInputs, intervals, expectedCombinedVCF, referenceFile, testAll, threads, false); 520 } 521 testGenomicsDBImporter(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile, final boolean testAll, final int threads, final boolean useNativeReader)522 private void testGenomicsDBImporter(final List<String> vcfInputs, final List<SimpleInterval> intervals, 523 final String expectedCombinedVCF, final String referenceFile, 524 final boolean testAll, final int threads, final boolean useNativeReader) throws IOException { 525 final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace"; 526 writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, 1, false, false, false, 0, useNativeReader); 527 528 checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, referenceFile, testAll, ATTRIBUTES_TO_IGNORE); 529 } 530 testGenomicsDBImporter_newMQ(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile, final boolean testAll, final List<String> attributesToIgnore)531 private void testGenomicsDBImporter_newMQ(final List<String> vcfInputs, final List<SimpleInterval> intervals, 532 final String expectedCombinedVCF, final String referenceFile, 533 final boolean testAll, final List<String> attributesToIgnore) throws IOException { 534 final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace"; 535 536 writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, 1); 537 checkJSONFilesAreWritten(workspace); 538 539 checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, referenceFile, testAll, attributesToIgnore); 540 } 541 testGenomicsDBImporterWithBatchSize(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final int batchSize)542 private void testGenomicsDBImporterWithBatchSize(final List<String> vcfInputs, final List<SimpleInterval> intervals, 543 final String expectedCombinedVCF, final int batchSize) throws IOException { 544 testGenomicsDBImporterWithBatchSize(vcfInputs, intervals, expectedCombinedVCF, batchSize, false); 545 } 546 testGenomicsDBImporterWithBatchSize(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final int batchSize, final boolean useNativeReader)547 private void testGenomicsDBImporterWithBatchSize(final List<String> vcfInputs, final List<SimpleInterval> intervals, 548 final String expectedCombinedVCF, final int batchSize, 549 final boolean useNativeReader) throws IOException { 550 final String workspace = createTempDir("genomicsdb-batchsize-tests-").getAbsolutePath() + "/workspace-" + batchSize; 551 552 writeToGenomicsDB(vcfInputs, intervals, workspace, batchSize, false, 0, 1, false, false, false, 0, true); 553 checkJSONFilesAreWritten(workspace); 554 checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE); 555 } 556 testGenomicsDBImportWithZeroBufferSize(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String expectedCombinedVCF)557 private void testGenomicsDBImportWithZeroBufferSize(final List<String> vcfInputs, final List<SimpleInterval> intervals, 558 final String expectedCombinedVCF) throws IOException { 559 final String workspace = createTempDir("genomicsdb-buffersize-tests-").getAbsolutePath() + "/workspace"; 560 561 writeToGenomicsDB(vcfInputs, intervals, workspace, 0, true, 0, 1); 562 checkJSONFilesAreWritten(workspace); 563 checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE); 564 565 } 566 writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String workspace, final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads)567 private void writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String workspace, 568 final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads) { 569 writeToGenomicsDB(vcfInputs, intervals, workspace, batchSize, useBufferSize, bufferSizePerSample, threads, false); 570 } 571 writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String workspace, final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads, final boolean mergeIntervals)572 private void writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String workspace, 573 final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads, final boolean mergeIntervals) { 574 writeToGenomicsDB(vcfInputs, intervals, workspace, batchSize, useBufferSize, bufferSizePerSample, threads, mergeIntervals, false, false); 575 } 576 writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String workspace, final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads, final boolean mergeIntervals, final boolean overwriteWorkspace, final boolean incremental)577 private void writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String workspace, 578 final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads, 579 final boolean mergeIntervals, final boolean overwriteWorkspace, final boolean incremental) { 580 writeToGenomicsDB(vcfInputs, intervals, workspace, batchSize, useBufferSize, bufferSizePerSample, threads, mergeIntervals, 581 overwriteWorkspace, incremental, 0, false); 582 } 583 writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String workspace, final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads, final boolean mergeIntervals, final boolean overwriteWorkspace, final boolean incremental, final int chrsToPartitions, final boolean useNativeReader)584 private void writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleInterval> intervals, final String workspace, 585 final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads, 586 final boolean mergeIntervals, final boolean overwriteWorkspace, final boolean incremental, 587 final int chrsToPartitions, final boolean useNativeReader) { 588 final ArgumentsBuilder args = new ArgumentsBuilder(); 589 if (incremental) { 590 args.add(GenomicsDBImport.INCREMENTAL_WORKSPACE_ARG_LONG_NAME, workspace); 591 } else { 592 args.add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, workspace); 593 } 594 intervals.forEach(args::addInterval); 595 vcfInputs.forEach(vcf -> args.add("V", vcf)); 596 args.add("batch-size", String.valueOf(batchSize)); 597 args.add(GenomicsDBImport.VCF_INITIALIZER_THREADS_LONG_NAME, String.valueOf(threads)); 598 args.add(GenomicsDBImport.MERGE_INPUT_INTERVALS_LONG_NAME, mergeIntervals); 599 args.add(GenomicsDBImport.OVERWRITE_WORKSPACE_LONG_NAME, overwriteWorkspace); 600 if (chrsToPartitions != 0) { 601 args.add(GenomicsDBImport.MERGE_CONTIGS_INTO_NUM_PARTITIONS, String.valueOf(chrsToPartitions)); 602 } 603 if (useBufferSize) { 604 args.add("genomicsdb-vcf-buffer-size", String.valueOf(bufferSizePerSample)); 605 } 606 607 runCommandLine(args); 608 if (chrsToPartitions != 0) { 609 String[] partitions = GenomicsDBUtils.listGenomicsDBArrays(workspace); 610 // it may not always be the case that the number of partitions created matches 611 // the number we specified, but will be true for our tests 612 Assert.assertTrue(partitions.length == chrsToPartitions); 613 } 614 } 615 checkJSONFilesAreWritten(final String workspace)616 private static void checkJSONFilesAreWritten(final String workspace) { 617 Assert.assertTrue(BucketUtils.fileExists(IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME))); 618 Assert.assertTrue(BucketUtils.fileExists(IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME))); 619 Assert.assertTrue(BucketUtils.fileExists(IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME))); 620 } 621 checkGenomicsDBAgainstExpected(final String workspace, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile, final boolean testAll, final List<String> attributesToIgnore)622 private static void checkGenomicsDBAgainstExpected(final String workspace, final List<SimpleInterval> intervals, 623 final String expectedCombinedVCF, final String referenceFile, 624 final boolean testAll, final List<String> attributesToIgnore) throws IOException { 625 checkGenomicsDBAgainstExpected(workspace, intervals, 626 expectedCombinedVCF, referenceFile, 627 testAll, 628 attributesToIgnore, 629 false, 630 false, 631 false); 632 } 633 checkGenomicsDBAgainstExpected(final String workspace, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile, final boolean testAll, final List<String> attributesToIgnore, final boolean produceGTfield, final boolean sitesOnlyQuery)634 private static void checkGenomicsDBAgainstExpected(final String workspace, final List<SimpleInterval> intervals, 635 final String expectedCombinedVCF, final String referenceFile, 636 final boolean testAll, final List<String> attributesToIgnore, 637 final boolean produceGTfield, final boolean sitesOnlyQuery) throws IOException { 638 checkGenomicsDBAgainstExpected(workspace, intervals, 639 expectedCombinedVCF, referenceFile, 640 testAll, 641 attributesToIgnore, 642 produceGTfield, 643 sitesOnlyQuery, 644 false); 645 } 646 checkGenomicsDBAgainstExpected(final String workspace, final List<SimpleInterval> intervals, final String expectedCombinedVCF, final String referenceFile, final boolean testAll, final List<String> attributesToIgnore, final boolean produceGTField, final boolean sitesOnlyQuery, final boolean useVCFCodec)647 private static void checkGenomicsDBAgainstExpected(final String workspace, final List<SimpleInterval> intervals, 648 final String expectedCombinedVCF, final String referenceFile, 649 final boolean testAll, 650 final List<String> attributesToIgnore, 651 final boolean produceGTField, 652 final boolean sitesOnlyQuery, 653 final boolean useVCFCodec) throws IOException { 654 final FeatureReader<VariantContext> genomicsDBFeatureReader = 655 getGenomicsDBFeatureReader(workspace, referenceFile, produceGTField, sitesOnlyQuery, useVCFCodec); 656 657 final AbstractFeatureReader<VariantContext, LineIterator> combinedVCFReader = 658 AbstractFeatureReader.getFeatureReader(expectedCombinedVCF, new VCFCodec(), true); 659 660 661 intervals.forEach(interval -> { 662 try (CloseableTribbleIterator<VariantContext> actualVcs = 663 genomicsDBFeatureReader.query(interval.getContig(), interval.getStart(), interval.getEnd()); 664 665 CloseableTribbleIterator<VariantContext> expectedVcs = 666 combinedVCFReader.query(interval.getContig(), interval.getStart(), interval.getEnd())) { 667 668 BaseTest.assertCondition(actualVcs, expectedVcs, (a, e) -> { 669 // Test that the VCs match 670 if (testAll) { 671 // To correct a discrepancy between genotypeGVCFs which outputs empty genotypes as "./." and GenomicsDB 672 // which returns them as "." we simply remap the empty ones to be consistent for comparison 673 List<Genotype> genotypes = a.getGenotypes().stream() 674 .map(g -> g.getGenotypeString().equals(".")?new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(2)).make():g) 675 .collect(Collectors.toList()); 676 a = new VariantContextBuilder(a).genotypes(genotypes).make(); 677 VariantContextTestUtils.assertVariantContextsAreEqualAlleleOrderIndependent(a, e, attributesToIgnore, Collections.emptyList(), VCF_HEADER); 678 679 // Test only that the genotypes match 680 } else { 681 List<Genotype> genotypes = e.getGenotypes().stream() 682 .map(g -> g.getGenotypeString().equals(".")?new GenotypeBuilder(g).alleles(Collections.emptyList()).make():g) 683 .collect(Collectors.toList()); 684 e = new VariantContextBuilder(e).genotypes(genotypes).make(); 685 VariantContextTestUtils.assertVariantContextsHaveSameGenotypes(a, e); 686 } 687 }); 688 } catch (IOException e) { 689 Assert.fail(e.getMessage(), e); 690 } 691 }); 692 } 693 694 @DataProvider getOrderingTests()695 public Iterator<Object[]> getOrderingTests(){ 696 final File outOfOrderSampleMap = getSampleMapFile( 697 "HG00268\t" + HG_00268 + "\n" + 698 "NA19625\t" + NA_19625 + "\n" + 699 "HG00096\t" + HG_00096); 700 701 final List<Integer> batchSizes = Arrays.asList(0, 1, 2, 3, 4); 702 final List<Object[]> results = new ArrayList<>(); 703 for( final Integer batchSize: batchSizes){ 704 // -V in order 705 results.add(new Object[] {new ArgumentsBuilder() 706 .add(GenomicsDBImport.BATCHSIZE_ARG_LONG_NAME, String.valueOf(batchSize)) 707 .addVCF(new File(HG_00096)) 708 .addVCF(new File(HG_00268)) 709 .addVCF(new File(NA_19625))}); 710 711 // -V out of order 712 results.add(new Object[] {new ArgumentsBuilder() 713 .add(GenomicsDBImport.BATCHSIZE_ARG_LONG_NAME, String.valueOf(batchSize)) 714 .addVCF(new File(HG_00268)) 715 .addVCF(new File(NA_19625)) 716 .addVCF(new File(HG_00096))}); 717 718 //in order sample map 719 results.add(new Object[] {new ArgumentsBuilder() 720 .add(GenomicsDBImport.BATCHSIZE_ARG_LONG_NAME, String.valueOf(batchSize)) 721 .add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, createInOrderSampleMap())}); 722 723 //out of order sample map 724 results.add(new Object[] {new ArgumentsBuilder() 725 .add(GenomicsDBImport.BATCHSIZE_ARG_LONG_NAME, String.valueOf(batchSize)) 726 .add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, outOfOrderSampleMap)}); 727 728 //out of order sample map with multiple threads 729 results.add(new Object[] {new ArgumentsBuilder() 730 .add(GenomicsDBImport.BATCHSIZE_ARG_LONG_NAME, String.valueOf(batchSize)) 731 .add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, outOfOrderSampleMap) 732 .add(GenomicsDBImport.VCF_INITIALIZER_THREADS_LONG_NAME, "2")}); 733 } 734 return results.iterator(); 735 } 736 737 @Test testSampleNameWithSpaces()738 public void testSampleNameWithSpaces() throws IOException { 739 final File outOfOrderSampleMap = getSampleMapFile( 740 "HG00268 withSpaces\t" + HG_00268_WITH_SPACES + "\n" + 741 "NA19625\t" + NA_19625 + "\n" + 742 "HG00096\t" + HG_00096 ); 743 744 final String workspace = createTempDir("gendbtest").getAbsolutePath() + "/workspace"; 745 746 ArgumentsBuilder args = new ArgumentsBuilder() 747 .add(GenomicsDBImport.BATCHSIZE_ARG_LONG_NAME, String.valueOf(2)) 748 .add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, outOfOrderSampleMap) 749 .addInterval(SMALLER_INTERVAL.get(0)) 750 .add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, workspace); 751 752 runCommandLine(args); 753 checkJSONFilesAreWritten(workspace); 754 checkGenomicsDBAgainstExpected(workspace, SMALLER_INTERVAL, COMBINED_WITHSPACES, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE); 755 checkGenomicsDBAgainstExpected(workspace, SMALLER_INTERVAL, COMBINED_WITHSPACES, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE, false, false, true); 756 } 757 758 @Test(dataProvider = "getOrderingTests") testSampleNameOrdering(final ArgumentsBuilder args)759 public void testSampleNameOrdering(final ArgumentsBuilder args) throws IOException { 760 final String workspace = createTempDir("gendbtest").getAbsolutePath() + "/workspace"; 761 762 args.addInterval(INTERVAL.get(0)) 763 .add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, workspace); 764 765 runCommandLine(args); 766 checkJSONFilesAreWritten(workspace); 767 checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE); 768 checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE, false, false, true); 769 } 770 createInOrderSampleMap()771 private static File createInOrderSampleMap() { 772 final String sampleFileContents = 773 "HG00096\t" +HG_00096 +"\n" + 774 "HG00268\t"+ HG_00268 + "\n" + 775 "NA19625\t"+ NA_19625; 776 777 return getSampleMapFile(sampleFileContents); 778 } 779 getSampleMapFile(final String sampleFileContents)780 private static File getSampleMapFile(final String sampleFileContents) { 781 final File sampleNameMap = IOUtils.writeTempFile(sampleFileContents, "sampleNameMap", ".txt"); 782 sampleNameMap.deleteOnExit(); 783 return sampleNameMap; 784 } 785 getSampleMapFile(final Map<String, String> mapping)786 private static File getSampleMapFile(final Map<String, String> mapping){ 787 return getSampleMapFile(mapping.entrySet() 788 .stream() 789 .map( pair -> pair.getKey() + "\t" + pair.getValue()) 790 .collect(Collectors.joining("\n"))); 791 } 792 793 @DataProvider getRenameCombinations()794 public static Iterator<Object[]> getRenameCombinations() { 795 final Map<String,String> noRemapping = new LinkedHashMap<>(); 796 noRemapping.put("s1", "s1"); 797 noRemapping.put("s2", "s2"); 798 noRemapping.put("s3", "s3"); 799 800 final Map<String,String> sameInput = new LinkedHashMap<>(); 801 sameInput.put("s1", "s1"); 802 sameInput.put("s2", "s1"); 803 sameInput.put("s3", "s1"); 804 805 806 final Map<String,String> sameInputWeirdOrder = new LinkedHashMap<>(); 807 sameInputWeirdOrder.put("s3", "s1"); 808 sameInputWeirdOrder.put("s1", "s1"); 809 sameInputWeirdOrder.put("s2", "s1"); 810 811 final Map<String,String> swizzled = new LinkedHashMap<>(); 812 swizzled.put("s2","s1"); 813 swizzled.put("s3","s2"); 814 swizzled.put("s1","s3"); 815 816 final Map<String,String> multipleOutOfOrderRenamingsAcrossBatches = new LinkedHashMap<>(); 817 multipleOutOfOrderRenamingsAcrossBatches.put("s1", "s1"); 818 multipleOutOfOrderRenamingsAcrossBatches.put("s2", "s2"); 819 multipleOutOfOrderRenamingsAcrossBatches.put("s1_Renamed", "s1"); 820 multipleOutOfOrderRenamingsAcrossBatches.put("Renamed_s2", "s2"); 821 multipleOutOfOrderRenamingsAcrossBatches.put("s4", "s3"); 822 multipleOutOfOrderRenamingsAcrossBatches.put("s3", "s3"); 823 multipleOutOfOrderRenamingsAcrossBatches.put("someOtherSample", "s4"); 824 825 826 final List<Integer> batchSizes = Arrays.asList(0, 1, 4); 827 final List<Integer> threads = Arrays.asList(1, 2); 828 final List<Map<String, String>> mappings = Arrays.asList(noRemapping, sameInput, sameInputWeirdOrder, swizzled, multipleOutOfOrderRenamingsAcrossBatches); 829 final List<Object[]> out = new ArrayList<>(); 830 for(final Map<String,String> mapping : mappings){ 831 for(final int batchSize :batchSizes){ 832 for(final int threading : threads){ 833 out.add( new Object[]{mapping, threading, batchSize}); 834 } 835 } 836 } 837 return out.iterator(); 838 } 839 840 @Test(dataProvider = "getRenameCombinations") testRenamingSamples(final Map<String, String> renamingMap, final int threads, final int batchSize)841 public void testRenamingSamples(final Map<String, String> renamingMap, final int threads, final int batchSize) throws IOException { 842 final LinkedHashMap<String, String> sampleMap = new LinkedHashMap<>(renamingMap); 843 sampleMap.replaceAll( (newSampleName, originalSampleName)-> createInputVCF(originalSampleName).getAbsolutePath()); 844 845 final File sampleMapFile = getSampleMapFile(sampleMap); 846 847 final String workspace = createTempDir("workspace").getAbsolutePath(); 848 Files.delete(Paths.get(workspace)); 849 final ArgumentsBuilder args = new ArgumentsBuilder() 850 .add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, sampleMapFile.getAbsolutePath()) 851 .add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, new File(workspace).getAbsolutePath()) 852 .add(GenomicsDBImport.VCF_INITIALIZER_THREADS_LONG_NAME, String.valueOf(threads)) 853 .add(GenomicsDBImport.BATCHSIZE_ARG_LONG_NAME, String.valueOf(batchSize)) 854 .addInterval(INTERVAL.get(0)); 855 856 runCommandLine(args); 857 final Set<String> expectedSampleNames = sampleMap.keySet(); 858 try(final FeatureReader<VariantContext> reader = getGenomicsDBFeatureReader(workspace, b37_reference_20_21)) { 859 final CloseableTribbleIterator<VariantContext> iterator = reader.iterator(); 860 Assert.assertTrue(iterator.hasNext(), "expected to see a variant"); 861 Assert.assertTrue(expectedSampleNames.size() > 0); 862 Assert.assertEquals(expectedSampleNames.size(), renamingMap.size()); 863 iterator.forEachRemaining(vc -> { 864 Assert.assertEquals(vc.getSampleNames().size(), expectedSampleNames.size()); 865 Assert.assertEqualsNoOrder(vc.getSampleNames().toArray(), expectedSampleNames.toArray()); 866 expectedSampleNames.forEach( sample -> { 867 Assert.assertEquals(vc.getGenotype(sample).getAnyAttribute(SAMPLE_NAME_KEY), renamingMap.get(sample)); 868 //check another attribute just to make sure we're not mangling things 869 Assert.assertEquals(VariantContextGetters.getAttributeAsInt(vc.getGenotype(sample), ANOTHER_ATTRIBUTE_KEY, -1), 10); 870 }); 871 }); 872 } 873 874 } 875 createInputVCF(final String sampleName)876 private static File createInputVCF(final String sampleName) { 877 final String contig = "chr20"; 878 final SAMSequenceDictionary dict = new SAMSequenceDictionary( 879 Collections.singletonList(new SAMSequenceRecord(contig, 64444167))); 880 881 final VCFFormatHeaderLine formatField = new VCFFormatHeaderLine(SAMPLE_NAME_KEY, 1, VCFHeaderLineType.String, 882 "the name of the sample this genotype came from"); 883 final Set<VCFHeaderLine> headerLines = new HashSet<>(); 884 headerLines.add(formatField); 885 headerLines.add(new VCFFormatHeaderLine(ANOTHER_ATTRIBUTE_KEY, 1, VCFHeaderLineType.Integer, "Another value")); 886 headerLines.add(VCFStandardHeaderLines.getFormatLine("GT")); 887 888 final File out = createTempFile(sampleName +"_", ".vcf"); 889 try (final VariantContextWriter writer = GATKVariantContextUtils.createVCFWriter(out.toPath(), dict, false, 890 Options.INDEX_ON_THE_FLY)) { 891 final VCFHeader vcfHeader = new VCFHeader(headerLines, Collections.singleton(sampleName)); 892 vcfHeader.setSequenceDictionary(dict); 893 writer.writeHeader(vcfHeader); 894 final Allele Aref = Allele.create("A", true); 895 final Allele C = Allele.create("C"); 896 final List<Allele> alleles = Arrays.asList(Aref, C); 897 final VariantContext variant = new VariantContextBuilder("invented", contig, INTERVAL.get(0).getStart(), INTERVAL.get(0).getStart(), alleles) 898 .genotypes(new GenotypeBuilder(sampleName, alleles).attribute(SAMPLE_NAME_KEY, sampleName) 899 .attribute(ANOTHER_ATTRIBUTE_KEY, 10).make()) 900 .make(); 901 writer.add(variant); 902 return out; 903 } 904 } 905 906 @Test(expectedExceptions = CommandLineException.class) testCantSpecifyVCFAndSampleNameFile()907 public void testCantSpecifyVCFAndSampleNameFile(){ 908 final ArgumentsBuilder args = new ArgumentsBuilder() 909 .add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, createInOrderSampleMap().getAbsolutePath()) 910 .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, HG_00096) 911 .add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, createTempDir("workspace").getAbsolutePath()) 912 .addInterval(INTERVAL.get(0)); 913 runCommandLine(args); 914 } 915 916 @Test(expectedExceptions = CommandLineException.MissingArgument.class) testRequireOneOfVCFOrSampleNameFile()917 public void testRequireOneOfVCFOrSampleNameFile(){ 918 final ArgumentsBuilder args = new ArgumentsBuilder() 919 .add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, createTempDir("workspace").getAbsolutePath()) 920 .add("L", "1:1-10"); 921 922 runCommandLine(args); 923 } 924 925 @Test testGenomicsDBImportWithoutDBField()926 public void testGenomicsDBImportWithoutDBField() throws IOException { 927 //Test for https://github.com/broadinstitute/gatk/issues/3736 928 final List<String> vcfInputs = Arrays.asList(NA_24385); 929 final String workspace = createTempDir("genomicsdb-tests").getAbsolutePath() + "/workspace"; 930 writeToGenomicsDB(vcfInputs, INTERVAL_3736, workspace, 0, false, 0, 1); 931 } 932 933 @Test testLongWorkspacePath()934 public void testLongWorkspacePath() throws IOException { 935 //Test for https://github.com/broadinstitute/gatk/issues/4160 936 final List<String> vcfInputs = LOCAL_GVCFS; 937 final String workspace = createTempDir("long_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_genomicsdb").getAbsolutePath() + "/should_not_fail_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; 938 writeToGenomicsDB(vcfInputs, INTERVAL, workspace, 0, false, 0, 1); 939 } 940 941 @Test testCommandIncludedInOutputHeader()942 public void testCommandIncludedInOutputHeader() throws IOException { 943 final List<String> vcfInputs = LOCAL_GVCFS; 944 final String workspace = createTempDir("genomicsdb-tests").getAbsolutePath() + "/workspace"; 945 946 writeToGenomicsDB(vcfInputs, INTERVAL, workspace, 0, false, 0, 1); 947 try(final FeatureReader<VariantContext> genomicsDBFeatureReader = 948 getGenomicsDBFeatureReader(workspace, b38_reference_20_21)) 949 { 950 final VCFHeader header = (VCFHeader) genomicsDBFeatureReader.getHeader(); 951 final Optional<VCFHeaderLine> commandLineHeaderLine = header.getMetaDataInSortedOrder().stream() 952 .filter(line -> line.getValue().contains(GenomicsDBImport.class.getSimpleName())) 953 .findAny(); 954 955 Assert.assertTrue(commandLineHeaderLine.isPresent(), "no headerline was present containing information about the GenomicsDBImport command"); 956 } 957 958 959 } 960 961 @Test testPreserveContigOrderingInHeader()962 public void testPreserveContigOrderingInHeader() throws IOException { 963 final String workspace = createTempDir("testPreserveContigOrderingInHeader-").getAbsolutePath() + "/workspace"; 964 ArrayList<SimpleInterval> intervals = new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("chr20", 17959479, 17959479))); 965 writeToGenomicsDB(Arrays.asList(GENOMICSDB_TEST_DIR + "testHeaderContigLineSorting1.g.vcf", 966 GENOMICSDB_TEST_DIR + "testHeaderContigLineSorting2.g.vcf"), intervals, workspace, 0, false, 0, 1); 967 968 try ( final FeatureReader<VariantContext> genomicsDBFeatureReader = 969 getGenomicsDBFeatureReader(workspace, b38_reference_20_21); 970 971 final AbstractFeatureReader<VariantContext, LineIterator> inputGVCFReader = 972 AbstractFeatureReader.getFeatureReader(GENOMICSDB_TEST_DIR + "testHeaderContigLineSorting1.g.vcf", new VCFCodec(), true); 973 ) { 974 final SAMSequenceDictionary dictionaryFromGenomicsDB = ((VCFHeader)genomicsDBFeatureReader.getHeader()).getSequenceDictionary(); 975 final SAMSequenceDictionary dictionaryFromInputGVCF = ((VCFHeader)inputGVCFReader.getHeader()).getSequenceDictionary(); 976 977 Assert.assertEquals(dictionaryFromGenomicsDB, dictionaryFromInputGVCF, "Sequence dictionary from GenomicsDB does not match original sequence dictionary from input GVCF"); 978 } 979 980 } getGenomicsDBFeatureReader( final String workspace, final String reference, final boolean produceGTField)981 private static FeatureReader<VariantContext> getGenomicsDBFeatureReader( 982 final String workspace, final String reference, 983 final boolean produceGTField) throws IOException { 984 return getGenomicsDBFeatureReader(workspace, reference, 985 produceGTField, false); 986 } 987 getGenomicsDBFeatureReader( final String workspace, final String reference, final boolean produceGTField, final boolean sitesOnlyQuery)988 private static FeatureReader<VariantContext> getGenomicsDBFeatureReader( 989 final String workspace, final String reference, 990 final boolean produceGTField, 991 final boolean sitesOnlyQuery) throws IOException { 992 return getGenomicsDBFeatureReader(workspace, reference, 993 produceGTField, sitesOnlyQuery, false); 994 } 995 getGenomicsDBFeatureReader( final String workspace, final String reference, final boolean produceGTField, final boolean sitesOnlyQuery, final boolean useVCFCodec)996 private static FeatureReader<VariantContext> getGenomicsDBFeatureReader( 997 final String workspace, final String reference, 998 final boolean produceGTField, 999 final boolean sitesOnlyQuery, 1000 final boolean useVCFCodec) throws IOException { 1001 String workspaceAbsPath = BucketUtils.makeFilePathAbsolute(workspace); 1002 GenomicsDBExportConfiguration.ExportConfiguration.Builder exportConfigurationBuilder = GenomicsDBExportConfiguration.ExportConfiguration.newBuilder() 1003 .setWorkspace(workspace) 1004 .setReferenceGenome(reference) 1005 .setVidMappingFile(IOUtils.appendPathToDir(workspaceAbsPath, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME)) 1006 .setCallsetMappingFile(IOUtils.appendPathToDir(workspaceAbsPath, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME)) 1007 .setVcfHeaderFilename(IOUtils.appendPathToDir(workspaceAbsPath, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME)) 1008 .setProduceGTField(produceGTField) 1009 .setSitesOnlyQuery(sitesOnlyQuery) 1010 .setGenerateArrayNameFromPartitionBounds(true); 1011 GenomicsDBVidMapProto.VidMappingPB vidMapPB = null; 1012 try { 1013 vidMapPB = org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.getProtobufVidMappingFromJsonFile(IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME)); 1014 } 1015 catch (final IOException e) { 1016 throw new UserException("Could not open vid json file "+GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME, e); 1017 } 1018 HashMap<String, Integer> fieldNameToIndexInVidFieldsList = 1019 org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB); 1020 1021 vidMapPB = org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, 1022 GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, "element_wise_sum"); 1023 1024 if(vidMapPB != null) { 1025 exportConfigurationBuilder.setVidMapping(vidMapPB); 1026 } 1027 1028 if (useVCFCodec) { 1029 return new GenomicsDBFeatureReader<>(exportConfigurationBuilder.build(), new VCFCodec(), Optional.empty()); 1030 } else { 1031 return new GenomicsDBFeatureReader<>(exportConfigurationBuilder.build(), new BCF2Codec(), Optional.empty()); 1032 } 1033 } 1034 getGenomicsDBFeatureReader( final String workspace, final String reference)1035 private static FeatureReader<VariantContext> getGenomicsDBFeatureReader( 1036 final String workspace, final String reference) throws IOException { 1037 return getGenomicsDBFeatureReader(workspace, reference, false); 1038 } 1039 1040 @Test(expectedExceptions = GenomicsDBImport.UnableToCreateGenomicsDBWorkspace.class) testYouCantWriteIntoAnExistingDirectory()1041 public void testYouCantWriteIntoAnExistingDirectory(){ 1042 // this actually creates the directory on disk, not just the file name. 1043 final String workspace = createTempDir("workspace").getAbsolutePath(); 1044 writeToGenomicsDB(LOCAL_GVCFS, INTERVAL, workspace, 0, false, 0, 1); 1045 } 1046 1047 @Test(expectedExceptions = CommandLineException.class) testOverwriteWorkspaceAndIncrementalImportCannotBothBeTrue()1048 public void testOverwriteWorkspaceAndIncrementalImportCannotBothBeTrue() { 1049 final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace"; 1050 writeToGenomicsDB(LOCAL_GVCFS, INTERVAL, workspace, 0, false, 0, 1, false, true, true); 1051 } 1052 1053 @Test(expectedExceptions = UserException.class) testIncrementalMustHaveExistingWorkspace()1054 public void testIncrementalMustHaveExistingWorkspace() { 1055 final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath(); 1056 writeToGenomicsDB(LOCAL_GVCFS, INTERVAL, workspace + "workspace2", 0, false, 0, 1, false, false, true); 1057 } 1058 testIncrementalImport(final int stepSize, final List<SimpleInterval> intervals, final String workspace, final int batchSize, final boolean produceGTField, final boolean useVCFCodec, final String expected, final int chrsToPartitions, final boolean useNativeReader)1059 private void testIncrementalImport(final int stepSize, final List<SimpleInterval> intervals, final String workspace, 1060 final int batchSize, final boolean produceGTField, final boolean useVCFCodec, final String expected, 1061 final int chrsToPartitions, final boolean useNativeReader) throws IOException { 1062 for(int i=0; i<LOCAL_GVCFS.size(); i+=stepSize) { 1063 int upper = Math.min(i+stepSize, LOCAL_GVCFS.size()); 1064 writeToGenomicsDB(LOCAL_GVCFS.subList(i, upper), intervals, workspace, batchSize, false, 0, 1, false, false, i!=0, 1065 chrsToPartitions, i!=0 && useNativeReader); 1066 checkJSONFilesAreWritten(workspace); 1067 } 1068 for(SimpleInterval currInterval : intervals) { 1069 List<SimpleInterval> tmpList = new ArrayList<SimpleInterval>(Arrays.asList(currInterval)); 1070 String expectedVcf = expected; 1071 if (expected.isEmpty()) { 1072 File expectedCombinedVCF = runCombineGVCFs(LOCAL_GVCFS, tmpList, b38_reference_20_21, new String[0]); 1073 expectedVcf = expectedCombinedVCF.getAbsolutePath(); 1074 } 1075 checkGenomicsDBAgainstExpected(workspace, tmpList, expectedVcf, b38_reference_20_21, true, 1076 ATTRIBUTES_TO_IGNORE, produceGTField, false); 1077 if (useVCFCodec) { 1078 checkGenomicsDBAgainstExpected(workspace, tmpList, expectedVcf, b38_reference_20_21, true, 1079 ATTRIBUTES_TO_IGNORE, produceGTField, false, true); 1080 } 1081 } 1082 } 1083 1084 @Test testGenomicsDBBasicIncremental()1085 public void testGenomicsDBBasicIncremental() throws IOException { 1086 final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace"; 1087 testIncrementalImport(2, INTERVAL, workspace, 0, true, true, COMBINED_WITH_GENOTYPES, 0, false); 1088 createAndCheckIntervalListFromExistingWorkspace(workspace, INTERVAL_PICARD_STYLE_EXPECTED); 1089 } 1090 1091 @Test testGenomicsDBIncrementalAndBatchSize1WithNonAdjacentIntervals()1092 public void testGenomicsDBIncrementalAndBatchSize1WithNonAdjacentIntervals() throws IOException { 1093 final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace"; 1094 testIncrementalImport(2, MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, workspace, 1, false, true, "", 0, false); 1095 createAndCheckIntervalListFromExistingWorkspace(workspace, MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS_PICARD_STYLE_EXPECTED); 1096 } 1097 1098 @Test testGenomicsDBIncrementalAndBatchSize1WithNonAdjacentIntervalsMergeContigsIntoPartitions()1099 public void testGenomicsDBIncrementalAndBatchSize1WithNonAdjacentIntervalsMergeContigsIntoPartitions() throws IOException { 1100 final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace"; 1101 testIncrementalImport(2, INTERVAL_20_21, workspace, 1, false, true, "", 1, false); 1102 createAndCheckIntervalListFromExistingWorkspace(workspace, MERGED_CONTIGS_INTERVAL_PICARD_STYLE_EXPECTED); 1103 } 1104 1105 @Test testGenomicsDBIncrementalAndBatchSize2()1106 public void testGenomicsDBIncrementalAndBatchSize2() throws IOException { 1107 final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace"; 1108 testIncrementalImport(2, MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, workspace, 2, true, false, 1109 COMBINED_WITH_GENOTYPES, 0, false); 1110 } 1111 1112 @Test testGenomicsDBMultipleIncrementalImports()1113 public void testGenomicsDBMultipleIncrementalImports() throws IOException { 1114 final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace"; 1115 testIncrementalImport(1, MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, workspace, 2, true, true, 1116 COMBINED_WITH_GENOTYPES, 0, false); 1117 } 1118 1119 @Test testGenomicsDBIncrementalWithManyNonAdjacentContigsToSeveralPartitions()1120 public void testGenomicsDBIncrementalWithManyNonAdjacentContigsToSeveralPartitions() throws IOException { 1121 List<SimpleInterval> manyContigs = MANY_CONTIGS_NON_ADJACENT_INTERVALS.stream().map(SimpleInterval::new).collect(Collectors.toList()); 1122 final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace"; 1123 1124 writeToGenomicsDB(MANY_CONTIGS_VCF.subList(0, 1), manyContigs, workspace, 0, false, 0, 1, false, false, false, SEVERAL_CONTIGS, false); 1125 writeToGenomicsDB(MANY_CONTIGS_VCF.subList(1, 2), manyContigs, workspace, 0, false, 0, 1, false, false, true, SEVERAL_CONTIGS, false); 1126 checkJSONFilesAreWritten(workspace); 1127 checkGenomicsDBAgainstExpected(workspace, manyContigs, EXPECTED_SEVERAL_CONTIGS_VCF, MANY_CONTIGS_REF, true, 1128 MANY_CONTIGS_ATTRIBUTES_TO_IGNORE, true, false); 1129 1130 createAndCheckIntervalListFromExistingWorkspace(workspace, MANY_CONTIGS_INTERVAL_PICARD_STYLE_EXPECTED); 1131 } 1132 createAndCheckIntervalListFromExistingWorkspace(final String workspace, final String expectedOutput)1133 private void createAndCheckIntervalListFromExistingWorkspace(final String workspace, final String expectedOutput) { 1134 final ArgumentsBuilder args = new ArgumentsBuilder(); 1135 final String outputIntervalList = workspace + "interval_output"; 1136 args.add(GenomicsDBImport.INCREMENTAL_WORKSPACE_ARG_LONG_NAME, workspace); 1137 args.add(GenomicsDBImport.INTERVAL_LIST_LONG_NAME, outputIntervalList); 1138 1139 runCommandLine(args); 1140 1141 final IntervalList generatedInterval = IntervalList.fromFile(new File(outputIntervalList)); 1142 final IntervalList expectedInterval = IntervalList.fromFile(new File(expectedOutput)); 1143 Assert.assertTrue(generatedInterval.sorted().equals(expectedInterval.sorted())); 1144 } 1145 basicWriteAndQueryWithOptions(String workspace, Map<String, Object> options)1146 void basicWriteAndQueryWithOptions(String workspace, Map<String, Object> options) throws IOException { 1147 final ArgumentsBuilder args = new ArgumentsBuilder(); 1148 args.add(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, workspace); 1149 INTERVAL.forEach(args::addInterval); 1150 LOCAL_GVCFS.forEach(vcf -> args.add("V", vcf)); 1151 for ( String key : options.keySet()) { 1152 if (key.equals(GenomicsDBImport.SHARED_POSIXFS_OPTIMIZATIONS)) { 1153 Assert.assertTrue(options.get(key) instanceof Boolean); 1154 args.add(GenomicsDBImport.SHARED_POSIXFS_OPTIMIZATIONS, (Boolean)options.get(key)); 1155 } 1156 if (key.equals(GenomicsDBImport.OVERWRITE_WORKSPACE_LONG_NAME)) { 1157 Assert.assertTrue(options.get(key) instanceof Boolean); 1158 args.add(GenomicsDBImport.OVERWRITE_WORKSPACE_LONG_NAME, (Boolean)options.get(key)); 1159 } 1160 } 1161 runCommandLine(args); 1162 checkJSONFilesAreWritten(workspace); 1163 checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE); 1164 } 1165 1166 @Test testWithMiscOptions()1167 public void testWithMiscOptions() throws IOException { 1168 final String workspace = createTempDir("genomicsdb-misc-options").getAbsolutePath() + "/workspace"; 1169 IOUtils.deleteOnExit(IOUtils.getPath(workspace)); 1170 Map<String, Object> options = new HashMap<String, Object>(); 1171 1172 // Test with shared posixfs optimizations set 1173 options.put(GenomicsDBImport.SHARED_POSIXFS_OPTIMIZATIONS, true); 1174 basicWriteAndQueryWithOptions(workspace, options); 1175 1176 // Test with shared posixfs optimizations and overwrite workspace set 1177 options.put(GenomicsDBImport.OVERWRITE_WORKSPACE_LONG_NAME, true); 1178 basicWriteAndQueryWithOptions(workspace, options); 1179 } 1180 1181 @Test(expectedExceptions = GenomicsDBImport.UnableToCreateGenomicsDBWorkspace.class) testWithMiscOptionsNoOverwrite()1182 public void testWithMiscOptionsNoOverwrite() throws IOException { 1183 final String workspace = createTempDir("genomicsdb-misc-options-nooverwrite").getAbsolutePath() + "/workspace"; 1184 IOUtils.deleteOnExit(IOUtils.getPath(workspace)); 1185 Map<String, Object> options = new HashMap<String, Object>(); 1186 basicWriteAndQueryWithOptions(workspace, options); 1187 1188 // Test with overwrite workspace set to false - should throw an exception - GenomicsDBImport.UnableToCreateGenomicsDBWorkspace 1189 options.replace(GenomicsDBImport.OVERWRITE_WORKSPACE_LONG_NAME, false); 1190 basicWriteAndQueryWithOptions(workspace, options); 1191 } 1192 1193 @Test testQueryWithComputationsExceeding32BitsDefault()1194 public void testQueryWithComputationsExceeding32BitsDefault() throws IOException { 1195 final String folder = createTempDir("computations_exceed_32bits").getAbsolutePath(); 1196 IOUtils.extractTarGz(Paths.get(TEST_INT64_SUPPORT_GENOMICSDB_BUNDLE), Paths.get(folder)); 1197 IOUtils.deleteOnExit(IOUtils.getPath(folder)); 1198 final String workspace = folder + "/bigint_genomicsdb_ws"; 1199 checkGenomicsDBAgainstExpected(workspace, new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("1"))), folder+"/expected_combined_bigint.vcf", 1200 folder+"/reference/chr1_10MB.fasta.gz", true, ATTRIBUTES_TO_IGNORE, false, false, true); 1201 } 1202 1203 // The following test should fail with a Throwable because of limitations in BCF2Codec - see https://github.com/broadinstitute/gatk/issues/6548 1204 @Test(expectedExceptions = Throwable.class) testQueryWithComputationsExceeding32BitsBCFCodec()1205 public void testQueryWithComputationsExceeding32BitsBCFCodec() throws IOException { 1206 final String folder = createTempDir("computations_exceed_32bits_bcf2codec").getAbsolutePath() + "/testQueryWithComputationsExceed32Bits"; 1207 IOUtils.extractTarGz(Paths.get(TEST_INT64_SUPPORT_GENOMICSDB_BUNDLE), Paths.get(folder)); 1208 IOUtils.deleteOnExit(IOUtils.getPath(folder)); 1209 final String workspace = folder + "/bigint_genomicsdb_ws"; 1210 checkGenomicsDBAgainstExpected(workspace, new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("1"))), folder+"/expected_combined_bigint.vcf", 1211 folder+"/reference/chr1_10MB.fasta.gz", true, ATTRIBUTES_TO_IGNORE, false, false, false); 1212 } 1213 1214 @Test(groups = {"bucket"}) testWriteToAndQueryFromGCS()1215 public void testWriteToAndQueryFromGCS() throws IOException { 1216 final String workspace = BucketUtils.randomRemotePath(getGCPTestStaging(), "", "") + "/"; 1217 IOUtils.deleteOnExit(IOUtils.getPath(workspace)); 1218 writeToGenomicsDB(LOCAL_GVCFS, INTERVAL, workspace, 0, false, 0, 1); 1219 checkJSONFilesAreWritten(workspace); 1220 checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE); 1221 } 1222 1223 @Test(groups = {"bucket"}, expectedExceptions = GenomicsDBImport.UnableToCreateGenomicsDBWorkspace.class) testWriteToExistingGCSDirectory()1224 public void testWriteToExistingGCSDirectory() throws IOException { 1225 final String workspace = BucketUtils.randomRemotePath(getGCPTestStaging(), "", "") + "/"; 1226 IOUtils.deleteOnExit(IOUtils.getPath(workspace)); 1227 int rc = GenomicsDBUtils.createTileDBWorkspace(workspace, false); 1228 Assert.assertEquals(rc, 0); 1229 writeToGenomicsDB(LOCAL_GVCFS, INTERVAL, workspace, 0, false, 0, 1); 1230 } 1231 } 1232