1 package org.broadinstitute.hellbender; 2 3 import htsjdk.samtools.SAMFileHeader; 4 import org.broadinstitute.hellbender.testutils.BaseTest; 5 import org.broadinstitute.hellbender.utils.GenomeLoc; 6 import org.broadinstitute.hellbender.utils.GenomeLocParser; 7 import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile; 8 import org.broadinstitute.hellbender.utils.io.IOUtils; 9 import org.testng.annotations.AfterClass; 10 import org.testng.annotations.BeforeClass; 11 12 import java.io.File; 13 import java.util.ArrayList; 14 import java.util.Arrays; 15 import java.util.Collections; 16 import java.util.List; 17 18 /** 19 * This is the base test class for all of our test cases. All test cases should extend from this 20 * class; it sets up the logger, and resolves the location of directories that we rely on. 21 */ 22 public abstract class GATKBaseTest extends BaseTest { 23 24 private static final String CURRENT_DIRECTORY = System.getProperty("user.dir"); 25 public static final String gatkDirectory = System.getProperty("gatkdir", CURRENT_DIRECTORY) + "/"; 26 27 public static final String publicMainResourcesDir = new File(gatkDirectory, "src/main/resources").getAbsolutePath() + "/"; 28 public static final String packageMainResourcesDir = publicMainResourcesDir + "org/broadinstitute/hellbender/"; 29 30 private static final String publicTestDirRelative = "src/test/resources/"; 31 public static final String publicTestDir = new File(gatkDirectory, publicTestDirRelative).getAbsolutePath() + "/"; 32 public static final String publicTestDirRoot = publicTestDir.replace(publicTestDirRelative, ""); 33 34 public static final String packageRootTestDir = publicTestDir + "org/broadinstitute/hellbender/"; 35 public static final String toolsTestDir = packageRootTestDir + "tools/"; 36 public static final String exampleTestDir = toolsTestDir + "examples/"; 37 38 public static final String GCS_GATK_TEST_RESOURCES = "gs://hellbender/test/resources/"; 39 40 public static final String GCS_b37_REFERENCE_2BIT = GCS_GATK_TEST_RESOURCES + "benchmark/human_g1k_v37.2bit"; 41 public static final String GCS_b37_CHR20_21_REFERENCE_2BIT = GCS_GATK_TEST_RESOURCES + "large/human_g1k_v37.20.21.2bit"; 42 public static final String GCS_b37_CHR20_21_REFERENCE = GCS_GATK_TEST_RESOURCES + "large/human_g1k_v37.20.21.fasta"; 43 44 // environment variable set by the GATK Docker build file 45 private static final String GATK_DOCKER_CONTAINER = "GATK_DOCKER_CONTAINER"; 46 47 /** 48 * LARGE FILES FOR TESTING (MANAGED BY GIT LFS) 49 */ 50 public static final String largeFileTestDir = new File(publicTestDir, "large").getAbsolutePath() + "/"; 51 52 // The complete B37 human reference, including the Epstein-Barr contig, in fasta.gz format. 53 // Source: /seq/references/Homo_sapiens_assembly19/v1/ in the Broad Institute filesystem. 54 public static final String b37Reference = largeFileTestDir + "Homo_sapiens_assembly19.fasta.gz"; 55 56 // The complete HG38 human reference, in fasta.gz format. 57 // Source: /seq/references/Homo_sapiens_assembly38/v0/ in the Broad Institute filesystem. 58 public static final String hg38Reference = largeFileTestDir + "Homo_sapiens_assembly38.fasta.gz"; 59 60 // All of chromosomes 20 and 21 from the b37 reference 61 public static final String b37_reference_20_21 = largeFileTestDir + "human_g1k_v37.20.21.fasta"; 62 63 public static final String b37_reference_20_21_gz = largeFileTestDir + "human_g1k_v37.20.21.fasta.gz"; 64 65 public static final String b37_2bit_reference_20_21 = largeFileTestDir + "human_g1k_v37.20.21.2bit"; 66 67 public static final String b37_reference_20_21_img = largeFileTestDir + "human_g1k_v37.20.21.fasta.img"; 68 69 // All of chromosomes 20 and 21 from the b38 reference 70 public static final String b38_reference_20_21 = largeFileTestDir + "Homo_sapiens_assembly38.20.21.fasta"; 71 72 // ~600,000 reads from chromosomes 20 and 21 of an NA12878 WGS bam aligned to b37, plus ~50,000 unmapped reads 73 public static final String NA12878_20_21_WGS_bam = largeFileTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam"; 74 public static final String NA12878_20_21_WGS_cram = largeFileTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.cram"; 75 public static final String NA12878_20_21_covered_regions = publicTestDir + "wgs_calling_regions.v1.chr20_chr21.interval_list"; 76 77 // ~10,000 reads from chromosome 20 of NA12878 RNA-seq, aligned to b37 with STAR 78 public static final String NA12878_20_RNAseq_bam = largeFileTestDir + "NA12878.RNAseq.with.mate.info.bam"; 79 public static final String b37_20_gff3 = largeFileTestDir + "Homo_sapiens.GRCh37.20.gff3"; 80 81 // ~20,000 reads from E-Coli RNA-seq, aligned with bwa-aln 82 public static final String E_COLI_RNAseq_bam = largeFileTestDir + "E_Coli_small_rna.bam"; 83 public static final String E_COLI_gff3 = largeFileTestDir + "E_Coli.gff3"; 84 85 // Variants from a DBSNP 138 VCF overlapping the reads in NA12878_20_21_WGS_bam 86 public static final String dbsnp_138_b37_20_21_vcf = largeFileTestDir + "dbsnp_138.b37.20.21.vcf"; 87 88 // Variants from a DBSNP 138 VCF form the first 65Mb of chr1 89 public static final String dbsnp_138_b37_1_65M_vcf = largeFileTestDir + "dbsnp_138.b37.1.1-65M.vcf"; 90 91 public static final String WGS_B37_CH20_1M_1M1K_BAM = "CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.bam"; 92 public static final String DBSNP_138_B37_CH20_1M_1M1K_VCF = "dbsnp_138.b37.excluding_sites_after_129.ch20.1m-1m1k.vcf"; 93 94 /** 95 * END OF LARGE FILES FOR TESTING 96 */ 97 98 public static final String NA12878_chr17_1k_BAM = publicTestDir + "NA12878.chr17_69k_70k.dictFix.bam"; 99 public static final String NA12878_chr17_1k_CRAM = publicTestDir + "NA12878.chr17_69k_70k.dictFix.cram"; 100 public static final String v37_chr17_1Mb_Reference = publicTestDir + "human_g1k_v37.chr17_1Mb.fasta"; 101 102 public static final String hg19_chr1_1M_Reference = publicTestDir + "Homo_sapiens_assembly19_chr1_1M.fasta"; 103 public static final String hg19_chr1_1M_dict = publicTestDir + "Homo_sapiens_assembly19_chr1_1M.dict"; 104 public static final String hg19_chr1_1M_dbSNP = publicTestDir + "Homo_sapiens_assembly19.dbsnp135.chr1_1M.exome_intervals.vcf"; 105 106 // the following file has been modified such that the first chromosome length is 1M; this is sometimes 107 // required due to sequence dictionary validation, since a reference FASTA with only 1M bases is used 108 public static final String hg19_chr1_1M_dbSNP_modified = publicTestDir + "HSA19.dbsnp135.chr1_1M.exome_intervals.modified.vcf"; 109 110 public static final String hg19_chr1_1M_exampleVCF = publicTestDir + "joint_calling.chr1_1M.1kg_samples.10samples.noINFO.vcf"; 111 112 //contains chromosomes 1,2,3, and 4 using b37/GRCh37 contig names (i.e. no "chr") 113 public static final String hg19MiniReference = publicTestDir + "hg19mini.fasta"; 114 // Micro reference is the same as hg19mini, but contains only chromosomes 1 and 2 115 public static final String hg19MicroReference = publicTestDir + "hg19micro.fasta"; 116 117 public static final String FULL_HG19_DICT = publicTestDir + "Homo_sapiens_assembly19.dict"; 118 public static final String FULL_HG38_DICT = publicTestDir + "large/Homo_sapiens_assembly38.dict"; 119 120 public static final String exampleFASTA = publicTestDir + "exampleFASTA.fasta"; 121 public static final String exampleReference = hg19MiniReference; 122 public static final String hg19MiniIntervalFile = publicTestDir + "hg19mini.interval_list"; 123 public static final String wgsIntervalFile = publicTestDir + "wgs_calling_regions.v1.interval_list"; 124 125 public static final String DREAM_BAMS_DIR = publicTestDir + "large/mutect/dream_synthetic_bams"; 126 public static final String DREAM_VCFS_DIR = publicTestDir + "org/broadinstitute/hellbender/tools/mutect/dream/vcfs"; 127 128 public static final String thousandGenomes = largeFileTestDir + "1000G.phase3.broad.withGenotypes.chr20.10100000.vcf"; 129 130 public CachingIndexedFastaSequenceFile hg19ReferenceReader; 131 public GenomeLocParser hg19GenomeLocParser; 132 133 // used to seed the genome loc parser with a sequence dictionary 134 protected SAMFileHeader hg19Header; 135 136 @BeforeClass initializeHG19Reference()137 public void initializeHG19Reference() { 138 hg19ReferenceReader = new CachingIndexedFastaSequenceFile(IOUtils.getPath(hg19MiniReference)); 139 hg19Header = new SAMFileHeader(); 140 hg19Header.setSequenceDictionary(hg19ReferenceReader.getSequenceDictionary()); 141 hg19GenomeLocParser = new GenomeLocParser(hg19ReferenceReader); 142 } 143 144 @AfterClass closeHg19Reference()145 public void closeHg19Reference(){ 146 hg19ReferenceReader.close(); 147 } 148 intervalStringsToGenomeLocs( String... intervals)149 protected List<GenomeLoc> intervalStringsToGenomeLocs( String... intervals) { 150 return intervalStringsToGenomeLocs(Arrays.asList(intervals)); 151 } 152 intervalStringsToGenomeLocs( List<String> intervals )153 protected List<GenomeLoc> intervalStringsToGenomeLocs( List<String> intervals ) { 154 List<GenomeLoc> locs = new ArrayList<>(); 155 for (String interval: intervals) 156 locs.add(hg19GenomeLocParser.parseGenomeLoc(interval)); 157 return Collections.unmodifiableList(locs); 158 } 159 160 /** 161 * @return true if we're running on the GATK Docker (which also implies we're running within the GATK conda environment) 162 */ isGATKDockerContainer()163 protected boolean isGATKDockerContainer() { 164 final String gatkDockerContainer = System.getenv(GATK_DOCKER_CONTAINER); 165 return gatkDockerContainer != null && gatkDockerContainer.equalsIgnoreCase("true"); 166 } 167 168 } 169 170