1 package org.broadinstitute.hellbender;
2 
3 import htsjdk.samtools.SAMFileHeader;
4 import org.broadinstitute.hellbender.testutils.BaseTest;
5 import org.broadinstitute.hellbender.utils.GenomeLoc;
6 import org.broadinstitute.hellbender.utils.GenomeLocParser;
7 import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
8 import org.broadinstitute.hellbender.utils.io.IOUtils;
9 import org.testng.annotations.AfterClass;
10 import org.testng.annotations.BeforeClass;
11 
12 import java.io.File;
13 import java.util.ArrayList;
14 import java.util.Arrays;
15 import java.util.Collections;
16 import java.util.List;
17 
18 /**
19  * This is the base test class for all of our test cases.  All test cases should extend from this
20  * class; it sets up the logger, and resolves the location of directories that we rely on.
21  */
22 public abstract class GATKBaseTest extends BaseTest {
23 
24     private static final String CURRENT_DIRECTORY = System.getProperty("user.dir");
25     public static final String gatkDirectory = System.getProperty("gatkdir", CURRENT_DIRECTORY) + "/";
26 
27     public static final String publicMainResourcesDir = new File(gatkDirectory, "src/main/resources").getAbsolutePath() + "/";
28     public static final String packageMainResourcesDir = publicMainResourcesDir + "org/broadinstitute/hellbender/";
29 
30     private static final String publicTestDirRelative = "src/test/resources/";
31     public static final String publicTestDir = new File(gatkDirectory, publicTestDirRelative).getAbsolutePath() + "/";
32     public static final String publicTestDirRoot = publicTestDir.replace(publicTestDirRelative, "");
33 
34     public static final String packageRootTestDir = publicTestDir + "org/broadinstitute/hellbender/";
35     public static final String toolsTestDir = packageRootTestDir + "tools/";
36     public static final String exampleTestDir = toolsTestDir + "examples/";
37 
38     public static final String GCS_GATK_TEST_RESOURCES = "gs://hellbender/test/resources/";
39 
40     public static final String GCS_b37_REFERENCE_2BIT = GCS_GATK_TEST_RESOURCES + "benchmark/human_g1k_v37.2bit";
41     public static final String GCS_b37_CHR20_21_REFERENCE_2BIT = GCS_GATK_TEST_RESOURCES + "large/human_g1k_v37.20.21.2bit";
42     public static final String GCS_b37_CHR20_21_REFERENCE = GCS_GATK_TEST_RESOURCES + "large/human_g1k_v37.20.21.fasta";
43 
44     // environment variable set by the GATK Docker build file
45     private static final String GATK_DOCKER_CONTAINER = "GATK_DOCKER_CONTAINER";
46 
47     /**
48      * LARGE FILES FOR TESTING (MANAGED BY GIT LFS)
49      */
50     public static final String largeFileTestDir = new File(publicTestDir, "large").getAbsolutePath() + "/";
51 
52     // The complete B37 human reference, including the Epstein-Barr contig, in fasta.gz format.
53     // Source: /seq/references/Homo_sapiens_assembly19/v1/ in the Broad Institute filesystem.
54     public static final String b37Reference = largeFileTestDir + "Homo_sapiens_assembly19.fasta.gz";
55 
56     // The complete HG38 human reference, in fasta.gz format.
57     // Source: /seq/references/Homo_sapiens_assembly38/v0/ in the Broad Institute filesystem.
58     public static final String hg38Reference = largeFileTestDir + "Homo_sapiens_assembly38.fasta.gz";
59 
60     // All of chromosomes 20 and 21 from the b37 reference
61     public static final String b37_reference_20_21 = largeFileTestDir + "human_g1k_v37.20.21.fasta";
62 
63     public static final String b37_reference_20_21_gz = largeFileTestDir + "human_g1k_v37.20.21.fasta.gz";
64 
65     public static final String b37_2bit_reference_20_21 = largeFileTestDir + "human_g1k_v37.20.21.2bit";
66 
67     public static final String b37_reference_20_21_img = largeFileTestDir + "human_g1k_v37.20.21.fasta.img";
68 
69     // All of chromosomes 20 and 21 from the b38 reference
70     public static final String b38_reference_20_21 = largeFileTestDir + "Homo_sapiens_assembly38.20.21.fasta";
71 
72     // ~600,000 reads from chromosomes 20 and 21 of an NA12878 WGS bam aligned to b37, plus ~50,000 unmapped reads
73     public static final String NA12878_20_21_WGS_bam = largeFileTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam";
74     public static final String NA12878_20_21_WGS_cram = largeFileTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.cram";
75     public static final String NA12878_20_21_covered_regions = publicTestDir + "wgs_calling_regions.v1.chr20_chr21.interval_list";
76 
77     // ~10,000 reads from chromosome 20 of NA12878 RNA-seq, aligned to b37 with STAR
78     public static final String NA12878_20_RNAseq_bam = largeFileTestDir + "NA12878.RNAseq.with.mate.info.bam";
79     public static final String b37_20_gff3 = largeFileTestDir + "Homo_sapiens.GRCh37.20.gff3";
80 
81     // ~20,000 reads from E-Coli RNA-seq, aligned with bwa-aln
82     public static final String E_COLI_RNAseq_bam = largeFileTestDir + "E_Coli_small_rna.bam";
83     public static final String E_COLI_gff3 = largeFileTestDir + "E_Coli.gff3";
84 
85     // Variants from a DBSNP 138 VCF overlapping the reads in NA12878_20_21_WGS_bam
86     public static final String dbsnp_138_b37_20_21_vcf = largeFileTestDir + "dbsnp_138.b37.20.21.vcf";
87 
88     // Variants from a DBSNP 138 VCF form the first 65Mb of chr1
89     public static final String dbsnp_138_b37_1_65M_vcf = largeFileTestDir + "dbsnp_138.b37.1.1-65M.vcf";
90 
91     public static final String WGS_B37_CH20_1M_1M1K_BAM = "CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.bam";
92     public static final String DBSNP_138_B37_CH20_1M_1M1K_VCF = "dbsnp_138.b37.excluding_sites_after_129.ch20.1m-1m1k.vcf";
93 
94     /**
95      * END OF LARGE FILES FOR TESTING
96      */
97 
98     public static final String NA12878_chr17_1k_BAM = publicTestDir + "NA12878.chr17_69k_70k.dictFix.bam";
99     public static final String NA12878_chr17_1k_CRAM = publicTestDir + "NA12878.chr17_69k_70k.dictFix.cram";
100     public static final String v37_chr17_1Mb_Reference = publicTestDir + "human_g1k_v37.chr17_1Mb.fasta";
101 
102     public static final String hg19_chr1_1M_Reference = publicTestDir + "Homo_sapiens_assembly19_chr1_1M.fasta";
103     public static final String hg19_chr1_1M_dict = publicTestDir + "Homo_sapiens_assembly19_chr1_1M.dict";
104     public static final String hg19_chr1_1M_dbSNP = publicTestDir + "Homo_sapiens_assembly19.dbsnp135.chr1_1M.exome_intervals.vcf";
105 
106     // the following file has been modified such that the first chromosome length is 1M; this is sometimes
107     // required due to sequence dictionary validation, since a reference FASTA with only 1M bases is used
108     public static final String hg19_chr1_1M_dbSNP_modified = publicTestDir + "HSA19.dbsnp135.chr1_1M.exome_intervals.modified.vcf";
109 
110     public static final String hg19_chr1_1M_exampleVCF = publicTestDir + "joint_calling.chr1_1M.1kg_samples.10samples.noINFO.vcf";
111 
112     //contains chromosomes 1,2,3, and 4 using b37/GRCh37 contig names (i.e. no "chr")
113     public static final String hg19MiniReference = publicTestDir + "hg19mini.fasta";
114     // Micro reference is the same as hg19mini, but contains only chromosomes 1 and 2
115     public static final String hg19MicroReference = publicTestDir + "hg19micro.fasta";
116 
117     public static final String FULL_HG19_DICT = publicTestDir + "Homo_sapiens_assembly19.dict";
118     public static final String FULL_HG38_DICT = publicTestDir + "large/Homo_sapiens_assembly38.dict";
119 
120     public static final String exampleFASTA = publicTestDir + "exampleFASTA.fasta";
121     public static final String exampleReference = hg19MiniReference;
122     public static final String hg19MiniIntervalFile = publicTestDir + "hg19mini.interval_list";
123     public static final String wgsIntervalFile = publicTestDir + "wgs_calling_regions.v1.interval_list";
124 
125     public static final String DREAM_BAMS_DIR = publicTestDir + "large/mutect/dream_synthetic_bams";
126     public static final String DREAM_VCFS_DIR = publicTestDir + "org/broadinstitute/hellbender/tools/mutect/dream/vcfs";
127 
128     public static final String thousandGenomes = largeFileTestDir + "1000G.phase3.broad.withGenotypes.chr20.10100000.vcf";
129 
130     public CachingIndexedFastaSequenceFile hg19ReferenceReader;
131     public GenomeLocParser hg19GenomeLocParser;
132 
133     // used to seed the genome loc parser with a sequence dictionary
134     protected SAMFileHeader hg19Header;
135 
136     @BeforeClass
initializeHG19Reference()137     public void initializeHG19Reference() {
138         hg19ReferenceReader = new CachingIndexedFastaSequenceFile(IOUtils.getPath(hg19MiniReference));
139         hg19Header = new SAMFileHeader();
140         hg19Header.setSequenceDictionary(hg19ReferenceReader.getSequenceDictionary());
141         hg19GenomeLocParser = new GenomeLocParser(hg19ReferenceReader);
142     }
143 
144     @AfterClass
closeHg19Reference()145     public void closeHg19Reference(){
146         hg19ReferenceReader.close();
147     }
148 
intervalStringsToGenomeLocs( String... intervals)149     protected List<GenomeLoc> intervalStringsToGenomeLocs( String... intervals) {
150         return intervalStringsToGenomeLocs(Arrays.asList(intervals));
151     }
152 
intervalStringsToGenomeLocs( List<String> intervals )153     protected List<GenomeLoc> intervalStringsToGenomeLocs( List<String> intervals ) {
154         List<GenomeLoc> locs = new ArrayList<>();
155         for (String interval: intervals)
156             locs.add(hg19GenomeLocParser.parseGenomeLoc(interval));
157         return Collections.unmodifiableList(locs);
158     }
159 
160     /**
161      * @return true if we're running on the GATK Docker (which also implies we're running within the GATK conda environment)
162      */
isGATKDockerContainer()163     protected boolean isGATKDockerContainer() {
164         final String gatkDockerContainer = System.getenv(GATK_DOCKER_CONTAINER);
165         return gatkDockerContainer != null && gatkDockerContainer.equalsIgnoreCase("true");
166     }
167 
168 }
169 
170