package org.broadinstitute.hellbender.engine;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequence;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.iterators.ByteArrayIterator;
import org.broadinstitute.hellbender.utils.reference.ReferenceBases;

import java.nio.file.Path;
import java.util.Iterator;

/**
 * A source of reference data that can be traversed and queried.
 *
 * Queries may target a specific interval of the reference or cover the entire reference.
 */
public interface ReferenceDataSource extends GATKDataSource<Byte>, AutoCloseable {

    /**
     * Create a data source backed by a fasta file.
     *
     * The provided fasta file must have companion .fai and .dict files.
     *
     * @param fastaPath reference fasta Path
     * @return a {@link ReferenceFileSource} for the given path
     */
    static ReferenceDataSource of(final Path fastaPath) {
        return new ReferenceFileSource(fastaPath);
    }

    /**
     * Create a data source backed by a fasta file, optionally keeping the bases exactly as they
     * appear in the file.
     *
     * The provided fasta file must have companion .fai and .dict files.
     *
     * If {@code preserveAmbiguityCodesAndCapitalization} is {@code true}, will NOT convert IUPAC bases
     * in the file to `N` and will NOT capitalize lower-case bases.
     *
     * NOTE: Most GATK tools do not support data created by setting
     * {@code preserveAmbiguityCodesAndCapitalization} to {@code true}.
     *
     * @param fastaPath reference fasta Path
     * @param preserveAmbiguityCodesAndCapitalization whether to preserve the original bases in the given
     *                                                reference file path
     * @return a {@link ReferenceFileSource} for the given path
     */
    static ReferenceDataSource of(final Path fastaPath, final boolean preserveAmbiguityCodesAndCapitalization) {
        return new ReferenceFileSource(fastaPath, preserveAmbiguityCodesAndCapitalization);
    }

    /**
     * Create a data source from ReferenceBases and the corresponding sequence dictionary.
     *
     * @param bases reference bases backing the new data source
     * @param referenceSequenceDictionary sequence dictionary corresponding to {@code bases}
     * @return a {@link ReferenceMemorySource} wrapping the given bases and dictionary
     */
    static ReferenceDataSource of(final ReferenceBases bases, final SAMSequenceDictionary referenceSequenceDictionary) {
        return new ReferenceMemorySource(bases, referenceSequenceDictionary);
    }

    /**
     * Fetch, in a single call, all bases spanning the given interval of this reference.
     * Call getBases() on the returned ReferenceSequence to get the actual reference bases. See the
     * BaseUtils class for guidance on how to work with bases in this format.
     *
     * The default implementation delegates to {@link #queryAndPrefetch(String, long, long)} using the
     * interval's contig, start and end.
     *
     * @param interval query interval
     * @return a ReferenceSequence containing all bases spanning the query interval, prefetched
     */
    default ReferenceSequence queryAndPrefetch(final SimpleInterval interval) {
        final String contig = interval.getContig();
        final long start = interval.getStart();
        final long stop = interval.getEnd();
        return queryAndPrefetch(contig, start, stop);
    }

    /**
     * Fetch, in a single call, all bases spanning the given interval of this reference.
     * Call getBases() on the returned ReferenceSequence to get the actual reference bases. See the
     * BaseUtils class for guidance on how to work with bases in this format.
     *
     * @param contig query interval contig
     * @param start query interval start
     * @param stop query interval stop
     * @return a ReferenceSequence containing all bases spanning the query interval, prefetched
     */
    ReferenceSequence queryAndPrefetch(final String contig, final long start, final long stop);

    /**
     * Obtain an iterator over the bases spanning the given interval of this reference.
     *
     * See the BaseUtils class for guidance on how to work with bases in this format.
     *
     * The default implementation prefetches the interval via {@link #queryAndPrefetch(SimpleInterval)}
     * and iterates over the resulting base array.
     *
     * @param interval query interval
     * @return iterator over the bases spanning the query interval
     */
    @Override
    default Iterator<Byte> query(final SimpleInterval interval) {
        // TODO: need a way to iterate lazily over reference bases without necessarily loading them all into memory at once
        final ReferenceSequence prefetched = queryAndPrefetch(interval);
        return new ByteArrayIterator(prefetched.getBases());
    }

    /**
     * Get the sequence dictionary for this reference.
     *
     * @return SAMSequenceDictionary for this reference
     */
    SAMSequenceDictionary getSequenceDictionary();

    /**
     * Permanently close this data source. The default implementation does nothing.
     */
    @Override
    default void close() {
        // intentionally a no-op
    }
}