1 package org.broadinstitute.hellbender.engine;
2 
3 import htsjdk.samtools.SAMSequenceDictionary;
4 import htsjdk.samtools.reference.ReferenceSequence;
5 import org.broadinstitute.hellbender.utils.SimpleInterval;
6 import org.broadinstitute.hellbender.utils.iterators.ByteArrayIterator;
7 import org.broadinstitute.hellbender.utils.reference.ReferenceBases;
8 
9 import java.nio.file.Path;
10 import java.util.Iterator;
11 
12 /**
13  * Manages traversals and queries over reference data.
14  *
15  * Supports targeted queries over the reference by interval and over the entire reference.
16  */
17 public interface ReferenceDataSource extends GATKDataSource<Byte>, AutoCloseable {
18 
19     /**
20      * Initialize this data source using a fasta file.
21      *
22      * The provided fasta file must have companion .fai and .dict files.
23      *
24      * @param fastaPath reference fasta Path
25      */
of(final Path fastaPath)26     public static ReferenceDataSource of(final Path fastaPath) {
27         return new ReferenceFileSource(fastaPath);
28     }
29 
30     /**
31      * Initialize this data source using a fasta file.
32      *
33      * The provided fasta file must have companion .fai and .dict files.
34      *
35      * If {@code preserveFileBases} is {@code true}, will NOT convert IUPAC bases in the file to `N` and will NOT capitalize lower-case bases.
36      *
37      * NOTE: Most GATK tools do not support data created by setting {@code preserveFileBases} to {@code true}.
38      *
39      * @param fastaPath reference fasta Path
40      * @param preserveAmbiguityCodesAndCapitalization Whether to preserve the original bases in the given reference file path.
41      */
of(final Path fastaPath, final boolean preserveAmbiguityCodesAndCapitalization)42     public static ReferenceDataSource of(final Path fastaPath, final boolean preserveAmbiguityCodesAndCapitalization) {
43         return new ReferenceFileSource(fastaPath, preserveAmbiguityCodesAndCapitalization);
44     }
45 
46     /**
47      * Initialize this data source using ReferenceBases and corresponding sequence dictionary.
48      */
of(final ReferenceBases bases, final SAMSequenceDictionary referenceSequenceDictionary)49     public static ReferenceDataSource of(final ReferenceBases bases, final SAMSequenceDictionary referenceSequenceDictionary) {
50         return new ReferenceMemorySource(bases, referenceSequenceDictionary);
51     }
52 
53     /**
54      * Query a specific interval on this reference, and get back all bases spanning that interval at once.
55      * Call getBases() on the returned ReferenceSequence to get the actual reference bases. See the BaseUtils
56      * class for guidance on how to work with bases in this format.
57      *
58      * The default implementation calls #queryAndPrefetch(contig, start, stop).
59      *
60      * @param interval query interval
61      * @return a ReferenceSequence containing all bases spanning the query interval, prefetched
62      */
queryAndPrefetch( final SimpleInterval interval )63     default public ReferenceSequence queryAndPrefetch( final SimpleInterval interval ) {
64         return queryAndPrefetch(interval.getContig(), interval.getStart(), interval.getEnd());
65     }
66 
67     /**
68      * Query a specific interval on this reference, and get back all bases spanning that interval at once.
69      * Call getBases() on the returned ReferenceSequence to get the actual reference bases. See the BaseUtils
70      * class for guidance on how to work with bases in this format.
71      *
72      * @param contig query interval contig
73      * @param start query interval start
74      * @param stop query interval stop
75      * @return a ReferenceSequence containing all bases spanning the query interval, prefetched
76      */
queryAndPrefetch(final String contig, final long start , final long stop)77     public ReferenceSequence queryAndPrefetch(final String contig, final long start , final long stop);
78 
79     /**
80       * Query a specific interval on this reference, and get back an iterator over the bases spanning that interval.
81       *
82       * See the BaseUtils class for guidance on how to work with bases in this format.
83       *
84       * @param interval query interval
85       * @return iterator over the bases spanning the query interval
86       */
87     @Override
query(final SimpleInterval interval)88     default public Iterator<Byte> query(final SimpleInterval interval) {
89         // TODO: need a way to iterate lazily over reference bases without necessarily loading them all into memory at once
90         return new ByteArrayIterator(queryAndPrefetch(interval).getBases());
91     }
92 
93     /**
94      * Get the sequence dictionary for this reference
95      *
96      * @return SAMSequenceDictionary for this reference
97      */
getSequenceDictionary()98     public SAMSequenceDictionary getSequenceDictionary();
99 
100     /**
101      * Permanently close this data source. The default implementation does nothing.
102      */
103     @Override
close()104     default public void close(){
105         //do nothing
106     }
107 }
108