1 package org.broadinstitute.hellbender.utils.reference;
2 
3 import htsjdk.samtools.SAMSequenceDictionary;
4 import htsjdk.samtools.SAMSequenceRecord;
5 import org.broadinstitute.hellbender.utils.SimpleInterval;
6 
7 import java.util.List;
8 import java.util.function.IntFunction;
9 import java.util.function.ToIntFunction;
10 
11 /**
12  * Class to translate back and forth from absolute {@code long-typed} base positions to relative ones (the usual contig, position pairs).
13  * <p>
14  *     If we were to concatenate the sequence of every chromosome in the order these appear in reference dictionary,
15  *     each base in the reference with a absolute position or offset from the beginning of this imaginary super-contig.
16  * </p>
17  * <p>
18  *     Some times it might be handy to use this type of address/coordinate rather than the usual contig (name or index) and position
19  *     within the contig.
20  * </p>
21  * <p>
22  *     This class implement such a transformation from absolute coordinates efficiently.
23  * </p>
24  * <p>
25  *     Relative to absolute is rather a trivial matter if you keep track of the accumulative number of bases on the reference
26  *     right before the start of each contig.
27  * </p>
28  * <p> Absolute to relative is a bit trickier. A obvious solution would consist in a binary search amogst the contig for the one that
29  *     include the absolute base range that encloses the requested position. The cost of this lookup would be O(log C) where C is the number of
30  *     contigs in the reference. In some cases like hg38 there are over 3000+ such contigs due to the alt-contigs and decoy, resulting in around 11 iterations
31  *     to find the target's location contig</p>
32  * <p>
33  *     This class has a couple of accelerations:
34  *     <p>
35  *         First we check whether the target contig is the last returned, which should be quite often the case when accessing the refernce in
36  *         sequence.
37  *     </p>
38  *     <p>
39  *         Then we keep a "percentile" table that contains the contig index that would include that percentile absolute position. The granularity
40  *         of such table is linked to the number of contigs in the reference with a O(C) additional memory cost.
41  *     </p>
42  *     <p>
43  *         These two acceleration may effectively reduce the look-up cost to O(1) is most scenarios. The draw back is a more complicate code and the
44  *         need to deal with float-point arithmetic for the percentile look-up.
45  *     </p>
46  * </p>
47  */
48 public final class AbsoluteCoordinates {
49 
50     /**
51      * Length of each contig.
52      */
53     private final int[] lengths;
54 
55     /**
56      * Percentile look up table.
57      */
58     private final int[] percentiles;
59 
60     /**
61      * Contains the number of bases before the contig with the ith index (0-based).
62      */
63     private final long[] accumulative;
64 
65     /**
66      * Total length of the reference.
67      */
68     private final long total;
69 
70     /**
71      * Factor to multiply to an absolute position to get its corresponding percentile.
72      */
73     private final float percentileFactor;
74 
75     /**
76      * Function that resolves the contig index given its name.
77      */
78     private final ToIntFunction<String> contigToIndex;
79 
80     /**
81      * Function that resolves the contig name given its index.
82      */
83     private final IntFunction<String> indexToContig;
84     private int lastCtg;
85 
86 
AbsoluteCoordinates(final int[] lengths, final long[] accumulative, final ToIntFunction<String> contigToIndex, final IntFunction<String> indexToContig)87     private AbsoluteCoordinates(final int[] lengths, final long[] accumulative,
88                                 final ToIntFunction<String> contigToIndex, final IntFunction<String> indexToContig) {
89         this.lengths = lengths;
90         this.accumulative = accumulative;
91         this.contigToIndex = contigToIndex;
92         this.indexToContig = indexToContig;
93         this.total = accumulative[accumulative.length - 1];
94         this.percentiles = calculatePercentiles(lengths, accumulative, total);
95         this.percentileFactor = (percentiles.length - 1) / (float) this.total;
96         this.lastCtg = 0;
97     }
98 
calculatePercentiles(final int[] lengths, final long[] accumulative, final long total)99     private static int[] calculatePercentiles(final int[] lengths, final long[] accumulative, final long total) {
100         final int[] result = new int[(accumulative.length << 1) + 1];
101         final float fraction = total / (float)(result.length - 1);
102         double fractionAccumulator = 0;
103         for (int i = 0, j = 0; i < lengths.length; i++) {
104             final long accumulativePlusLength = accumulative[i + 1]; // == accumulative[i] + lengths[i];
105             while (fractionAccumulator < accumulativePlusLength && j < result.length - 1) {
106                 result[j++] = i;
107                 fractionAccumulator += fraction;
108             }
109         }
110         result[result.length - 1] = lengths.length - 1;
111         return result;
112     }
113 
of(final SAMSequenceDictionary dictionary)114     public static AbsoluteCoordinates of(final SAMSequenceDictionary dictionary) {
115         final ToIntFunction<String> contigToIndex = dictionary::getSequenceIndex;
116         final IntFunction<String> indexToContig = i -> dictionary.getSequence(i).getContig();
117         final List<SAMSequenceRecord> sequences = dictionary.getSequences();
118         final int numberOfSequences = sequences.size();
119         final int[] lengths = new int[numberOfSequences];
120         final long[] accumulative = new long[numberOfSequences + 1];
121         for (int i = 0; i < numberOfSequences; i++) {
122             final SAMSequenceRecord sequence = sequences.get(i);
123             lengths[i] = sequence.getSequenceLength();
124         }
125         long leftSum = lengths[0];
126         for (int i = 1; i < numberOfSequences; i++) {
127             accumulative[i] = leftSum;
128             leftSum += lengths[i];
129         }
130         accumulative[numberOfSequences] = leftSum;
131         return new AbsoluteCoordinates(lengths, accumulative, contigToIndex, indexToContig);
132     }
133 
toAbsolute(final String ctgName, final int position)134     public long toAbsolute(final String ctgName, final int position) {
135         return toAbsolute(contigToIndex.applyAsInt(ctgName), position);
136     }
137 
138     /**
139      * Obtains the absolute coordinate for the start position in an interval.
140      * @param simpleInterval the target position in relative coordinates
141      * @return 1 or greater.
142      * @throws IllegalArgumentException no such contig name or tht conting is too small.
143      */
toAbsolute(final SimpleInterval simpleInterval)144     public long toAbsolute(final SimpleInterval simpleInterval) {
145         return toAbsolute(simpleInterval.getContig(), simpleInterval.getStart());
146     }
147 
148 
toAbsolute(final int ctgIdx, final int position)149     public long toAbsolute(final int ctgIdx, final int position) {
150         if (lengths[ctgIdx] < position) {
151             throw new IllegalArgumentException("position outside containg contig");
152         }
153         lastCtg = ctgIdx;
154         return accumulative[ctgIdx] + position;
155     }
156 
157     public static class Relative {
158         public final String contig;
159         public final int contigIndex;
160         public final int position;
161 
Relative(final String contig, final int contigIndex, final int position)162         Relative(final String contig, final int contigIndex, final int position) {
163             this.contig = contig;
164             this.contigIndex = contigIndex;
165             this.position = position;
166         }
167     }
168 
169     @FunctionalInterface
170     interface RelativeFactory<E> {
create(final String ctgName, final int ctgIdx, final int position)171         E create(final String ctgName, final int ctgIdx, final int position);
172     }
173 
toSimpleInterval(final long absoluteStart, final int length)174     public SimpleInterval toSimpleInterval(final long absoluteStart, final int length) {
175         return toRelative(absoluteStart, (n, i, p) -> new SimpleInterval(n, p, p + length - 1));
176     }
177 
toRelative(final long absolute)178     public Relative toRelative(final long absolute) {
179         return toRelative(absolute, Relative::new);
180     }
181 
toRelative(final long absolute, final RelativeFactory<E> factory)182     public <E> E toRelative(final long absolute, final RelativeFactory<E> factory) {
183         if (absolute < 1) {
184             throw new IllegalArgumentException("absolute cannot be less than 1");
185         } else if (absolute > total) {
186             throw new IllegalArgumentException("absolute is too large");
187         } else if (accumulative[lastCtg] < absolute) {
188             if (absolute <= accumulative[lastCtg + 1]) {
189                return factory.create(indexToContig.apply(lastCtg), lastCtg, (int) (absolute - accumulative[lastCtg]));
190             } else {
191                 return searchRelative(absolute, factory);
192             }
193         } else {
194             return searchRelative(absolute, factory);
195         }
196     }
197 
searchRelative(final long target, final RelativeFactory<E> factory)198     private <E> E searchRelative(final long target, final RelativeFactory<E> factory) {
199         final int percentileIndex = (int) (target * percentileFactor);
200         if (percentileIndex >= percentiles.length) {
201              throw new IllegalArgumentException("xx " + target + " " + total );
202         }
203         final int percentileCtg = percentiles[percentileIndex];
204         if (percentileIndex < percentiles.length - 1) {
205             return searchRelative(target, percentileCtg, percentiles[percentileIndex + 1], factory);
206         } else {
207             return searchRelative(target, percentileCtg, lengths.length - 1, factory);
208         }
209     }
210 
searchRelative(final long target, final int minCtgIdx, final int maxCtgIdx, final RelativeFactory<E> factory)211     private <E> E searchRelative(final long target, final int minCtgIdx, final int maxCtgIdx, final RelativeFactory<E> factory) {
212         int i = minCtgIdx, j = maxCtgIdx;
213         while (i < j) {
214             final int mid = (i + j) >> 1;
215             if (accumulative[mid] >= target) {
216                 j = mid - 1;
217             } else if (accumulative[mid + 1] < target) {
218                 i = mid + 1;
219             } else {
220                 i = mid;
221                 break;
222             }
223         }
224         lastCtg = i;
225         return factory.create(indexToContig.apply(i), i, (int) (target - accumulative[i]));
226     }
227 }
228