1 package org.broadinstitute.hellbender.utils.reference; 2 3 import htsjdk.samtools.SAMSequenceDictionary; 4 import htsjdk.samtools.SAMSequenceRecord; 5 import org.broadinstitute.hellbender.utils.SimpleInterval; 6 7 import java.util.List; 8 import java.util.function.IntFunction; 9 import java.util.function.ToIntFunction; 10 11 /** 12 * Class to translate back and forth from absolute {@code long-typed} base positions to relative ones (the usual contig, position pairs). 13 * <p> 14 * If we were to concatenate the sequence of every chromosome in the order these appear in reference dictionary, 15 * each base in the reference with a absolute position or offset from the beginning of this imaginary super-contig. 16 * </p> 17 * <p> 18 * Some times it might be handy to use this type of address/coordinate rather than the usual contig (name or index) and position 19 * within the contig. 20 * </p> 21 * <p> 22 * This class implement such a transformation from absolute coordinates efficiently. 23 * </p> 24 * <p> 25 * Relative to absolute is rather a trivial matter if you keep track of the accumulative number of bases on the reference 26 * right before the start of each contig. 27 * </p> 28 * <p> Absolute to relative is a bit trickier. A obvious solution would consist in a binary search amogst the contig for the one that 29 * include the absolute base range that encloses the requested position. The cost of this lookup would be O(log C) where C is the number of 30 * contigs in the reference. In some cases like hg38 there are over 3000+ such contigs due to the alt-contigs and decoy, resulting in around 11 iterations 31 * to find the target's location contig</p> 32 * <p> 33 * This class has a couple of accelerations: 34 * <p> 35 * First we check whether the target contig is the last returned, which should be quite often the case when accessing the refernce in 36 * sequence. 37 * </p> 38 * <p> 39 * Then we keep a "percentile" table that contains the contig index that would include that percentile absolute position. The granularity 40 * of such table is linked to the number of contigs in the reference with a O(C) additional memory cost. 41 * </p> 42 * <p> 43 * These two acceleration may effectively reduce the look-up cost to O(1) is most scenarios. The draw back is a more complicate code and the 44 * need to deal with float-point arithmetic for the percentile look-up. 45 * </p> 46 * </p> 47 */ 48 public final class AbsoluteCoordinates { 49 50 /** 51 * Length of each contig. 52 */ 53 private final int[] lengths; 54 55 /** 56 * Percentile look up table. 57 */ 58 private final int[] percentiles; 59 60 /** 61 * Contains the number of bases before the contig with the ith index (0-based). 62 */ 63 private final long[] accumulative; 64 65 /** 66 * Total length of the reference. 67 */ 68 private final long total; 69 70 /** 71 * Factor to multiply to an absolute position to get its corresponding percentile. 72 */ 73 private final float percentileFactor; 74 75 /** 76 * Function that resolves the contig index given its name. 77 */ 78 private final ToIntFunction<String> contigToIndex; 79 80 /** 81 * Function that resolves the contig name given its index. 82 */ 83 private final IntFunction<String> indexToContig; 84 private int lastCtg; 85 86 AbsoluteCoordinates(final int[] lengths, final long[] accumulative, final ToIntFunction<String> contigToIndex, final IntFunction<String> indexToContig)87 private AbsoluteCoordinates(final int[] lengths, final long[] accumulative, 88 final ToIntFunction<String> contigToIndex, final IntFunction<String> indexToContig) { 89 this.lengths = lengths; 90 this.accumulative = accumulative; 91 this.contigToIndex = contigToIndex; 92 this.indexToContig = indexToContig; 93 this.total = accumulative[accumulative.length - 1]; 94 this.percentiles = calculatePercentiles(lengths, accumulative, total); 95 this.percentileFactor = (percentiles.length - 1) / (float) this.total; 96 this.lastCtg = 0; 97 } 98 calculatePercentiles(final int[] lengths, final long[] accumulative, final long total)99 private static int[] calculatePercentiles(final int[] lengths, final long[] accumulative, final long total) { 100 final int[] result = new int[(accumulative.length << 1) + 1]; 101 final float fraction = total / (float)(result.length - 1); 102 double fractionAccumulator = 0; 103 for (int i = 0, j = 0; i < lengths.length; i++) { 104 final long accumulativePlusLength = accumulative[i + 1]; // == accumulative[i] + lengths[i]; 105 while (fractionAccumulator < accumulativePlusLength && j < result.length - 1) { 106 result[j++] = i; 107 fractionAccumulator += fraction; 108 } 109 } 110 result[result.length - 1] = lengths.length - 1; 111 return result; 112 } 113 of(final SAMSequenceDictionary dictionary)114 public static AbsoluteCoordinates of(final SAMSequenceDictionary dictionary) { 115 final ToIntFunction<String> contigToIndex = dictionary::getSequenceIndex; 116 final IntFunction<String> indexToContig = i -> dictionary.getSequence(i).getContig(); 117 final List<SAMSequenceRecord> sequences = dictionary.getSequences(); 118 final int numberOfSequences = sequences.size(); 119 final int[] lengths = new int[numberOfSequences]; 120 final long[] accumulative = new long[numberOfSequences + 1]; 121 for (int i = 0; i < numberOfSequences; i++) { 122 final SAMSequenceRecord sequence = sequences.get(i); 123 lengths[i] = sequence.getSequenceLength(); 124 } 125 long leftSum = lengths[0]; 126 for (int i = 1; i < numberOfSequences; i++) { 127 accumulative[i] = leftSum; 128 leftSum += lengths[i]; 129 } 130 accumulative[numberOfSequences] = leftSum; 131 return new AbsoluteCoordinates(lengths, accumulative, contigToIndex, indexToContig); 132 } 133 toAbsolute(final String ctgName, final int position)134 public long toAbsolute(final String ctgName, final int position) { 135 return toAbsolute(contigToIndex.applyAsInt(ctgName), position); 136 } 137 138 /** 139 * Obtains the absolute coordinate for the start position in an interval. 140 * @param simpleInterval the target position in relative coordinates 141 * @return 1 or greater. 142 * @throws IllegalArgumentException no such contig name or tht conting is too small. 143 */ toAbsolute(final SimpleInterval simpleInterval)144 public long toAbsolute(final SimpleInterval simpleInterval) { 145 return toAbsolute(simpleInterval.getContig(), simpleInterval.getStart()); 146 } 147 148 toAbsolute(final int ctgIdx, final int position)149 public long toAbsolute(final int ctgIdx, final int position) { 150 if (lengths[ctgIdx] < position) { 151 throw new IllegalArgumentException("position outside containg contig"); 152 } 153 lastCtg = ctgIdx; 154 return accumulative[ctgIdx] + position; 155 } 156 157 public static class Relative { 158 public final String contig; 159 public final int contigIndex; 160 public final int position; 161 Relative(final String contig, final int contigIndex, final int position)162 Relative(final String contig, final int contigIndex, final int position) { 163 this.contig = contig; 164 this.contigIndex = contigIndex; 165 this.position = position; 166 } 167 } 168 169 @FunctionalInterface 170 interface RelativeFactory<E> { create(final String ctgName, final int ctgIdx, final int position)171 E create(final String ctgName, final int ctgIdx, final int position); 172 } 173 toSimpleInterval(final long absoluteStart, final int length)174 public SimpleInterval toSimpleInterval(final long absoluteStart, final int length) { 175 return toRelative(absoluteStart, (n, i, p) -> new SimpleInterval(n, p, p + length - 1)); 176 } 177 toRelative(final long absolute)178 public Relative toRelative(final long absolute) { 179 return toRelative(absolute, Relative::new); 180 } 181 toRelative(final long absolute, final RelativeFactory<E> factory)182 public <E> E toRelative(final long absolute, final RelativeFactory<E> factory) { 183 if (absolute < 1) { 184 throw new IllegalArgumentException("absolute cannot be less than 1"); 185 } else if (absolute > total) { 186 throw new IllegalArgumentException("absolute is too large"); 187 } else if (accumulative[lastCtg] < absolute) { 188 if (absolute <= accumulative[lastCtg + 1]) { 189 return factory.create(indexToContig.apply(lastCtg), lastCtg, (int) (absolute - accumulative[lastCtg])); 190 } else { 191 return searchRelative(absolute, factory); 192 } 193 } else { 194 return searchRelative(absolute, factory); 195 } 196 } 197 searchRelative(final long target, final RelativeFactory<E> factory)198 private <E> E searchRelative(final long target, final RelativeFactory<E> factory) { 199 final int percentileIndex = (int) (target * percentileFactor); 200 if (percentileIndex >= percentiles.length) { 201 throw new IllegalArgumentException("xx " + target + " " + total ); 202 } 203 final int percentileCtg = percentiles[percentileIndex]; 204 if (percentileIndex < percentiles.length - 1) { 205 return searchRelative(target, percentileCtg, percentiles[percentileIndex + 1], factory); 206 } else { 207 return searchRelative(target, percentileCtg, lengths.length - 1, factory); 208 } 209 } 210 searchRelative(final long target, final int minCtgIdx, final int maxCtgIdx, final RelativeFactory<E> factory)211 private <E> E searchRelative(final long target, final int minCtgIdx, final int maxCtgIdx, final RelativeFactory<E> factory) { 212 int i = minCtgIdx, j = maxCtgIdx; 213 while (i < j) { 214 final int mid = (i + j) >> 1; 215 if (accumulative[mid] >= target) { 216 j = mid - 1; 217 } else if (accumulative[mid + 1] < target) { 218 i = mid + 1; 219 } else { 220 i = mid; 221 break; 222 } 223 } 224 lastCtg = i; 225 return factory.create(indexToContig.apply(i), i, (int) (target - accumulative[i])); 226 } 227 } 228