1 package org.broadinstitute.hellbender.utils.haplotype; 2 3 import htsjdk.samtools.Cigar; 4 import htsjdk.samtools.CigarElement; 5 import htsjdk.samtools.CigarOperator; 6 import htsjdk.samtools.util.Locatable; 7 import htsjdk.variant.variantcontext.Allele; 8 import org.apache.commons.lang3.ArrayUtils; 9 import org.apache.commons.lang3.tuple.Pair; 10 import org.broadinstitute.hellbender.utils.SimpleInterval; 11 import org.broadinstitute.hellbender.utils.Utils; 12 import org.broadinstitute.hellbender.utils.read.AlignmentUtils; 13 import org.broadinstitute.hellbender.utils.read.CigarBuilder; 14 import org.broadinstitute.hellbender.utils.read.ReadUtils; 15 16 import java.util.Arrays; 17 import java.util.Comparator; 18 19 public final class Haplotype extends Allele { 20 private static final long serialVersionUID = 1L; 21 22 /** 23 * Compares two haplotypes first by their lengths and then by lexicographic order of their bases. 24 */ 25 public static final Comparator<Haplotype> SIZE_AND_BASE_ORDER = 26 Comparator.comparingInt((Haplotype hap) -> hap.getBases().length) 27 .thenComparing(hap -> hap.getBaseString()); 28 29 private Locatable genomeLocation = null; 30 private EventMap eventMap = null; 31 private Cigar cigar; 32 private int alignmentStartHapwrtRef; 33 private double score = Double.NaN; 34 35 // debug information for tracking kmer sizes used in graph construction for debug output 36 private int kmerSize = 0; 37 38 /** 39 * Main constructor 40 * 41 * @param bases a non-null array of bases 42 * @param isRef is this the reference haplotype? 43 */ Haplotype( final byte[] bases, final boolean isRef )44 public Haplotype( final byte[] bases, final boolean isRef ) { 45 super(Arrays.copyOf(bases, bases.length), isRef); 46 } 47 48 /** 49 * Create a new non-ref haplotype 50 * 51 * @param bases a non-null array of bases 52 */ Haplotype( final byte[] bases )53 public Haplotype( final byte[] bases ) { 54 this(bases, false); 55 } 56 57 /** 58 * Create a new haplotype with bases 59 * 60 * Requires bases.length == cigar.getReadLength() 61 * 62 * @param bases a non-null array of bases 63 * @param isRef is this the reference haplotype? 64 * @param alignmentStartHapwrtRef offset of this haplotype w.r.t. the reference 65 * @param cigar the cigar that maps this haplotype to the reference sequence 66 */ Haplotype( final byte[] bases, final boolean isRef, final int alignmentStartHapwrtRef, final Cigar cigar)67 public Haplotype( final byte[] bases, final boolean isRef, final int alignmentStartHapwrtRef, final Cigar cigar) { 68 this(bases, isRef); 69 this.alignmentStartHapwrtRef = alignmentStartHapwrtRef; 70 setCigar(cigar); 71 } 72 Haplotype( final byte[] bases, final Locatable loc )73 public Haplotype( final byte[] bases, final Locatable loc ) { 74 this(bases, false); 75 this.genomeLocation = loc; 76 } 77 78 /** 79 * Create a new Haplotype derived from this one that exactly spans the provided location 80 * 81 * Note that this haplotype must have a contain a genome loc for this operation to be successful. If no 82 * GenomeLoc is contained than @throws an IllegalStateException 83 * 84 * Also loc must be fully contained within this Haplotype's genomeLoc. If not an IllegalArgumentException is 85 * thrown. 86 * 87 * @param loc a location completely contained within this Haplotype's location 88 * @return a new Haplotype within only the bases spanning the provided location, or null for some reason the haplotype would be malformed if 89 */ trim(final Locatable loc)90 public Haplotype trim(final Locatable loc) { 91 Utils.nonNull( loc, "Loc cannot be null"); 92 Utils.nonNull(genomeLocation, "Cannot trim a Haplotype without containing GenomeLoc"); 93 Utils.validateArg(new SimpleInterval(genomeLocation).contains(loc), () -> "Can only trim a Haplotype to a containing span. My loc is " + genomeLocation + " but wanted trim to " + loc); 94 Utils.nonNull( getCigar(), "Cannot trim haplotype without a cigar " + this); 95 96 final int newStart = loc.getStart() - this.genomeLocation.getStart(); 97 final int newStop = newStart + loc.getEnd() - loc.getStart(); 98 99 // note: the following returns null if the bases covering the ref interval start or end in a deletion. 100 final byte[] newBases = AlignmentUtils.getBasesCoveringRefInterval(newStart, newStop, getBases(), 0, getCigar()); 101 102 if ( newBases == null || newBases.length == 0 ) { // we cannot meaningfully chop down the haplotype, so return null 103 return null; 104 } 105 106 // note: trimCigarByReference does not remove leading or trailing indels, while getBasesCoveringRefInterval does remove bases 107 // of leading and trailing insertions. We must remove leading and trailing insertions from the Cigar manually. 108 // we keep leading and trailing deletions because these are necessary for haplotypes to maintain consistent reference coordinates 109 final Cigar newCigar = AlignmentUtils.trimCigarByReference(getCigar(), newStart, newStop).getCigar(); 110 final boolean leadingInsertion = !newCigar.getFirstCigarElement().getOperator().consumesReferenceBases(); 111 final boolean trailingInsertion = !newCigar.getLastCigarElement().getOperator().consumesReferenceBases(); 112 final int firstIndexToKeepInclusive = leadingInsertion ? 1 : 0; 113 final int lastIndexToKeepExclusive = newCigar.numCigarElements() - (trailingInsertion ? 1 : 0); 114 115 if (lastIndexToKeepExclusive <= firstIndexToKeepInclusive) { // edge case of entire cigar is insertion 116 return null; 117 } 118 119 final Cigar leadingIndelTrimmedNewCigar = !(leadingInsertion || trailingInsertion) ? newCigar : 120 new CigarBuilder(false).addAll(newCigar.getCigarElements().subList(firstIndexToKeepInclusive, lastIndexToKeepExclusive)).make(); 121 122 final Haplotype ret = new Haplotype(newBases, isReference()); 123 ret.setCigar(leadingIndelTrimmedNewCigar); 124 ret.setGenomeLocation(loc); 125 ret.setScore(score); 126 ret.setKmerSize(kmerSize); 127 ret.setAlignmentStartHapwrtRef(newStart + getAlignmentStartHapwrtRef()); 128 return ret; 129 } 130 131 @Override equals( final Object h )132 public boolean equals( final Object h ) { 133 return h instanceof Haplotype && Arrays.equals(getBases(), ((Haplotype) h).getBases()); 134 } 135 136 @Override hashCode()137 public int hashCode() { 138 return Arrays.hashCode(getBases()); 139 } 140 getEventMap()141 public EventMap getEventMap() { 142 return eventMap; 143 } 144 setEventMap( final EventMap eventMap )145 public void setEventMap( final EventMap eventMap ) { 146 this.eventMap = eventMap; 147 } 148 149 @Override toString()150 public String toString() { 151 return getDisplayString(); 152 } 153 154 /** 155 * Get the span of this haplotype (may be null) 156 * @return a potentially null genome loc 157 */ getLocation()158 public Locatable getLocation() { 159 return this.genomeLocation; 160 } 161 setGenomeLocation(final Locatable genomeLocation)162 public void setGenomeLocation(final Locatable genomeLocation) { 163 this.genomeLocation = genomeLocation; 164 } 165 getStartPosition()166 public long getStartPosition() { 167 return genomeLocation.getStart(); 168 } 169 getStopPosition()170 public long getStopPosition() { 171 return genomeLocation.getEnd(); 172 } 173 getAlignmentStartHapwrtRef()174 public int getAlignmentStartHapwrtRef() { 175 return alignmentStartHapwrtRef; 176 } 177 setAlignmentStartHapwrtRef( final int alignmentStartHapwrtRef )178 public void setAlignmentStartHapwrtRef( final int alignmentStartHapwrtRef ) { 179 this.alignmentStartHapwrtRef = alignmentStartHapwrtRef; 180 } 181 182 /** 183 * Get the cigar for this haplotype. Note that the cigar is guaranteed to be consolidated 184 * in that multiple adjacent equal operates will have been merged 185 * @return the cigar of this haplotype 186 */ getCigar()187 public Cigar getCigar() { 188 return cigar; 189 } 190 191 /** 192 * Get the haplotype cigar extended by padSize M at the tail, consolidated into a clean cigar 193 * 194 * @param padSize how many additional Ms should be appended to the end of this cigar. Must be >= 0 195 * @return a newly allocated Cigar that consolidate(getCigar + padSize + M) 196 */ getConsolidatedPaddedCigar(final int padSize)197 public Cigar getConsolidatedPaddedCigar(final int padSize) { 198 Utils.validateArg( padSize >= 0, () -> "padSize must be >= 0 but got " + padSize); 199 return new CigarBuilder().addAll(getCigar()).add(new CigarElement(padSize, CigarOperator.M)).make(); 200 } 201 202 /** 203 * Set the cigar of this haplotype to cigar. 204 * 205 * This method consolidates the cigar, so that 1M1M1I1M1M => 2M1I2M. It does not remove leading or trailing deletions 206 * because haplotypes, unlike reads, are pegged to a specific reference start and end. 207 * 208 * @param cigar a cigar whose readLength == length() 209 */ setCigar( final Cigar cigar )210 public void setCigar( final Cigar cigar ) { 211 this.cigar = new CigarBuilder(false).addAll(cigar).make(); 212 Utils.validateArg( this.cigar.getReadLength() == length(), () -> "Read length " + length() + " not equal to the read length of the cigar " + cigar.getReadLength() + " " + this.cigar); 213 } 214 insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation )215 public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) { 216 // refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates 217 final Pair<Integer, CigarOperator> haplotypeInsertLocationAndOperator = ReadUtils.getReadIndexForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation); 218 219 // can't insert outside the haplotype or into a deletion 220 if( haplotypeInsertLocationAndOperator.getLeft() == ReadUtils.READ_INDEX_NOT_FOUND || !haplotypeInsertLocationAndOperator.getRight().consumesReadBases() ) { 221 return null; 222 } 223 final int haplotypeInsertLocation = haplotypeInsertLocationAndOperator.getLeft(); 224 final byte[] myBases = getBases(); 225 226 // can't insert if we don't have any sequence after the inserted alt allele to span the new variant 227 if (haplotypeInsertLocation + refAllele.length() >= myBases.length) { 228 return null; 229 } 230 231 byte[] newHaplotypeBases = {}; 232 newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(myBases, 0, haplotypeInsertLocation)); // bases before the variant 233 newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant 234 newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(myBases, haplotypeInsertLocation + refAllele.length(), myBases.length)); // bases after the variant 235 return new Haplotype(newHaplotypeBases); 236 } 237 238 /** 239 * Get the score (an estimate of the support) of this haplotype 240 * @return a double, where higher values are better 241 */ getScore()242 public double getScore() { 243 return score; 244 } 245 246 /** 247 * Set the score (an estimate of the support) of this haplotype. 248 * 249 * Note that if this is the reference haplotype it is always given Double.MAX_VALUE score 250 * 251 * @param score a double, where higher values are better 252 */ setScore(final double score)253 public void setScore(final double score) { 254 this.score = score; 255 } 256 257 /** 258 * Get the span of this haplotype (may be null) 259 * @return a potentially null genome loc 260 */ getGenomeLocation()261 public Locatable getGenomeLocation() { 262 return genomeLocation; 263 } 264 265 getKmerSize()266 public int getKmerSize() { 267 return kmerSize; 268 } 269 setKmerSize(int kmerSize)270 public void setKmerSize(int kmerSize) { 271 this.kmerSize = kmerSize; 272 } 273 } 274