1 package org.broadinstitute.hellbender.utils.haplotype;
2 
3 import htsjdk.samtools.Cigar;
4 import htsjdk.samtools.CigarElement;
5 import htsjdk.samtools.CigarOperator;
6 import htsjdk.samtools.util.Locatable;
7 import htsjdk.variant.variantcontext.Allele;
8 import org.apache.commons.lang3.ArrayUtils;
9 import org.apache.commons.lang3.tuple.Pair;
10 import org.broadinstitute.hellbender.utils.SimpleInterval;
11 import org.broadinstitute.hellbender.utils.Utils;
12 import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
13 import org.broadinstitute.hellbender.utils.read.CigarBuilder;
14 import org.broadinstitute.hellbender.utils.read.ReadUtils;
15 
16 import java.util.Arrays;
17 import java.util.Comparator;
18 
19 public final class Haplotype extends Allele {
20     private static final long serialVersionUID = 1L;
21 
22     /**
23      * Compares two haplotypes first by their lengths and then by lexicographic order of their bases.
24      */
25     public static final Comparator<Haplotype> SIZE_AND_BASE_ORDER =
26             Comparator.comparingInt((Haplotype hap) -> hap.getBases().length)
27                       .thenComparing(hap -> hap.getBaseString());
28 
29     private Locatable genomeLocation = null;
30     private EventMap eventMap = null;
31     private Cigar cigar;
32     private int alignmentStartHapwrtRef;
33     private double score = Double.NaN;
34 
35     // debug information for tracking kmer sizes used in graph construction for debug output
36     private int kmerSize = 0;
37 
38     /**
39      * Main constructor
40      *
41      * @param bases a non-null array of bases
42      * @param isRef is this the reference haplotype?
43      */
Haplotype( final byte[] bases, final boolean isRef )44     public Haplotype( final byte[] bases, final boolean isRef ) {
45         super(Arrays.copyOf(bases, bases.length), isRef);
46     }
47 
48     /**
49      * Create a new non-ref haplotype
50      *
51      * @param bases a non-null array of bases
52      */
Haplotype( final byte[] bases )53     public Haplotype( final byte[] bases ) {
54         this(bases, false);
55     }
56 
57     /**
58      * Create a new haplotype with bases
59      *
60      * Requires bases.length == cigar.getReadLength()
61      *
62      * @param bases a non-null array of bases
63      * @param isRef is this the reference haplotype?
64      * @param alignmentStartHapwrtRef offset of this haplotype w.r.t. the reference
65      * @param cigar the cigar that maps this haplotype to the reference sequence
66      */
Haplotype( final byte[] bases, final boolean isRef, final int alignmentStartHapwrtRef, final Cigar cigar)67     public Haplotype( final byte[] bases, final boolean isRef, final int alignmentStartHapwrtRef, final Cigar cigar) {
68         this(bases, isRef);
69         this.alignmentStartHapwrtRef = alignmentStartHapwrtRef;
70         setCigar(cigar);
71     }
72 
Haplotype( final byte[] bases, final Locatable loc )73     public Haplotype( final byte[] bases, final Locatable loc ) {
74         this(bases, false);
75         this.genomeLocation = loc;
76     }
77 
78     /**
79      * Create a new Haplotype derived from this one that exactly spans the provided location
80      *
81      * Note that this haplotype must have a contain a genome loc for this operation to be successful.  If no
82      * GenomeLoc is contained than @throws an IllegalStateException
83      *
84      * Also loc must be fully contained within this Haplotype's genomeLoc.  If not an IllegalArgumentException is
85      * thrown.
86      *
87      * @param loc a location completely contained within this Haplotype's location
88      * @return a new Haplotype within only the bases spanning the provided location, or null for some reason the haplotype would be malformed if
89      */
trim(final Locatable loc)90     public Haplotype trim(final Locatable loc) {
91         Utils.nonNull( loc, "Loc cannot be null");
92         Utils.nonNull(genomeLocation, "Cannot trim a Haplotype without containing GenomeLoc");
93         Utils.validateArg(new SimpleInterval(genomeLocation).contains(loc), () -> "Can only trim a Haplotype to a containing span.  My loc is " + genomeLocation + " but wanted trim to " + loc);
94         Utils.nonNull( getCigar(), "Cannot trim haplotype without a cigar " + this);
95 
96         final int newStart = loc.getStart() - this.genomeLocation.getStart();
97         final int newStop = newStart + loc.getEnd() - loc.getStart();
98 
99         // note: the following returns null if the bases covering the ref interval start or end in a deletion.
100         final byte[] newBases = AlignmentUtils.getBasesCoveringRefInterval(newStart, newStop, getBases(), 0, getCigar());
101 
102         if ( newBases == null || newBases.length == 0 ) { // we cannot meaningfully chop down the haplotype, so return null
103             return null;
104         }
105 
106         // note: trimCigarByReference does not remove leading or trailing indels, while getBasesCoveringRefInterval does remove bases
107         // of leading and trailing insertions.  We must remove leading and trailing insertions from the Cigar manually.
108         // we keep leading and trailing deletions because these are necessary for haplotypes to maintain consistent reference coordinates
109         final Cigar newCigar = AlignmentUtils.trimCigarByReference(getCigar(), newStart, newStop).getCigar();
110         final boolean leadingInsertion = !newCigar.getFirstCigarElement().getOperator().consumesReferenceBases();
111         final boolean trailingInsertion = !newCigar.getLastCigarElement().getOperator().consumesReferenceBases();
112         final int firstIndexToKeepInclusive = leadingInsertion ? 1 : 0;
113         final int lastIndexToKeepExclusive = newCigar.numCigarElements() - (trailingInsertion ? 1 : 0);
114 
115         if (lastIndexToKeepExclusive <= firstIndexToKeepInclusive) {    // edge case of entire cigar is insertion
116             return null;
117         }
118 
119         final Cigar leadingIndelTrimmedNewCigar = !(leadingInsertion || trailingInsertion)  ? newCigar :
120                 new CigarBuilder(false).addAll(newCigar.getCigarElements().subList(firstIndexToKeepInclusive, lastIndexToKeepExclusive)).make();
121 
122         final Haplotype ret = new Haplotype(newBases, isReference());
123         ret.setCigar(leadingIndelTrimmedNewCigar);
124         ret.setGenomeLocation(loc);
125         ret.setScore(score);
126         ret.setKmerSize(kmerSize);
127         ret.setAlignmentStartHapwrtRef(newStart + getAlignmentStartHapwrtRef());
128         return ret;
129     }
130 
131     @Override
equals( final Object h )132     public boolean equals( final Object h ) {
133         return h instanceof Haplotype && Arrays.equals(getBases(), ((Haplotype) h).getBases());
134     }
135 
136     @Override
hashCode()137     public int hashCode() {
138         return Arrays.hashCode(getBases());
139     }
140 
getEventMap()141     public EventMap getEventMap() {
142         return eventMap;
143     }
144 
setEventMap( final EventMap eventMap )145     public void setEventMap( final EventMap eventMap ) {
146         this.eventMap = eventMap;
147     }
148 
149     @Override
toString()150     public String toString() {
151         return getDisplayString();
152     }
153 
154     /**
155      * Get the span of this haplotype (may be null)
156      * @return a potentially null genome loc
157      */
getLocation()158     public Locatable getLocation() {
159         return this.genomeLocation;
160     }
161 
setGenomeLocation(final Locatable genomeLocation)162     public void setGenomeLocation(final Locatable genomeLocation) {
163         this.genomeLocation = genomeLocation;
164     }
165 
getStartPosition()166     public long getStartPosition() {
167         return genomeLocation.getStart();
168     }
169 
getStopPosition()170     public long getStopPosition() {
171         return genomeLocation.getEnd();
172     }
173 
getAlignmentStartHapwrtRef()174     public int getAlignmentStartHapwrtRef() {
175         return alignmentStartHapwrtRef;
176     }
177 
setAlignmentStartHapwrtRef( final int alignmentStartHapwrtRef )178     public void setAlignmentStartHapwrtRef( final int alignmentStartHapwrtRef ) {
179         this.alignmentStartHapwrtRef = alignmentStartHapwrtRef;
180     }
181 
182     /**
183      * Get the cigar for this haplotype.  Note that the cigar is guaranteed to be consolidated
184      * in that multiple adjacent equal operates will have been merged
185      * @return the cigar of this haplotype
186      */
getCigar()187     public Cigar getCigar() {
188         return cigar;
189     }
190 
191     /**
192      * Get the haplotype cigar extended by padSize M at the tail, consolidated into a clean cigar
193      *
194      * @param padSize how many additional Ms should be appended to the end of this cigar.  Must be >= 0
195      * @return a newly allocated Cigar that consolidate(getCigar + padSize + M)
196      */
getConsolidatedPaddedCigar(final int padSize)197     public Cigar getConsolidatedPaddedCigar(final int padSize) {
198         Utils.validateArg( padSize >= 0, () -> "padSize must be >= 0 but got " + padSize);
199         return new CigarBuilder().addAll(getCigar()).add(new CigarElement(padSize, CigarOperator.M)).make();
200     }
201 
202     /**
203      * Set the cigar of this haplotype to cigar.
204      *
205      * This method consolidates the cigar, so that 1M1M1I1M1M => 2M1I2M.  It does not remove leading or trailing deletions
206      * because haplotypes, unlike reads, are pegged to a specific reference start and end.
207      *
208      * @param cigar a cigar whose readLength == length()
209      */
setCigar( final Cigar cigar )210     public void setCigar( final Cigar cigar ) {
211         this.cigar = new CigarBuilder(false).addAll(cigar).make();
212         Utils.validateArg( this.cigar.getReadLength() == length(), () -> "Read length " + length() + " not equal to the read length of the cigar " + cigar.getReadLength() + " " + this.cigar);
213     }
214 
insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation )215     public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) {
216         // refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates
217         final Pair<Integer, CigarOperator> haplotypeInsertLocationAndOperator = ReadUtils.getReadIndexForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation);
218 
219         // can't insert outside the haplotype or into a deletion
220         if( haplotypeInsertLocationAndOperator.getLeft() == ReadUtils.READ_INDEX_NOT_FOUND || !haplotypeInsertLocationAndOperator.getRight().consumesReadBases() ) {
221             return null;
222         }
223         final int haplotypeInsertLocation = haplotypeInsertLocationAndOperator.getLeft();
224         final byte[] myBases = getBases();
225 
226         // can't insert if we don't have any sequence after the inserted alt allele to span the new variant
227         if (haplotypeInsertLocation + refAllele.length() >= myBases.length) {
228             return null;
229         }
230 
231         byte[] newHaplotypeBases = {};
232         newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(myBases, 0, haplotypeInsertLocation)); // bases before the variant
233         newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant
234         newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(myBases, haplotypeInsertLocation + refAllele.length(), myBases.length)); // bases after the variant
235         return new Haplotype(newHaplotypeBases);
236     }
237 
238     /**
239      * Get the score (an estimate of the support) of this haplotype
240      * @return a double, where higher values are better
241      */
getScore()242     public double getScore() {
243         return score;
244     }
245 
246     /**
247      * Set the score (an estimate of the support) of this haplotype.
248      *
249      * Note that if this is the reference haplotype it is always given Double.MAX_VALUE score
250      *
251      * @param score a double, where higher values are better
252      */
setScore(final double score)253     public void setScore(final double score) {
254         this.score = score;
255     }
256 
257     /**
258      * Get the span of this haplotype (may be null)
259      * @return a potentially null genome loc
260      */
getGenomeLocation()261     public Locatable getGenomeLocation() {
262         return genomeLocation;
263     }
264 
265 
getKmerSize()266     public int getKmerSize() {
267         return kmerSize;
268     }
269 
setKmerSize(int kmerSize)270     public void setKmerSize(int kmerSize) {
271         this.kmerSize = kmerSize;
272     }
273 }
274