1 package org.broadinstitute.hellbender.utils.read;
2 
3 import com.google.common.annotations.VisibleForTesting;
4 import com.google.common.collect.Lists;
5 import htsjdk.samtools.Cigar;
6 import htsjdk.samtools.CigarElement;
7 import htsjdk.samtools.CigarOperator;
8 import org.apache.commons.lang3.tuple.ImmutablePair;
9 import org.apache.commons.lang3.tuple.Pair;
10 import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
11 import org.broadinstitute.hellbender.exceptions.GATKException;
12 import org.broadinstitute.hellbender.utils.BaseUtils;
13 import org.broadinstitute.hellbender.utils.IndexRange;
14 import org.broadinstitute.hellbender.utils.Utils;
15 import org.broadinstitute.hellbender.utils.clipping.ReadClipper;
16 import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
17 import org.broadinstitute.hellbender.utils.param.ParamUtils;
18 import org.broadinstitute.hellbender.utils.pileup.PileupElement;
19 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
20 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment;
21 
22 import java.util.*;
23 import java.util.stream.IntStream;
24 
25 
26 public final class AlignmentUtils {
27     private static final EnumSet<CigarOperator> ALIGNED_TO_GENOME_OPERATORS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X);
28     private static final EnumSet<CigarOperator> ALIGNED_TO_GENOME_PLUS_SOFTCLIPS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.S);
29     public final static String HAPLOTYPE_TAG = "HC";
30     public final static byte GAP_CHARACTER = (byte)'-';
31 
32     // cannot be instantiated
AlignmentUtils()33     private AlignmentUtils() { }
34 
35     /**
36      * Aligns reads the haplotype, and then projects this alignment of read -> hap onto the reference
37      * via the alignment of haplotype (via its getCigar) method.
38      *
39      * @param originalRead the read we want to write aligned to the reference genome
40      * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference
41      * @param referenceStart the start of the reference that haplotype is aligned to.  Provides global coordinate frame.
42      * @param isInformative true if the read is differentially informative for one of the haplotypes
43      *
44      * @param aligner
45      * @throws IllegalArgumentException if {@code originalRead} is {@code null} or {@code haplotype} is {@code null} or it
46      *   does not have a Cigar or the {@code referenceStart} is invalid (less than 1).
47      *
48      * @return a GATKRead aligned to reference. Never {@code null}.
49      */
createReadAlignedToRef(final GATKRead originalRead, final Haplotype haplotype, final Haplotype refHaplotype, final int referenceStart, final boolean isInformative, final SmithWatermanAligner aligner)50     public static GATKRead createReadAlignedToRef(final GATKRead originalRead,
51                                                   final Haplotype haplotype,
52                                                   final Haplotype refHaplotype,
53                                                   final int referenceStart,
54                                                   final boolean isInformative,
55                                                   final SmithWatermanAligner aligner) {
56         Utils.nonNull(originalRead);
57         Utils.nonNull(haplotype);
58         Utils.nonNull(refHaplotype);
59         Utils.nonNull(haplotype.getCigar());
60         Utils.nonNull(aligner);
61         if ( referenceStart < 1 ) { throw new IllegalArgumentException("reference start much be >= 1 but got " + referenceStart); }
62 
63         // compute the smith-waterman alignment of read -> haplotype //TODO use more efficient than the read clipper here
64         final GATKRead readMinusSoftClips = ReadClipper.hardClipSoftClippedBases(originalRead);
65         final int softClippedBases = originalRead.getLength() - readMinusSoftClips.getLength();
66         final SmithWatermanAlignment readToHaplotypeSWAlignment = aligner.align(haplotype.getBases(), readMinusSoftClips.getBases(), CigarUtils.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
67         if ( readToHaplotypeSWAlignment.getAlignmentOffset() == -1 ) {
68             // sw can fail (reasons not clear) so if it happens just don't realign the read
69             return originalRead;
70         }
71 
72         final Cigar swCigar = new CigarBuilder().addAll(readToHaplotypeSWAlignment.getCigar()).make();
73 
74         // since we're modifying the read we need to clone it
75         final GATKRead copiedRead = originalRead.copy();
76 
77         // only informative reads are given the haplotype tag to enhance visualization
78         if ( isInformative ) {
79             copiedRead.setAttribute(HAPLOTYPE_TAG, haplotype.hashCode());
80         }
81 
82         // compute here the read starts w.r.t. the reference from the SW result and the hap -> ref cigar
83         final Cigar rightPaddedHaplotypeVsRefCigar = haplotype.getConsolidatedPaddedCigar(1000);
84 
85         // this computes the number of reference bases before the read starts, based on the haplotype vs ref cigar
86         // This cigar corresponds exactly to the readToRefCigarRaw, below.  One might wonder whether readToRefCigarRaw and
87         // readToRefCigarClean ever imply different starts, which could occur if if the former has a leading deletion.  However,
88         // according to the logic of applyCigarToCigar, this can only happen if the read has a leading deletion wrt its best haplotype,
89         // which our SW aligner won't do, or if the read starts on a haplotype base that is in a deletion wrt to reference, which is nonsensical
90         // since a base that exists is not a deletion.  Thus, there is nothing to worry about, in contrast to below where we do check
91         // whether left-alignment shifted the start position.
92         final int readStartOnReferenceHaplotype = readStartOnReferenceHaplotype(rightPaddedHaplotypeVsRefCigar, readToHaplotypeSWAlignment.getAlignmentOffset());
93 
94 
95         //final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype;
96 
97         final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnReferenceHaplotype;
98 
99         // compute the read -> ref alignment by mapping read -> hap -> ref from the
100         // SW of read -> hap mapped through the given by hap -> ref
101 
102         // this is the sub-cigar of the haplotype-to-ref alignment, with cigar elements before the read start removed.  Elements after the read end are kept.
103         final Cigar haplotypeToRef = trimCigarByBases(rightPaddedHaplotypeVsRefCigar, readToHaplotypeSWAlignment.getAlignmentOffset(), rightPaddedHaplotypeVsRefCigar.getReadLength() - 1).getCigar();
104 
105         final Cigar readToRefCigar = applyCigarToCigar(swCigar, haplotypeToRef);
106         final CigarBuilder.Result leftAlignedReadToRefCigarResult = leftAlignIndels(readToRefCigar, refHaplotype.getBases(), readMinusSoftClips.getBases(), readStartOnReferenceHaplotype);
107         final Cigar leftAlignedReadToRefCigar = leftAlignedReadToRefCigarResult.getCigar();
108         // it's possible that left-alignment shifted a deletion to the beginning of a read and removed it, shifting the first aligned base to the right
109         copiedRead.setPosition(copiedRead.getContig(), readStartOnReference + leftAlignedReadToRefCigarResult.getLeadingDeletionBasesRemoved());
110 
111         // the SW Cigar does not contain the hard clips of the original read
112         // Here we reconcile the aligned read (that has had any softclips removed) with its softclipped bases
113         final Cigar originalCigar = originalRead.getCigar();
114         final Cigar newCigar = appendClippedElementsFromCigarToCigar(leftAlignedReadToRefCigar, originalCigar);
115         copiedRead.setCigar(newCigar);
116 
117         if ( leftAlignedReadToRefCigar.getReadLength() + softClippedBases != copiedRead.getLength() ) {
118             throw new GATKException("Cigar " + leftAlignedReadToRefCigar + " with read length " + leftAlignedReadToRefCigar.getReadLength()
119                     + " != read length " + copiedRead.getLength() + " for read " + copiedRead.toString() + "\nhapToRef " + haplotypeToRef + " length " + haplotypeToRef.getReadLength() + "/" + haplotypeToRef.getReferenceLength()
120                     + "\nreadToHap " + swCigar + " length " + swCigar.getReadLength() + "/" + swCigar.getReferenceLength());
121         }
122         // assert that the cigar has the same number of elements as the original read
123 
124         return copiedRead;
125     }
126 
127     /**
128      * Helper method that handles the work of re-appending clipped bases from the original cigar to the new one
129      *
130      * @param cigarToHaveClippedElementsAdded cigar to attach softclips to
131      * @param originalClippedCigar cigar to check for clipped bases
132      * @return a new cigar that has had the clipped elements from the original appended to either end
133      */
134     @VisibleForTesting
appendClippedElementsFromCigarToCigar(final Cigar cigarToHaveClippedElementsAdded, final Cigar originalClippedCigar)135     static Cigar appendClippedElementsFromCigarToCigar(final Cigar cigarToHaveClippedElementsAdded, final Cigar originalClippedCigar) {
136         int firstIndex = 0;
137         int lastIndex = originalClippedCigar.numCigarElements() - 1;
138         CigarElement firstElement = originalClippedCigar.getFirstCigarElement();
139         CigarElement lastElement = originalClippedCigar.getLastCigarElement();
140         final List<CigarElement> readToRefCigarElementsWithHardClips = new ArrayList<>();
141         while (firstElement.getOperator().isClipping() && firstIndex != lastIndex) {
142             readToRefCigarElementsWithHardClips.add(firstElement);
143             firstElement = originalClippedCigar.getCigarElement(++firstIndex);
144         }
145         readToRefCigarElementsWithHardClips.addAll(cigarToHaveClippedElementsAdded.getCigarElements());
146 
147         final List<CigarElement> endCigarElementsToReverse = new ArrayList<>();
148         while (lastElement.getOperator().isClipping() && firstIndex != lastIndex)  {
149             endCigarElementsToReverse.add(lastElement);
150             lastElement = originalClippedCigar.getCigarElement(--lastIndex);
151         }
152         // Reverse the order to preserve the original soft/hardclip ordering in mixed clipping cases where softclips precede hardclips
153         readToRefCigarElementsWithHardClips.addAll(Lists.reverse(endCigarElementsToReverse));
154 
155         return new Cigar(readToRefCigarElementsWithHardClips);
156     }
157 
158 
159     /**
160      * Get the byte[] from bases that cover the reference interval refStart -> refEnd given the
161      * alignment of bases to the reference (basesToRefCigar) and the start offset of the bases on the reference
162      *
163      * refStart and refEnd are 0 based offsets that we want to obtain.  In the client code, if the reference
164      * bases start at position X and you want Y -> Z, refStart should be Y - X and refEnd should be Z - X.
165      *
166      * If refStart or refEnd would start or end the new bases within a deletion, this function will return null
167      *
168      * If the cigar starts with an insertion, the inserted bases are considered as coming before the start position and
169      * are therefore excluded from the result.  That is getBasesCoveringRefInterval(0, 3, "ACTTGT", 0, 2I4M) should yield "TTGT".
170      *
171      * @param bases
172      * @param refStart
173      * @param refEnd
174      * @param basesStartOnRef where does the bases array start w.r.t. the reference start?  For example, bases[0] of
175      *                        could be at refStart == 0 if basesStartOnRef == 0, but it could just as easily be at
176      *                        10 (meaning bases doesn't fully span the reference), which would be indicated by basesStartOnRef == 10.
177      *                        It's not trivial to eliminate this parameter because it's tied up with the cigar
178      * @param basesToRefCigar the cigar that maps the bases to the reference genome
179      * @return a byte[] containing the bases covering this interval, or null if we would start or end within a deletion
180      */
getBasesCoveringRefInterval(final int refStart, final int refEnd, final byte[] bases, final int basesStartOnRef, final Cigar basesToRefCigar)181     public static byte[] getBasesCoveringRefInterval(final int refStart, final int refEnd, final byte[] bases, final int basesStartOnRef, final Cigar basesToRefCigar) {
182         if ( refStart < 0 || refEnd < refStart ) throw new IllegalArgumentException("Bad start " + refStart + " and/or stop " + refEnd);
183         if ( basesStartOnRef < 0 ) throw new IllegalArgumentException("BasesStartOnRef must be >= 0 but got " + basesStartOnRef);
184         Utils.nonNull( bases );
185         Utils.nonNull( basesToRefCigar );
186         if ( bases.length != basesToRefCigar.getReadLength() ) throw new IllegalArgumentException("Mismatch in length between reference bases " + bases.length + " and cigar length " + basesToRefCigar);
187 
188         int refPos = basesStartOnRef;
189         int basesPos = 0;
190         int basesStart = -1;
191         int basesStop = -1;
192         boolean done = false;
193 
194         for ( int iii = 0; ! done && iii < basesToRefCigar.numCigarElements(); iii++ ) {
195             final CigarElement ce = basesToRefCigar.getCigarElement(iii);
196             switch ( ce.getOperator() ) {
197                 case I:
198                     basesPos += ce.getLength();
199                     break;
200                 case M: case X: case EQ:
201                     for ( int i = 0; i < ce.getLength(); i++ ) {
202                         if ( refPos == refStart )
203                             basesStart = basesPos;
204                         if ( refPos == refEnd ) {
205                             basesStop = basesPos;
206                             done = true;
207                             break;
208                         }
209                         refPos++;
210                         basesPos++;
211                     }
212                     break;
213                 case D:
214                     for ( int i = 0; i < ce.getLength(); i++ ) {
215                         if ( refPos == refEnd || refPos == refStart ) {
216                             // if we ever reach a ref position that is either a start or an end, we fail
217                             return null;
218                         }
219                         refPos++;
220                     }
221                     break;
222                 default:
223                     throw new IllegalStateException("Unsupported operator " + ce);
224             }
225         }
226 
227         if ( basesStart == -1 || basesStop == -1 )
228             throw new IllegalStateException("Never found start " + basesStart + " or stop " + basesStop + " given cigar " + basesToRefCigar);
229 
230         return Arrays.copyOfRange(bases, basesStart, basesStop + 1);
231     }
232 
233     /**
234      * Returns the "IGV View" of all the bases and base qualities in a read aligned to the reference according to the cigar, dropping any bases
235      * that might be in the read but aren't in the reference. Any bases that appear in the reference but not the read
236      * will be filled in with GAP_CHARACTER values for the read bases and 0's for base qualities to indicate that they don't exist.
237      *
238      * If the cigar for input read is all matches to the reference then this method will return references to the original
239      * read base/base quality byte arrays in the underlying SamRecord in order to save on array allocation/copying performance effects.
240      *
241      * @param read a read to return aligned to the reference
242      * @return A Pair of byte arrays where the left array corresponds to the bases aligned to the reference and right
243      *         array corresponds to the baseQualities aligned to the reference.
244      */
getBasesAndBaseQualitiesAlignedOneToOne(final GATKRead read)245     public static Pair<byte[], byte[]> getBasesAndBaseQualitiesAlignedOneToOne(final GATKRead read) {
246         return getBasesAndBaseQualitiesAlignedOneToOne(read, GAP_CHARACTER, (byte)0);
247     }
248 
getBasesAndBaseQualitiesAlignedOneToOne(final GATKRead read, final byte basePadCharacter, final byte qualityPadCharacter)249     private static Pair<byte[], byte[]> getBasesAndBaseQualitiesAlignedOneToOne(final GATKRead read, final byte basePadCharacter, final byte qualityPadCharacter) {
250         Utils.nonNull(read);
251         // As this code is performance sensitive in the HaplotypeCaller, we elect to use the noCopy versions of these getters.
252         // We can do this because we don't mutate base or quality arrays in this method or in its accessors
253         final byte[] bases = read.getBasesNoCopy();
254         final byte[] baseQualities = read.getBaseQualitiesNoCopy();
255         final int numCigarElements = read.numCigarElements();
256         boolean sawIndel = false;
257 
258         // Check if the cigar contains indels
259         // Note that we don't call ContainsOperator() here twice to avoid the performance hit of building stream iterators twice
260         for (int i = 0; i < numCigarElements; i++) {
261             final CigarOperator e = read.getCigarElement(i).getOperator();
262             if (e == CigarOperator.INSERTION || e == CigarOperator.DELETION) {
263                 sawIndel = true;
264                 break;
265             }
266         }
267         if (!sawIndel) {
268             return new ImmutablePair<>(bases, baseQualities);
269         }
270         else {
271             final int numberRefBasesIncludingSoftclips = CigarUtils.countRefBasesAndSoftClips(read.getCigarElements(), 0, numCigarElements);
272             final byte[] paddedBases = new byte[numberRefBasesIncludingSoftclips];
273             final byte[] paddedBaseQualities = new byte[numberRefBasesIncludingSoftclips];
274             int literalPos = 0;
275             int paddedPos = 0;
276             for ( int i = 0; i < numCigarElements; i++ ) {
277                 final CigarElement ce = read.getCigarElement(i);
278                 final CigarOperator co = ce.getOperator();
279                 if (co.consumesReadBases()) {
280                     if (!co.consumesReferenceBases()) {
281                         literalPos += ce.getLength();  //skip inserted bases
282                     }
283                     else {
284                         System.arraycopy(bases, literalPos, paddedBases, paddedPos, ce.getLength());
285                         System.arraycopy(baseQualities, literalPos, paddedBaseQualities, paddedPos, ce.getLength());
286                         literalPos += ce.getLength();
287                         paddedPos += ce.getLength();
288                     }
289                 }
290                 else if (co.consumesReferenceBases()) {
291                     for ( int j = 0; j < ce.getLength(); j++ ) {  //pad deleted bases
292                         paddedBases[paddedPos] = basePadCharacter;
293                         paddedBaseQualities[paddedPos] = qualityPadCharacter;
294                         paddedPos++;
295                     }
296                 }
297             }
298             return new ImmutablePair<>(paddedBases, paddedBaseQualities);
299         }
300     }
301 
302     /**
303      * Returns the number of bases in a read minus the number of softclipped bases.
304      */
unclippedReadLength(final GATKRead read)305     public static int unclippedReadLength(final GATKRead read) {
306         int softClippedBases = 0;
307         for (CigarElement element : read.getCigarElements()) {
308             if (element.getOperator()== CigarOperator.SOFT_CLIP) {
309                 softClippedBases+= element.getLength();
310             }
311         }
312         return read.getLength() - softClippedBases;
313     }
314 
315 
316     public static class MismatchCount {
317         public int numMismatches = 0;
318         public long mismatchQualities = 0;
319     }
320 
321     // todo -- this code and mismatchesInRefWindow should be combined and optimized into a single
322     // todo -- high performance implementation.  We can do a lot better than this right now
323     /**
324      * @see #getMismatchCount(GATKRead, byte[], int, int, int) with startOnRead == 0 and nReadBases == read.getReadLength()
325      */
getMismatchCount(GATKRead r, byte[] refSeq, int refIndex)326     public static MismatchCount getMismatchCount(GATKRead r, byte[] refSeq, int refIndex) {
327         return getMismatchCount(r, refSeq, refIndex, 0, r.getLength());
328     }
329 
330     /**
331      * Count how many bases mismatch the reference.  Indels are not considered mismatching.
332      *
333      * @param r                   the sam record to check against
334      * @param refSeq              the byte array representing the reference sequence
335      * @param refIndex            the index in the reference byte array of the read's first base (the reference index
336      *                            is matching the alignment start, there may be tons of soft-clipped bases before/after
337      *                            that so it's wrong to compare with getLength() here.).  Note that refIndex is
338      *                            zero based, not 1 based
339      * @param startOnRead         the index in the read's bases from which we start counting
340      * @param nReadBases          the number of bases after (but including) startOnRead that we check
341      * @return non-null object representing the mismatch count
342      */
343     @SuppressWarnings("fallthrough")
getMismatchCount(GATKRead r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases)344     public static MismatchCount getMismatchCount(GATKRead r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) {
345         Utils.nonNull( r );
346         Utils.nonNull( refSeq );
347         if ( refIndex < 0 ) throw new IllegalArgumentException("attempting to calculate the mismatch count with a reference index that is negative");
348         if ( startOnRead < 0 ) throw new IllegalArgumentException("attempting to calculate the mismatch count with a read start that is negative");
349         if ( nReadBases < 0 ) throw new IllegalArgumentException("attempting to calculate the mismatch count for a negative number of read bases");
350         if ( refSeq.length - refIndex < (r.getEnd() - r.getStart()) )
351             throw new IllegalArgumentException("attempting to calculate the mismatch count against a reference string that is smaller than the read");
352 
353         MismatchCount mc = new MismatchCount();
354 
355         int readIdx = 0;
356         final int endOnRead = startOnRead + nReadBases - 1; // index of the last base on read we want to count (note we are including soft-clipped bases with this math)
357         final byte[] readSeq = r.getBases();
358         final Cigar c = r.getCigar();
359         final byte[] readQuals = r.getBaseQualities();
360         for (final CigarElement ce : c.getCigarElements()) {
361 
362             if (readIdx > endOnRead)
363                 break;
364 
365             final int elementLength = ce.getLength();
366             switch (ce.getOperator()) {
367                 case X:
368                     mc.numMismatches += elementLength;
369                     for (int j = 0; j < elementLength; j++)
370                         mc.mismatchQualities += readQuals[readIdx+j];
371                 case EQ:
372                     refIndex += elementLength;
373                     readIdx += elementLength;
374                     break;
375                 case M:
376                     for (int j = 0; j < elementLength; j++, refIndex++, readIdx++) {
377                         if (refIndex >= refSeq.length)
378                             continue;                      // TODO : It should never happen, we should throw exception here
379                         if (readIdx < startOnRead) continue;
380                         if (readIdx > endOnRead) break;
381                         byte refChr = refSeq[refIndex];
382                         byte readChr = readSeq[readIdx];
383                         // Note: we need to count X/N's as mismatches because that's what SAM requires
384                         //if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
385                         //     BaseUtils.simpleBaseToBaseIndex(refChr)  == -1 )
386                         //    continue; // do not count Ns/Xs/etc ?
387                         if (readChr != refChr) {
388                             mc.numMismatches++;
389                             mc.mismatchQualities += readQuals[readIdx];
390                         }
391                     }
392                     break;
393                 case I:
394                 case S:
395                     readIdx += elementLength;
396                     break;
397                 case D:
398                 case N:
399                     refIndex += elementLength;
400                     break;
401                 case H:
402                 case P:
403                     break;
404                 default:
405                     throw new GATKException("The " + ce.getOperator() + " cigar element is not currently supported");
406             }
407 
408         }
409         return mc;
410     }
411 
412     /**
413      * Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment.
414      * This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but
415      * it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is
416      * a much more efficient alternative to r.getAlignmentBlocks.size() in the situations when this number is all that is needed.
417      * Formally, this method simply returns the number of M elements in the cigar.
418      *
419      * @param r alignment
420      * @return number of continuous alignment blocks (i.e. 'M' elements of the cigar; all indel and clipping elements are ignored).
421      */
getNumAlignmentBlocks(final GATKRead r)422     public static int getNumAlignmentBlocks(final GATKRead r) {
423         Utils.nonNull( r );
424         final Cigar cigar = r.getCigar();
425         if (cigar == null) return 0;
426 
427         int n = 0;
428         for (final CigarElement e : cigar.getCigarElements()) {
429             if (ALIGNED_TO_GENOME_OPERATORS.contains(e.getOperator()))
430                 n++;
431         }
432 
433         return n;
434     }
435 
436 
437     /**
438      * Get the number of bases aligned to the genome, including soft clips
439      *
440      * If read is not mapped (i.e., doesn't have a cigar) returns 0
441      *
442      * @param r a non-null Read
443      * @return the number of bases aligned to the genome in R, including soft clipped bases
444      */
getNumAlignedBasesCountingSoftClips(final GATKRead r)445     public static int getNumAlignedBasesCountingSoftClips(final GATKRead r) {
446         int n = 0;
447         final Cigar cigar = r.getCigar();
448         if (cigar == null) return 0;
449 
450         for (final CigarElement e : cigar.getCigarElements())
451             if (ALIGNED_TO_GENOME_PLUS_SOFTCLIPS.contains(e.getOperator()))
452                 n += e.getLength();
453 
454         return n;
455     }
456 
457     /**
458      * Count the number of bases hard clipped from read
459      *
460      * If read's cigar is null, return 0
461      *
462      * @param r a non-null read
463      * @return a positive integer
464      */
getNumHardClippedBases(final GATKRead r)465     public static int getNumHardClippedBases(final GATKRead r) {
466         if ( r == null ) throw new IllegalArgumentException("Read cannot be null");
467 
468         int n = 0;
469         final Cigar cigar = r.getCigar();
470         if (cigar == null) return 0;
471 
472         for (final CigarElement e : cigar.getCigarElements())
473             if (e.getOperator() == CigarOperator.H)
474                 n += e.getLength();
475 
476         return n;
477     }
478 
479     /**
480      * Calculate the number of bases that are soft clipped in read with quality score greater than threshold
481      *
482      * Handles the case where the cigar is null (i.e., the read is unmapped), returning 0
483      *
484      * @param read a non-null read.
485      * @param qualThreshold consider bases with quals > this value as high quality.  Must be >= 0
486      * @return positive integer
487      */
countHighQualitySoftClips(final GATKRead read, final byte qualThreshold )488     public static int countHighQualitySoftClips(final GATKRead read, final byte qualThreshold ) {
489         Utils.nonNull(read);
490         ParamUtils.isPositiveOrZero(qualThreshold, "Expected qualThreshold to be positive");
491 
492         final Cigar cigar = read.getCigar();
493         if ( cigar == null ) {  // the read is unmapped
494             return 0;
495         }
496 
497         int numHQSoftClips = 0;
498         int alignPos = 0;
499         for ( final CigarElement elem : read.getCigarElements() ) {
500             final int elementLength = elem.getLength();
501             final CigarOperator operator = elem.getOperator();
502 
503             if (operator == CigarOperator.SOFT_CLIP) {
504                 for (int n = 0; n < elementLength; n++) {
505                     if( read.getBaseQuality(alignPos++) > qualThreshold ) {
506                         numHQSoftClips++;
507                     }
508                 }
509             } else if (operator.consumesReadBases()) {
510                 alignPos += elementLength;
511             }
512         }
513 
514         return numHQSoftClips;
515     }
516 
calcAlignmentByteArrayOffset(final Cigar cigar, final PileupElement pileupElement, final int alignmentStart, final int refLocus)517     public static int calcAlignmentByteArrayOffset(final Cigar cigar, final PileupElement pileupElement, final int alignmentStart, final int refLocus) {
518         return calcAlignmentByteArrayOffset( cigar, pileupElement.getOffset(), pileupElement.isDeletion(), alignmentStart, refLocus );
519     }
520 
521     /**
522      * Calculate the index into the read's bases of the beginning of the encompassing cigar element for a given cigar and offset
523      *
524      * @param cigar            the read's CIGAR -- cannot be null
525      * @param offset           the offset to use for the calculation or -1 if in the middle of a deletion
526      * @param isDeletion       are we in the middle of a deletion?
527      * @param alignmentStart   the alignment start of the read
528      * @param refLocus         the reference position of the offset
529      * @return a non-negative int index
530      */
calcAlignmentByteArrayOffset(final Cigar cigar, final int offset, final boolean isDeletion, final int alignmentStart, final int refLocus)531     public static int calcAlignmentByteArrayOffset(final Cigar cigar, final int offset, final boolean isDeletion, final int alignmentStart, final int refLocus) {
532         if ( cigar == null ) throw new IllegalArgumentException("attempting to find the alignment position from a CIGAR that is null");
533         if ( offset < -1 ) throw new IllegalArgumentException("attempting to find the alignment position with an offset that is negative (and not -1)");
534         if ( alignmentStart < 0 ) throw new IllegalArgumentException("attempting to find the alignment position from an alignment start that is negative");
535         if ( refLocus < 0 ) throw new IllegalArgumentException("attempting to find the alignment position from a reference position that is negative");
536         if ( offset >= cigar.getReadLength() ) throw new IllegalArgumentException("attempting to find the alignment position of an offset than is larger than the read length");
537 
538         int pileupOffset = offset;
539 
540         // Reassign the offset if we are in the middle of a deletion because of the modified representation of the read bases
541         if (isDeletion) {
542             pileupOffset = refLocus - alignmentStart;
543             final CigarElement ce = cigar.getCigarElement(0);
544             if (ce.getOperator() == CigarOperator.S) {
545                 pileupOffset += ce.getLength();
546             }
547         }
548 
549         int pos = 0;
550         int alignmentPos = 0;
551 
552         for (int iii = 0; iii < cigar.numCigarElements(); iii++) {
553             final CigarElement ce = cigar.getCigarElement(iii);
554             final int elementLength = ce.getLength();
555 
556             switch (ce.getOperator()) {
557                 case I:
558                 case S: // TODO -- I don't think that soft clips should be treated the same as inserted bases here. Investigation needed.
559                     pos += elementLength;
560                     if (pos >= pileupOffset) {
561                         return alignmentPos;
562                     }
563                     break;
564                 case D:
565                     if (!isDeletion) {
566                         alignmentPos += elementLength;
567                     } else {
568                         if (pos + elementLength - 1 >= pileupOffset) {
569                             return alignmentPos + (pileupOffset - pos);
570                         } else {
571                             pos += elementLength;
572                             alignmentPos += elementLength;
573                         }
574                     }
575                     break;
576                 case M:
577                 case EQ:
578                 case X:
579                     if (pos + elementLength - 1 >= pileupOffset) {
580                         return alignmentPos + (pileupOffset - pos);
581                     } else {
582                         pos += elementLength;
583                         alignmentPos += elementLength;
584                     }
585                     break;
586                 case H:
587                 case P:
588                 case N:
589                     break;
590                 default:
591                     throw new GATKException("Unsupported cigar operator: " + ce.getOperator());
592             }
593         }
594 
595         return alignmentPos;
596     }
597 
598     /**
599      * Is the offset inside a deletion?
600      *
601      * @param cigar         the read's CIGAR -- cannot be null
602      * @param offset        the offset into the CIGAR
603      * @return true if the offset is inside a deletion, false otherwise
604      */
isInsideDeletion(final Cigar cigar, final int offset)605     public static boolean isInsideDeletion(final Cigar cigar, final int offset) {
606         Utils.nonNull(cigar);
607         if ( offset < 0 ) return false;
608 
609         // pos counts read bases
610         int pos = 0;
611         int prevPos = 0;
612 
613         for (final CigarElement ce : cigar.getCigarElements()) {
614 
615             switch (ce.getOperator()) {
616                 case I:
617                 case S:
618                 case D:
619                 case M:
620                 case EQ:
621                 case X:
622                     prevPos = pos;
623                     pos += ce.getLength();
624                     break;
625                 case H:
626                 case P:
627                 case N:
628                     break;
629                 default:
630                     throw new GATKException("Unsupported cigar operator: " + ce.getOperator());
631             }
632 
633             // Is the offset inside a deletion?
634             if ( prevPos < offset && pos >= offset && ce.getOperator() == CigarOperator.D ) {
635                 return true;
636 
637             }
638         }
639 
640         return false;
641     }
642 
643     /**
644      * Generate an array of bases for just those that are aligned to the reference (i.e. no clips or insertions)
645      *
646      * @param cigar            the read's CIGAR -- cannot be null
647      * @param read             the read's base array
648      * @return a non-null array of bases (bytes)
649      */
650     @SuppressWarnings("fallthrough")
readToAlignmentByteArray(final Cigar cigar, final byte[] read)651     public static byte[] readToAlignmentByteArray(final Cigar cigar, final byte[] read) {
652         Utils.nonNull(cigar);
653         Utils.nonNull(read);
654 
655         final int alignmentLength = cigar.getReferenceLength();
656         final byte[] alignment = new byte[alignmentLength];
657         int alignPos = 0;
658         int readPos = 0;
659         for (int iii = 0; iii < cigar.numCigarElements(); iii++) {
660 
661             final CigarElement ce = cigar.getCigarElement(iii);
662             final int elementLength = ce.getLength();
663 
664             switch (ce.getOperator()) {
665                 case I:
666                     if (alignPos > 0) {
667                         final int prevPos = alignPos - 1;
668                         if (alignment[prevPos] == BaseUtils.Base.A.base) {
669                             alignment[prevPos] = PileupElement.A_FOLLOWED_BY_INSERTION_BASE;
670                         } else if (alignment[prevPos] == BaseUtils.Base.C.base) {
671                             alignment[prevPos] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE;
672                         } else if (alignment[prevPos] == BaseUtils.Base.T.base) {
673                             alignment[prevPos] = PileupElement.T_FOLLOWED_BY_INSERTION_BASE;
674                         } else if (alignment[prevPos] == BaseUtils.Base.G.base) {
675                             alignment[prevPos] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE;
676                         }
677                     }
678                 case S:
679                     readPos += elementLength;
680                     break;
681                 case D:
682                 case N:
683                     for (int jjj = 0; jjj < elementLength; jjj++) {
684                         alignment[alignPos++] = PileupElement.DELETION_BASE;
685                     }
686                     break;
687                 case M:
688                 case EQ:
689                 case X:
690                     for (int jjj = 0; jjj < elementLength; jjj++) {
691                         alignment[alignPos++] = read[readPos++];
692                     }
693                     break;
694                 case H:
695                 case P:
696                     break;
697                 default:
698                     throw new GATKException("Unsupported cigar operator: " + ce.getOperator());
699             }
700         }
701         return alignment;
702     }
703 
lengthOnRead(final CigarElement element)704     private static int lengthOnRead(final CigarElement element) {
705         return element.getOperator().consumesReadBases() ? element.getLength() : 0;
706     }
707 
lengthOnReference(final CigarElement element)708     private static int lengthOnReference(final CigarElement element) {
709         return element.getOperator().consumesReferenceBases() ? element.getLength() : 0;
710     }
711 
712     /**
713      * Takes the alignment of the read sequence <code>readSeq</code> to the reference sequence <code>refSeq</code>
714      * starting at 0-based position <code>readStart</code> on the <code>ref</code> and specified by its <code>cigar</code>.
715      * <p/>
716      * If the alignment has one or more indels, this method attempts to move them left across a stretch of repetitive bases.
717      * For instance, if the original cigar specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output
718      * cigar will always mark the leftmost AT as deleted. If there is no indel in the original cigar or if the indel position
719      * is determined unambiguously (i.e. inserted/deleted sequence is not repeated), the original cigar is returned.
720      *
721      * Soft-clipped bases in the cigar are presumed to correspond to bases in the byte[] of read sequence.  That is, this method
722      * assumes that inputs are precise about the distinction between hard clips (removed from the read sequence) and soft clips
723      * (kept in the read sequence but not aligned).  For example, with the inputs {cigar: 2S2M2I, read sequence: TTAAAA, ref sequence: GGAA, read start: 2}
724      * the method lines up the AAAA (2M2I) of the read with the AA of the ref and left-aligns the indel to yield a cigar of
725      * 2S2I2M.
726      *
727      * @param cigar     structure of the original alignment
728      * @param ref    reference sequence the read is aligned to
729      * @param read   read sequence
730      * @param readStart  0-based position on ref of the first aligned base in the read sequence
731      * @return a non-null cigar, in which the indels are guaranteed to be placed at the leftmost possible position across a repeat (if any)
732      */
leftAlignIndels(final Cigar cigar, final byte[] ref, final byte[] read, final int readStart)733     public static CigarBuilder.Result leftAlignIndels(final Cigar cigar, final byte[] ref, final byte[] read, final int readStart) {
734         ParamUtils.isPositiveOrZero(readStart, "read start within reference base array must be non-negative");
735 
736         if (cigar.getCigarElements().stream().noneMatch(elem -> elem.getOperator().isIndel())) {
737             return new CigarBuilder.Result(cigar, 0, 0);
738         }
739 
740         // we need reference bases from the start of the read to the rightmost indel
741         final int lastIndel = IntStream.range(0, cigar.numCigarElements()).filter(n -> cigar.getCigarElement(n).getOperator().isIndel()).max().getAsInt();
742         final int necessaryRefLength = readStart + cigar.getCigarElements().stream().limit(lastIndel + 1).mapToInt(e -> lengthOnReference(e)).sum();
743         Utils.validateArg(necessaryRefLength <= ref.length, "read goes past end of reference");
744 
745         // at this point, we are one base past the end of the read.  Now we traverse the cigar from right to left
746         final List<CigarElement> resultRightToLeft = new ArrayList<>();
747         final int refLength = cigar.getReferenceLength();
748         final IndexRange refIndelRange = new IndexRange(readStart + refLength, readStart + refLength);
749         final IndexRange readIndelRange = new IndexRange(read.length,read.length);
750         for (int n = cigar.numCigarElements() - 1; n >= 0; n--) {
751             final CigarElement element = cigar.getCigarElement(n);
752             // if it's an indel, just accumulate the read and ref bases consumed.  We won't shift the indel until we hit an alignment
753             // block or the read start.
754             if (element.getOperator().isIndel()) {
755                 refIndelRange.shiftStartLeft(lengthOnReference(element));
756                 readIndelRange.shiftStartLeft(lengthOnRead(element));
757             } else if (refIndelRange.size() == 0 && readIndelRange.size() == 0) {   // no indel, just add the cigar element to the result
758                 resultRightToLeft.add(element);
759                 refIndelRange.shiftLeft(lengthOnReference(element));
760                 readIndelRange.shiftLeft(lengthOnRead(element));
761             } else {    // there's an indel that we need to left-align
762                 // we can left-align into match cigar elements but not into clips
763                 final int maxShift = element.getOperator().isAlignment() ? element.getLength() : 0;
764                 final Pair<Integer, Integer> shifts = normalizeAlleles(Arrays.asList(ref, read), Arrays.asList(refIndelRange, readIndelRange), maxShift, true);
765 
766                 // account for new match alignments on the right due to left-alignment
767                 resultRightToLeft.add(new CigarElement(shifts.getRight(), CigarOperator.MATCH_OR_MISMATCH));
768 
769                 // emit if we didn't go all the way to the start of an alignment block OR we have reached clips OR we have reached the start of the cigar
770                 final boolean emitIndel = n == 0 || shifts.getLeft() < maxShift || !element.getOperator().isAlignment();
771                 final int newMatchOnLeftDueToTrimming = shifts.getLeft() < 0 ? -shifts.getLeft() : 0;   // we may have actually shifted right to make the alleles parsimonious
772                 final int remainingBasesOnLeft = shifts.getLeft() < 0 ? element.getLength() : (element.getLength() - shifts.getLeft());
773 
774                 if (emitIndel) {  // some of this alignment block remains after left-alignment -- emit the indel
775                     resultRightToLeft.add(new CigarElement(refIndelRange.size(), CigarOperator.DELETION));
776                     resultRightToLeft.add(new CigarElement(readIndelRange.size(), CigarOperator.INSERTION));
777                     refIndelRange.shiftEndLeft(refIndelRange.size());       // ref indel range is now empty and points to start of left-aligned indel
778                     readIndelRange.shiftEndLeft(readIndelRange.size());     // read indel range is now empty and points to start of left-aligned indel
779 
780                     refIndelRange.shiftLeft(newMatchOnLeftDueToTrimming + (element.getOperator().consumesReferenceBases() ? remainingBasesOnLeft : 0));
781                     readIndelRange.shiftLeft(newMatchOnLeftDueToTrimming + (element.getOperator().consumesReadBases() ? remainingBasesOnLeft : 0));
782                     // now read and ref indel ranges are empty and point to end of element preceding this block
783                 }
784                 resultRightToLeft.add(new CigarElement(newMatchOnLeftDueToTrimming, CigarOperator.MATCH_OR_MISMATCH));
785                 resultRightToLeft.add(new CigarElement(remainingBasesOnLeft, element.getOperator()));
786             }
787         }
788 
789         // account for any indels at the start of the cigar that weren't processed because they have no adjacent non-indel element to the left
790         resultRightToLeft.add(new CigarElement(refIndelRange.size(), CigarOperator.DELETION));
791         resultRightToLeft.add(new CigarElement(readIndelRange.size(), CigarOperator.INSERTION));
792 
793         Utils.validateArg(readIndelRange.getStart() == 0, "Given cigar does not account for all bases of the read");
794         return new CigarBuilder().addAll(Lists.reverse(resultRightToLeft)).makeAndRecordDeletionsRemovedResult();
795     }
796 
797     /**
798      *  Example usage:  reference = GAAT, read = GAAAT (insertion of one A) and we initially consider the insertion of the A to occur before
799      *  the T.  Thus the reference range of this allele is [3,3) (no bases) and the read range is [3,4).  This will be left-aligned so that
800      *  the insertion occurs after the G, so that the ranges become [1,1) and [1,2) and the returned shifts are 2 bases for both the start and end
801      *  of the range.
802      *
803      *  If the given allele ranges are not parsimonious, for example [3,4) and [3,5) in the above example to include the common T in both alleles,
804      *  the resulting ranges will be shifted by different amounts.  In this case, the shifts are 2 bases in the front and 3 at the end.
805      *
806      *  Note that we use the convention that the ref allele in the case of an alt insertion, or the alt allele in case of a deletion, is represented
807      *  by [n,n) where n is the last aligned coordinate before the indel.  This makes sense when you think in terms of alignment CIGARs:
808      *  suppose for example we have a 5M5I5M read with start 100.  Then the match bases are [100,105) on the ref and [0,5) on the read and the inserted bases are
809      *  [5,10) on the read and [5,5) on the reference.
810      *
811      * @param sequences bases of sequences containing different alleles -- could be reference, a haplotype, a read, or subsequences thereof
812      * @param bounds    initial ranges (inclusive start, exclusive end) of alleles in same order as {@code sequences}
813      *                  Note that this method adjusts these ranges as a side effect
814      * @param maxShift  maximum allowable shift left.  This may be less than the amount demanded by the array bounds.  For example, when
815      *                  left-aligning a read with multiple indels, we don't want to realign one indel past another (if they "collide" we merge
816      *                  them into a single indel and continue -- see {@link AlignmentUtils::leftAlignIndels}
817      * @param trim      If true, remove redundant shared bases at the start and end of alleles
818      * @return          The number of bases the alleles were shifted left such that they still represented the same event.
819      */
820     public static Pair<Integer, Integer> normalizeAlleles(final List<byte[]> sequences, final List<IndexRange> bounds, final int maxShift, final boolean trim) {
821         Utils.nonEmpty(sequences);
822         Utils.validateArg(sequences.size() == bounds.size(), "Must have one initial allele range per sequence");
823         bounds.forEach(bound -> Utils.validateArg(maxShift <= bound.getStart(), "maxShift goes past the start of a sequence"));
824 
825         int startShift = 0;
826         int endShift = 0;
827 
828         // consume any redundant shared bases at the end of the alleles
829         int minSize = bounds.stream().mapToInt(IndexRange::size).min().getAsInt();
830         while (trim && minSize > 0 && lastBaseOnRightIsSame(sequences, bounds)) {
831             bounds.forEach(bound -> bound.shiftEndLeft(1));
832             minSize--;
833             endShift++;
834         }
835 
836         while (trim && minSize > 0 && firstBaseOnLeftIsSame(sequences, bounds)) {
837             bounds.forEach(bound -> bound.shiftStart(1));
838             minSize--;
839             startShift--;
840         }
841 
842         // we shift left as long as the last bases on the right are equal among all sequences and the next bases on the left are all equal.
843         // if a sequence is empty (eg the reference relative to an insertion alt allele) the last base on the right is the next base on the left
844         while( startShift < maxShift && nextBaseOnLeftIsSame(sequences, bounds) && lastBaseOnRightIsSame(sequences, bounds)) {
845                 bounds.forEach(b -> b.shiftLeft(1));
846                 startShift++;
847                 endShift++;
848         }
849 
850         return ImmutablePair.of(startShift, endShift);
851     }
852 
853     // do all sequences share a common base at the end of the given index range
lastBaseOnRightIsSame(List<byte[]> sequences, List<IndexRange> bounds)854     private static boolean lastBaseOnRightIsSame(List<byte[]> sequences, List<IndexRange> bounds) {
855         final byte lastBaseOnRight = sequences.get(0)[bounds.get(0).getEnd() - 1];
856         for(int n = 0; n < sequences.size(); n++) {
857             if (sequences.get(n)[bounds.get(n).getEnd() - 1] != lastBaseOnRight) {
858                 return false;
859             }
860         }
861         return true;
862     }
863 
864     // do all sequences share a common first base
firstBaseOnLeftIsSame(final List<byte[]> sequences, final List<IndexRange> bounds)865     private static boolean firstBaseOnLeftIsSame(final List<byte[]> sequences, final List<IndexRange> bounds) {
866         final byte nextBaseOnLeft = sequences.get(0)[bounds.get(0).getStart()];
867         for(int n = 0; n < sequences.size(); n++) {
868             if (sequences.get(n)[bounds.get(n).getStart()] != nextBaseOnLeft) {
869                 return false;
870             }
871         }
872         return true;
873     }
874 
875     // do all sequences share a common base just before the given index ranges
nextBaseOnLeftIsSame(final List<byte[]> sequences, final List<IndexRange> bounds)876     private static boolean nextBaseOnLeftIsSame(final List<byte[]> sequences, final List<IndexRange> bounds) {
877         final byte nextBaseOnLeft = sequences.get(0)[bounds.get(0).getStart() - 1];
878         for(int n = 0; n < sequences.size(); n++) {
879             if (sequences.get(n)[bounds.get(n).getStart() - 1] != nextBaseOnLeft) {
880                 return false;
881             }
882         }
883         return true;
884     }
885 
886     /**
887      * Given a read's first aligned base on an alt haplotype, find the first aligned base in the reference haplotype.  This
888      * method assumes that the alt haplotype and reference haplotype start at the same place.  That is, the alt haplotype
889      * starts at index 0 within the reference base array.
890      *
891      * @param haplotypeVsRefCigar
892      * @param readStartOnHaplotype
893      * @return
894      */
895     @VisibleForTesting
readStartOnReferenceHaplotype(final Cigar haplotypeVsRefCigar, final int readStartOnHaplotype)896     static int readStartOnReferenceHaplotype(final Cigar haplotypeVsRefCigar, final int readStartOnHaplotype) {
897         if (readStartOnHaplotype == 0) {
898             return 0;
899         }
900         // move forward in the haplotype vs ref cigar until we have consumed enough haplotype bases to reach the read start
901         // the number of reference bases consumed during this traversal gives us the reference start
902         int refBasesConsumedBeforeReadStart = 0;
903         int haplotypeBasesConsumed = 0;
904         for (final CigarElement element : haplotypeVsRefCigar) {
905             refBasesConsumedBeforeReadStart += lengthOnReference(element);
906             haplotypeBasesConsumed += lengthOnRead(element);
907 
908             if (haplotypeBasesConsumed >= readStartOnHaplotype) {
909                 final int excess = element.getOperator().consumesReferenceBases() ? haplotypeBasesConsumed - readStartOnHaplotype : 0;
910                 return refBasesConsumedBeforeReadStart - excess;
911             }
912         }
913 
914         throw new IllegalArgumentException("Cigar doesn't reach the read start");
915     }
916 
917     /**
918      * Removing a trailing deletion from the incoming cigar if present
919      *
920      * @param c the cigar we want to update
921      * @return a non-null Cigar
922      */
removeTrailingDeletions(final Cigar c)923     public static Cigar removeTrailingDeletions(final Cigar c) {
924 
925         final List<CigarElement> elements = c.getCigarElements();
926         if ( elements.get(elements.size() - 1).getOperator() != CigarOperator.D )
927             return c;
928 
929         return new Cigar(elements.subList(0, elements.size() - 1));
930     }
931 
932     /**
933      * Does cigar start or end with a deletion operation?
934      *
935      * WARNING: there is usually no good reason to use this method, because one should handle the leading and trailing
936      * deletion via the {@link CigarBuilder} class.  Do not use this method when you can instead use {@link CigarBuilder}.
937      *
938      * @param cigar a non-null cigar to test
939      * @return true if the first or last operator of cigar is a D
940      */
startsOrEndsWithInsertionOrDeletion(final Cigar cigar)941     public static boolean startsOrEndsWithInsertionOrDeletion(final Cigar cigar) {
942         Utils.nonNull(cigar);
943 
944         if ( cigar.isEmpty() )
945             return false;
946 
947         final CigarOperator first = cigar.getCigarElement(0).getOperator();
948         final CigarOperator last = cigar.getCigarElement(cigar.numCigarElements()-1).getOperator();
949         return first == CigarOperator.D || first == CigarOperator.I || last == CigarOperator.D || last == CigarOperator.I;
950     }
951 
952 
953     /**
954      * Trim cigar down to one that starts at start reference on the left and extends to end on the reference
955      *
956      * @param cigar a non-null Cigar to trim down
957      * @param start Where should we start keeping bases on the reference?  The first position is 0
958      * @param end Where should we stop keeping bases on the reference?  The maximum value is cigar.getReferenceLength()
959      * @return a new Cigar with reference length == start - end + 1
960      */
trimCigarByReference(final Cigar cigar, final int start, final int end)961     public static CigarBuilder.Result trimCigarByReference(final Cigar cigar, final int start, final int end) {
962         return trimCigar(cigar, start, end, true);
963     }
964 
965     /**
966      * Trim cigar down to one that starts at start base in the cigar and extends to (inclusive) end base
967      *
968      * @param cigar a non-null Cigar to trim down
969      * @param start Where should we start keeping bases in the cigar (inclusive)?  The first position is 0
970      * @param end Where should we stop keeping bases in the cigar (inclusive)?  The maximum value is cigar.getLength() - 1
971      * @return a new Cigar containing == start - end + 1 reads
972      */
trimCigarByBases(final Cigar cigar, final int start, final int end)973     public static CigarBuilder.Result trimCigarByBases(final Cigar cigar, final int start, final int end) {
974         return trimCigar(cigar, start, end, false);
975     }
976 
977 
978     /**
979      * Workhorse for trimCigarByBases and trimCigarByReference
980      *
981      * @param cigar a non-null Cigar to trim down
982      * @param start Where should we start keeping bases in the cigar (inclusive)?  The first position is 0
983      * @param end Where should we stop keeping bases in the cigar (inclusive)?  The maximum value is cigar.getLength() - 1
984      * @param byReference should start and end be interpreted as position in the reference or the read to trim to/from?
985      * @return a non-null cigar
986      */
987     @SuppressWarnings("fallthrough")
trimCigar(final Cigar cigar, final int start, final int end, final boolean byReference)988     private static CigarBuilder.Result trimCigar(final Cigar cigar, final int start, final int end, final boolean byReference) {
989         ParamUtils.isPositiveOrZero(start, "start position can't be negative");
990         Utils.validateArg(end >= start, () -> "end " + end + " is before start " + start);
991         final CigarBuilder newElements = new CigarBuilder();
992 
993         // these variables track the inclusive start and exclusive end of the current cigar element in reference (if byReference) or read (otherwise) coordinates
994         int elementStart;           // inclusive
995         int elementEnd = 0;         // exclusive -- start of next element
996         for ( final CigarElement elt : cigar.getCigarElements() ) {
997             elementStart = elementEnd;
998             elementEnd = elementStart + (byReference ? lengthOnReference(elt) : lengthOnRead(elt));
999 
1000             // we are careful to include zero-length elements at both ends, that is, elements with elementStart == elementEnd == start and elementStart == elementEnd == end + 1
1001             if (elementEnd < start || (elementEnd == start && elementStart < start)) {
1002                 continue;
1003             } else if (elementStart > end && elementEnd > end + 1) {
1004                 break;
1005             }
1006 
1007             final int overlapLength = elementEnd == elementStart ? elt.getLength() : Math.min(end + 1, elementEnd) - Math.max(start, elementStart);
1008             newElements.add(new CigarElement(overlapLength, elt.getOperator()));
1009         }
1010         Utils.validateArg(elementEnd > end, () -> "cigar elements don't reach end position (inclusive) " + end);
1011 
1012         return newElements.makeAndRecordDeletionsRemovedResult();
1013     }
1014 
1015     /**
1016      * Generate a new Cigar that maps the operations of the first cigar through those in a second
1017      *
1018      * For example, if first is 5M and the second is 2M1I2M then the result is 2M1I2M.
1019      * However, if first is 1M2D3M and second is 2M1I3M this results in a cigar X
1020      *
1021      * ref   : AC-GTA
1022      * hap   : ACxGTA  - 2M1I3M
1023      * read  : A--GTA  - 1M2D3M
1024      * result: A--GTA => 1M1D3M
1025      *
1026      * ref   : ACxG-TA
1027      * hap   : AC-G-TA  - 2M1D3M
1028      * read  : AC-GxTA  - 3M1I2M
1029      * result: AC-GxTA => 2M1D1M1I2M
1030      *
1031      * ref   : ACGTA
1032      * hap   : ACGTA  - 5M
1033      * read  : A-GTA  - 1M1I3M
1034      * result: A-GTA => 1M1I3M
1035      *
1036      * ref   : ACGTAC
1037      * hap   : AC---C  - 2M3D1M
1038      * read  : AC---C  - 3M
1039      * result: AG---C => 2M3D
1040      *
1041      * The constraint here is that both cigars should imply that the result have the same number of
1042      * reference bases (i.e.g, cigar.getReferenceLength() are equals).
1043      *
1044      * @param firstToSecond the cigar mapping hap1 -> hap2
1045      * @param secondToThird the cigar mapping hap2 -> hap3
1046      * @return A cigar mapping hap1 -> hap3
1047      */
applyCigarToCigar(final Cigar firstToSecond, final Cigar secondToThird)1048     public static Cigar applyCigarToCigar(final Cigar firstToSecond, final Cigar secondToThird) {
1049         final CigarBuilder newElements = new CigarBuilder();
1050         final int nElements12 = firstToSecond.numCigarElements();
1051         final int nElements23 = secondToThird.numCigarElements();
1052 
1053         int cigar12I = 0, cigar23I = 0;
1054         int elt12I = 0, elt23I = 0;
1055 
1056         while ( cigar12I < nElements12 && cigar23I < nElements23 ) {
1057             final CigarElement elt12 = firstToSecond.getCigarElement(cigar12I);
1058             final CigarElement elt23 = secondToThird.getCigarElement(cigar23I);
1059 
1060             final CigarPairTransform transform = getTransformer(elt12.getOperator(), elt23.getOperator());
1061 
1062             if ( transform.op13 != null ) // skip no ops
1063                 newElements.add(new CigarElement(1, transform.op13));
1064 
1065             elt12I += transform.advance12;
1066             elt23I += transform.advance23;
1067 
1068             // if have exhausted our current element, advance to the next one
1069             if ( elt12I == elt12.getLength() ) { cigar12I++; elt12I = 0; }
1070             if ( elt23I == elt23.getLength() ) { cigar23I++; elt23I = 0; }
1071         }
1072 
1073         return newElements.make();
1074     }
1075 
getTransformer(final CigarOperator op12, final CigarOperator op23)1076     private static CigarPairTransform getTransformer(final CigarOperator op12, final CigarOperator op23) {
1077         for ( final CigarPairTransform transform : cigarPairTransformers) {
1078             if ( transform.op12.contains(op12) && transform.op23.contains(op23) )
1079                 return transform;
1080         }
1081 
1082         throw new IllegalStateException("No transformer for operators " + op12 + " and " + op23);
1083     }
1084 
1085     /**
1086      * transformations that project one alignment state through another
1087      *
1088      * Think about this as a state machine, where we have:
1089      *
1090      * bases3 : xxx A zzz
1091      * bases2 : xxx B zzz
1092      * bases1 : xxx C zzz
1093      *
1094      * where A, B and C are alignment states of a three way alignment.  We want to capture
1095      * the transition from operation mapping 1 -> 2 and an operation mapping 2 -> 3 and its
1096      * associated mapping from 1 -> 3 and the advancement of the cigar states of 1->2 and 2->3.
1097      *
1098      * Imagine that A, B, and C are all equivalent (so that op12 = M and op23 = M).  This implies
1099      * a mapping of 1->3 of M, and in this case the next states to consider in the 3 way alignment
1100      * are the subsequent states in 1 and 2 (so that advance12 and advance23 are both 1).
1101      *
1102      * Obviously not all of the states and their associated transitions are so simple.  Suppose instead
1103      * that op12 = I, and op23 = M.  What does this look like:
1104      *
1105      * bases3 : xxx - A zzz
1106      * bases2 : xxx - B zzz
1107      * bases1 : xxx I C zzz
1108      *
1109      * It means that op13 must be an insertion (as we have an extra base in 1 thats not present in 2 and
1110      * so not present in 3).  We advance the cigar in 1 by 1 (as we've consumed one base in 1 for the I)
1111      * but we haven't yet found the base corresponding to the M of op23.  So we don't advance23.
1112      */
1113     private static final class CigarPairTransform {
1114         private final EnumSet<CigarOperator> op12, op23;
1115         private final CigarOperator op13;
1116         private final int advance12, advance23;
1117 
CigarPairTransform(CigarOperator op12, CigarOperator op23, CigarOperator op13, int advance12, int advance23)1118         private CigarPairTransform(CigarOperator op12, CigarOperator op23, CigarOperator op13, int advance12, int advance23) {
1119             this.op12 = getCigarSet(op12);
1120             this.op23 = getCigarSet(op23);
1121             this.op13 = op13;
1122             this.advance12 = advance12;
1123             this.advance23 = advance23;
1124         }
1125 
getCigarSet(final CigarOperator masterOp)1126         private static EnumSet<CigarOperator> getCigarSet(final CigarOperator masterOp) {
1127             switch ( masterOp ) {
1128                 case M: return EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X);
1129                 case I: return EnumSet.of(CigarOperator.I, CigarOperator.S);
1130                 case D: return EnumSet.of(CigarOperator.D);
1131                 default: throw new IllegalStateException("Unexpected state " + masterOp);
1132             }
1133         }
1134 
1135         @Override
toString()1136         public String toString() {
1137             return "CigarPairTransform{" +
1138                     "op12=" + op12 +
1139                     ", op23=" + op23 +
1140                     ", op13=" + op13 +
1141                     ", advance12=" + advance12 +
1142                     ", advance23=" + advance23 +
1143                     '}';
1144         }
1145     }
1146 
1147 
1148     private static final List<CigarPairTransform> cigarPairTransformers = Arrays.asList(
1149             //
1150             // op12 is a match
1151             //
1152             // 3: xxx B yyy
1153             // ^^^^^^^^^^^^
1154             // 2: xxx M yyy
1155             // 1: xxx M yyy
1156             new CigarPairTransform(CigarOperator.M, CigarOperator.M, CigarOperator.M, 1, 1),
1157             // 3: xxx I yyy
1158             // ^^^^^^^^^^^^
1159             // 2: xxx I yyy
1160             // 1: xxx M yyy
1161             new CigarPairTransform(CigarOperator.M, CigarOperator.I, CigarOperator.I, 1, 1),
1162             // 3: xxx D yyy
1163             // ^^^^^^^^^^^^
1164             // 2: xxx D yyy
1165             // 1: xxx M yyy
1166             new CigarPairTransform(CigarOperator.M, CigarOperator.D, CigarOperator.D, 0, 1),
1167 
1168             //
1169             // op12 is a deletion
1170             //
1171             // 3: xxx D M yyy
1172             // ^^^^^^^^^^^^
1173             // 2: xxx M yyy
1174             // 1: xxx D yyy
1175             new CigarPairTransform(CigarOperator.D, CigarOperator.M, CigarOperator.D, 1, 1),
1176             // 3: xxx D2 D1 yyy
1177             // ^^^^^^^^^^^^
1178             // 2: xxx D2 yyy
1179             // 1: xxx D1 yyy
1180             new CigarPairTransform(CigarOperator.D, CigarOperator.D, CigarOperator.D, 0, 1),
1181             // 3: xxx X yyy => no-op, we skip emitting anything here
1182             // ^^^^^^^^^^^^
1183             // 2: xxx I yyy
1184             // 1: xxx D yyy
1185             new CigarPairTransform(CigarOperator.D, CigarOperator.I, null, 1, 1),
1186 
1187             //
1188             // op12 is a insertion
1189             //
1190             // 3: xxx I M yyy
1191             // ^^^^^^^^^^^^
1192             // 2: xxx M yyy
1193             // 1: xxx I yyy
1194             new CigarPairTransform(CigarOperator.I, CigarOperator.M, CigarOperator.I, 1, 0),
1195             // 3: xxx I D yyy
1196             // ^^^^^^^^^^^^
1197             // 2: xxx D yyy
1198             // 1: xxx I yyy
1199             new CigarPairTransform(CigarOperator.I, CigarOperator.D, CigarOperator.I, 1, 0),
1200             // 3: xxx I1 I2 yyy
1201             // ^^^^^^^^^^^^
1202             // 2: xxx I2 yyy
1203             // 1: xxx I1 yyy
1204             new CigarPairTransform(CigarOperator.I, CigarOperator.I, CigarOperator.I, 1, 0)
1205     );
1206 }