1 package org.broadinstitute.hellbender.utils.read; 2 3 import com.google.common.annotations.VisibleForTesting; 4 import com.google.common.collect.Lists; 5 import htsjdk.samtools.Cigar; 6 import htsjdk.samtools.CigarElement; 7 import htsjdk.samtools.CigarOperator; 8 import org.apache.commons.lang3.tuple.ImmutablePair; 9 import org.apache.commons.lang3.tuple.Pair; 10 import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy; 11 import org.broadinstitute.hellbender.exceptions.GATKException; 12 import org.broadinstitute.hellbender.utils.BaseUtils; 13 import org.broadinstitute.hellbender.utils.IndexRange; 14 import org.broadinstitute.hellbender.utils.Utils; 15 import org.broadinstitute.hellbender.utils.clipping.ReadClipper; 16 import org.broadinstitute.hellbender.utils.haplotype.Haplotype; 17 import org.broadinstitute.hellbender.utils.param.ParamUtils; 18 import org.broadinstitute.hellbender.utils.pileup.PileupElement; 19 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner; 20 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment; 21 22 import java.util.*; 23 import java.util.stream.IntStream; 24 25 26 public final class AlignmentUtils { 27 private static final EnumSet<CigarOperator> ALIGNED_TO_GENOME_OPERATORS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X); 28 private static final EnumSet<CigarOperator> ALIGNED_TO_GENOME_PLUS_SOFTCLIPS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.S); 29 public final static String HAPLOTYPE_TAG = "HC"; 30 public final static byte GAP_CHARACTER = (byte)'-'; 31 32 // cannot be instantiated AlignmentUtils()33 private AlignmentUtils() { } 34 35 /** 36 * Aligns reads the haplotype, and then projects this alignment of read -> hap onto the reference 37 * via the alignment of haplotype (via its getCigar) method. 38 * 39 * @param originalRead the read we want to write aligned to the reference genome 40 * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference 41 * @param referenceStart the start of the reference that haplotype is aligned to. Provides global coordinate frame. 42 * @param isInformative true if the read is differentially informative for one of the haplotypes 43 * 44 * @param aligner 45 * @throws IllegalArgumentException if {@code originalRead} is {@code null} or {@code haplotype} is {@code null} or it 46 * does not have a Cigar or the {@code referenceStart} is invalid (less than 1). 47 * 48 * @return a GATKRead aligned to reference. Never {@code null}. 49 */ createReadAlignedToRef(final GATKRead originalRead, final Haplotype haplotype, final Haplotype refHaplotype, final int referenceStart, final boolean isInformative, final SmithWatermanAligner aligner)50 public static GATKRead createReadAlignedToRef(final GATKRead originalRead, 51 final Haplotype haplotype, 52 final Haplotype refHaplotype, 53 final int referenceStart, 54 final boolean isInformative, 55 final SmithWatermanAligner aligner) { 56 Utils.nonNull(originalRead); 57 Utils.nonNull(haplotype); 58 Utils.nonNull(refHaplotype); 59 Utils.nonNull(haplotype.getCigar()); 60 Utils.nonNull(aligner); 61 if ( referenceStart < 1 ) { throw new IllegalArgumentException("reference start much be >= 1 but got " + referenceStart); } 62 63 // compute the smith-waterman alignment of read -> haplotype //TODO use more efficient than the read clipper here 64 final GATKRead readMinusSoftClips = ReadClipper.hardClipSoftClippedBases(originalRead); 65 final int softClippedBases = originalRead.getLength() - readMinusSoftClips.getLength(); 66 final SmithWatermanAlignment readToHaplotypeSWAlignment = aligner.align(haplotype.getBases(), readMinusSoftClips.getBases(), CigarUtils.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP); 67 if ( readToHaplotypeSWAlignment.getAlignmentOffset() == -1 ) { 68 // sw can fail (reasons not clear) so if it happens just don't realign the read 69 return originalRead; 70 } 71 72 final Cigar swCigar = new CigarBuilder().addAll(readToHaplotypeSWAlignment.getCigar()).make(); 73 74 // since we're modifying the read we need to clone it 75 final GATKRead copiedRead = originalRead.copy(); 76 77 // only informative reads are given the haplotype tag to enhance visualization 78 if ( isInformative ) { 79 copiedRead.setAttribute(HAPLOTYPE_TAG, haplotype.hashCode()); 80 } 81 82 // compute here the read starts w.r.t. the reference from the SW result and the hap -> ref cigar 83 final Cigar rightPaddedHaplotypeVsRefCigar = haplotype.getConsolidatedPaddedCigar(1000); 84 85 // this computes the number of reference bases before the read starts, based on the haplotype vs ref cigar 86 // This cigar corresponds exactly to the readToRefCigarRaw, below. One might wonder whether readToRefCigarRaw and 87 // readToRefCigarClean ever imply different starts, which could occur if if the former has a leading deletion. However, 88 // according to the logic of applyCigarToCigar, this can only happen if the read has a leading deletion wrt its best haplotype, 89 // which our SW aligner won't do, or if the read starts on a haplotype base that is in a deletion wrt to reference, which is nonsensical 90 // since a base that exists is not a deletion. Thus, there is nothing to worry about, in contrast to below where we do check 91 // whether left-alignment shifted the start position. 92 final int readStartOnReferenceHaplotype = readStartOnReferenceHaplotype(rightPaddedHaplotypeVsRefCigar, readToHaplotypeSWAlignment.getAlignmentOffset()); 93 94 95 //final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype; 96 97 final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnReferenceHaplotype; 98 99 // compute the read -> ref alignment by mapping read -> hap -> ref from the 100 // SW of read -> hap mapped through the given by hap -> ref 101 102 // this is the sub-cigar of the haplotype-to-ref alignment, with cigar elements before the read start removed. Elements after the read end are kept. 103 final Cigar haplotypeToRef = trimCigarByBases(rightPaddedHaplotypeVsRefCigar, readToHaplotypeSWAlignment.getAlignmentOffset(), rightPaddedHaplotypeVsRefCigar.getReadLength() - 1).getCigar(); 104 105 final Cigar readToRefCigar = applyCigarToCigar(swCigar, haplotypeToRef); 106 final CigarBuilder.Result leftAlignedReadToRefCigarResult = leftAlignIndels(readToRefCigar, refHaplotype.getBases(), readMinusSoftClips.getBases(), readStartOnReferenceHaplotype); 107 final Cigar leftAlignedReadToRefCigar = leftAlignedReadToRefCigarResult.getCigar(); 108 // it's possible that left-alignment shifted a deletion to the beginning of a read and removed it, shifting the first aligned base to the right 109 copiedRead.setPosition(copiedRead.getContig(), readStartOnReference + leftAlignedReadToRefCigarResult.getLeadingDeletionBasesRemoved()); 110 111 // the SW Cigar does not contain the hard clips of the original read 112 // Here we reconcile the aligned read (that has had any softclips removed) with its softclipped bases 113 final Cigar originalCigar = originalRead.getCigar(); 114 final Cigar newCigar = appendClippedElementsFromCigarToCigar(leftAlignedReadToRefCigar, originalCigar); 115 copiedRead.setCigar(newCigar); 116 117 if ( leftAlignedReadToRefCigar.getReadLength() + softClippedBases != copiedRead.getLength() ) { 118 throw new GATKException("Cigar " + leftAlignedReadToRefCigar + " with read length " + leftAlignedReadToRefCigar.getReadLength() 119 + " != read length " + copiedRead.getLength() + " for read " + copiedRead.toString() + "\nhapToRef " + haplotypeToRef + " length " + haplotypeToRef.getReadLength() + "/" + haplotypeToRef.getReferenceLength() 120 + "\nreadToHap " + swCigar + " length " + swCigar.getReadLength() + "/" + swCigar.getReferenceLength()); 121 } 122 // assert that the cigar has the same number of elements as the original read 123 124 return copiedRead; 125 } 126 127 /** 128 * Helper method that handles the work of re-appending clipped bases from the original cigar to the new one 129 * 130 * @param cigarToHaveClippedElementsAdded cigar to attach softclips to 131 * @param originalClippedCigar cigar to check for clipped bases 132 * @return a new cigar that has had the clipped elements from the original appended to either end 133 */ 134 @VisibleForTesting appendClippedElementsFromCigarToCigar(final Cigar cigarToHaveClippedElementsAdded, final Cigar originalClippedCigar)135 static Cigar appendClippedElementsFromCigarToCigar(final Cigar cigarToHaveClippedElementsAdded, final Cigar originalClippedCigar) { 136 int firstIndex = 0; 137 int lastIndex = originalClippedCigar.numCigarElements() - 1; 138 CigarElement firstElement = originalClippedCigar.getFirstCigarElement(); 139 CigarElement lastElement = originalClippedCigar.getLastCigarElement(); 140 final List<CigarElement> readToRefCigarElementsWithHardClips = new ArrayList<>(); 141 while (firstElement.getOperator().isClipping() && firstIndex != lastIndex) { 142 readToRefCigarElementsWithHardClips.add(firstElement); 143 firstElement = originalClippedCigar.getCigarElement(++firstIndex); 144 } 145 readToRefCigarElementsWithHardClips.addAll(cigarToHaveClippedElementsAdded.getCigarElements()); 146 147 final List<CigarElement> endCigarElementsToReverse = new ArrayList<>(); 148 while (lastElement.getOperator().isClipping() && firstIndex != lastIndex) { 149 endCigarElementsToReverse.add(lastElement); 150 lastElement = originalClippedCigar.getCigarElement(--lastIndex); 151 } 152 // Reverse the order to preserve the original soft/hardclip ordering in mixed clipping cases where softclips precede hardclips 153 readToRefCigarElementsWithHardClips.addAll(Lists.reverse(endCigarElementsToReverse)); 154 155 return new Cigar(readToRefCigarElementsWithHardClips); 156 } 157 158 159 /** 160 * Get the byte[] from bases that cover the reference interval refStart -> refEnd given the 161 * alignment of bases to the reference (basesToRefCigar) and the start offset of the bases on the reference 162 * 163 * refStart and refEnd are 0 based offsets that we want to obtain. In the client code, if the reference 164 * bases start at position X and you want Y -> Z, refStart should be Y - X and refEnd should be Z - X. 165 * 166 * If refStart or refEnd would start or end the new bases within a deletion, this function will return null 167 * 168 * If the cigar starts with an insertion, the inserted bases are considered as coming before the start position and 169 * are therefore excluded from the result. That is getBasesCoveringRefInterval(0, 3, "ACTTGT", 0, 2I4M) should yield "TTGT". 170 * 171 * @param bases 172 * @param refStart 173 * @param refEnd 174 * @param basesStartOnRef where does the bases array start w.r.t. the reference start? For example, bases[0] of 175 * could be at refStart == 0 if basesStartOnRef == 0, but it could just as easily be at 176 * 10 (meaning bases doesn't fully span the reference), which would be indicated by basesStartOnRef == 10. 177 * It's not trivial to eliminate this parameter because it's tied up with the cigar 178 * @param basesToRefCigar the cigar that maps the bases to the reference genome 179 * @return a byte[] containing the bases covering this interval, or null if we would start or end within a deletion 180 */ getBasesCoveringRefInterval(final int refStart, final int refEnd, final byte[] bases, final int basesStartOnRef, final Cigar basesToRefCigar)181 public static byte[] getBasesCoveringRefInterval(final int refStart, final int refEnd, final byte[] bases, final int basesStartOnRef, final Cigar basesToRefCigar) { 182 if ( refStart < 0 || refEnd < refStart ) throw new IllegalArgumentException("Bad start " + refStart + " and/or stop " + refEnd); 183 if ( basesStartOnRef < 0 ) throw new IllegalArgumentException("BasesStartOnRef must be >= 0 but got " + basesStartOnRef); 184 Utils.nonNull( bases ); 185 Utils.nonNull( basesToRefCigar ); 186 if ( bases.length != basesToRefCigar.getReadLength() ) throw new IllegalArgumentException("Mismatch in length between reference bases " + bases.length + " and cigar length " + basesToRefCigar); 187 188 int refPos = basesStartOnRef; 189 int basesPos = 0; 190 int basesStart = -1; 191 int basesStop = -1; 192 boolean done = false; 193 194 for ( int iii = 0; ! done && iii < basesToRefCigar.numCigarElements(); iii++ ) { 195 final CigarElement ce = basesToRefCigar.getCigarElement(iii); 196 switch ( ce.getOperator() ) { 197 case I: 198 basesPos += ce.getLength(); 199 break; 200 case M: case X: case EQ: 201 for ( int i = 0; i < ce.getLength(); i++ ) { 202 if ( refPos == refStart ) 203 basesStart = basesPos; 204 if ( refPos == refEnd ) { 205 basesStop = basesPos; 206 done = true; 207 break; 208 } 209 refPos++; 210 basesPos++; 211 } 212 break; 213 case D: 214 for ( int i = 0; i < ce.getLength(); i++ ) { 215 if ( refPos == refEnd || refPos == refStart ) { 216 // if we ever reach a ref position that is either a start or an end, we fail 217 return null; 218 } 219 refPos++; 220 } 221 break; 222 default: 223 throw new IllegalStateException("Unsupported operator " + ce); 224 } 225 } 226 227 if ( basesStart == -1 || basesStop == -1 ) 228 throw new IllegalStateException("Never found start " + basesStart + " or stop " + basesStop + " given cigar " + basesToRefCigar); 229 230 return Arrays.copyOfRange(bases, basesStart, basesStop + 1); 231 } 232 233 /** 234 * Returns the "IGV View" of all the bases and base qualities in a read aligned to the reference according to the cigar, dropping any bases 235 * that might be in the read but aren't in the reference. Any bases that appear in the reference but not the read 236 * will be filled in with GAP_CHARACTER values for the read bases and 0's for base qualities to indicate that they don't exist. 237 * 238 * If the cigar for input read is all matches to the reference then this method will return references to the original 239 * read base/base quality byte arrays in the underlying SamRecord in order to save on array allocation/copying performance effects. 240 * 241 * @param read a read to return aligned to the reference 242 * @return A Pair of byte arrays where the left array corresponds to the bases aligned to the reference and right 243 * array corresponds to the baseQualities aligned to the reference. 244 */ getBasesAndBaseQualitiesAlignedOneToOne(final GATKRead read)245 public static Pair<byte[], byte[]> getBasesAndBaseQualitiesAlignedOneToOne(final GATKRead read) { 246 return getBasesAndBaseQualitiesAlignedOneToOne(read, GAP_CHARACTER, (byte)0); 247 } 248 getBasesAndBaseQualitiesAlignedOneToOne(final GATKRead read, final byte basePadCharacter, final byte qualityPadCharacter)249 private static Pair<byte[], byte[]> getBasesAndBaseQualitiesAlignedOneToOne(final GATKRead read, final byte basePadCharacter, final byte qualityPadCharacter) { 250 Utils.nonNull(read); 251 // As this code is performance sensitive in the HaplotypeCaller, we elect to use the noCopy versions of these getters. 252 // We can do this because we don't mutate base or quality arrays in this method or in its accessors 253 final byte[] bases = read.getBasesNoCopy(); 254 final byte[] baseQualities = read.getBaseQualitiesNoCopy(); 255 final int numCigarElements = read.numCigarElements(); 256 boolean sawIndel = false; 257 258 // Check if the cigar contains indels 259 // Note that we don't call ContainsOperator() here twice to avoid the performance hit of building stream iterators twice 260 for (int i = 0; i < numCigarElements; i++) { 261 final CigarOperator e = read.getCigarElement(i).getOperator(); 262 if (e == CigarOperator.INSERTION || e == CigarOperator.DELETION) { 263 sawIndel = true; 264 break; 265 } 266 } 267 if (!sawIndel) { 268 return new ImmutablePair<>(bases, baseQualities); 269 } 270 else { 271 final int numberRefBasesIncludingSoftclips = CigarUtils.countRefBasesAndSoftClips(read.getCigarElements(), 0, numCigarElements); 272 final byte[] paddedBases = new byte[numberRefBasesIncludingSoftclips]; 273 final byte[] paddedBaseQualities = new byte[numberRefBasesIncludingSoftclips]; 274 int literalPos = 0; 275 int paddedPos = 0; 276 for ( int i = 0; i < numCigarElements; i++ ) { 277 final CigarElement ce = read.getCigarElement(i); 278 final CigarOperator co = ce.getOperator(); 279 if (co.consumesReadBases()) { 280 if (!co.consumesReferenceBases()) { 281 literalPos += ce.getLength(); //skip inserted bases 282 } 283 else { 284 System.arraycopy(bases, literalPos, paddedBases, paddedPos, ce.getLength()); 285 System.arraycopy(baseQualities, literalPos, paddedBaseQualities, paddedPos, ce.getLength()); 286 literalPos += ce.getLength(); 287 paddedPos += ce.getLength(); 288 } 289 } 290 else if (co.consumesReferenceBases()) { 291 for ( int j = 0; j < ce.getLength(); j++ ) { //pad deleted bases 292 paddedBases[paddedPos] = basePadCharacter; 293 paddedBaseQualities[paddedPos] = qualityPadCharacter; 294 paddedPos++; 295 } 296 } 297 } 298 return new ImmutablePair<>(paddedBases, paddedBaseQualities); 299 } 300 } 301 302 /** 303 * Returns the number of bases in a read minus the number of softclipped bases. 304 */ unclippedReadLength(final GATKRead read)305 public static int unclippedReadLength(final GATKRead read) { 306 int softClippedBases = 0; 307 for (CigarElement element : read.getCigarElements()) { 308 if (element.getOperator()== CigarOperator.SOFT_CLIP) { 309 softClippedBases+= element.getLength(); 310 } 311 } 312 return read.getLength() - softClippedBases; 313 } 314 315 316 public static class MismatchCount { 317 public int numMismatches = 0; 318 public long mismatchQualities = 0; 319 } 320 321 // todo -- this code and mismatchesInRefWindow should be combined and optimized into a single 322 // todo -- high performance implementation. We can do a lot better than this right now 323 /** 324 * @see #getMismatchCount(GATKRead, byte[], int, int, int) with startOnRead == 0 and nReadBases == read.getReadLength() 325 */ getMismatchCount(GATKRead r, byte[] refSeq, int refIndex)326 public static MismatchCount getMismatchCount(GATKRead r, byte[] refSeq, int refIndex) { 327 return getMismatchCount(r, refSeq, refIndex, 0, r.getLength()); 328 } 329 330 /** 331 * Count how many bases mismatch the reference. Indels are not considered mismatching. 332 * 333 * @param r the sam record to check against 334 * @param refSeq the byte array representing the reference sequence 335 * @param refIndex the index in the reference byte array of the read's first base (the reference index 336 * is matching the alignment start, there may be tons of soft-clipped bases before/after 337 * that so it's wrong to compare with getLength() here.). Note that refIndex is 338 * zero based, not 1 based 339 * @param startOnRead the index in the read's bases from which we start counting 340 * @param nReadBases the number of bases after (but including) startOnRead that we check 341 * @return non-null object representing the mismatch count 342 */ 343 @SuppressWarnings("fallthrough") getMismatchCount(GATKRead r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases)344 public static MismatchCount getMismatchCount(GATKRead r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) { 345 Utils.nonNull( r ); 346 Utils.nonNull( refSeq ); 347 if ( refIndex < 0 ) throw new IllegalArgumentException("attempting to calculate the mismatch count with a reference index that is negative"); 348 if ( startOnRead < 0 ) throw new IllegalArgumentException("attempting to calculate the mismatch count with a read start that is negative"); 349 if ( nReadBases < 0 ) throw new IllegalArgumentException("attempting to calculate the mismatch count for a negative number of read bases"); 350 if ( refSeq.length - refIndex < (r.getEnd() - r.getStart()) ) 351 throw new IllegalArgumentException("attempting to calculate the mismatch count against a reference string that is smaller than the read"); 352 353 MismatchCount mc = new MismatchCount(); 354 355 int readIdx = 0; 356 final int endOnRead = startOnRead + nReadBases - 1; // index of the last base on read we want to count (note we are including soft-clipped bases with this math) 357 final byte[] readSeq = r.getBases(); 358 final Cigar c = r.getCigar(); 359 final byte[] readQuals = r.getBaseQualities(); 360 for (final CigarElement ce : c.getCigarElements()) { 361 362 if (readIdx > endOnRead) 363 break; 364 365 final int elementLength = ce.getLength(); 366 switch (ce.getOperator()) { 367 case X: 368 mc.numMismatches += elementLength; 369 for (int j = 0; j < elementLength; j++) 370 mc.mismatchQualities += readQuals[readIdx+j]; 371 case EQ: 372 refIndex += elementLength; 373 readIdx += elementLength; 374 break; 375 case M: 376 for (int j = 0; j < elementLength; j++, refIndex++, readIdx++) { 377 if (refIndex >= refSeq.length) 378 continue; // TODO : It should never happen, we should throw exception here 379 if (readIdx < startOnRead) continue; 380 if (readIdx > endOnRead) break; 381 byte refChr = refSeq[refIndex]; 382 byte readChr = readSeq[readIdx]; 383 // Note: we need to count X/N's as mismatches because that's what SAM requires 384 //if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || 385 // BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) 386 // continue; // do not count Ns/Xs/etc ? 387 if (readChr != refChr) { 388 mc.numMismatches++; 389 mc.mismatchQualities += readQuals[readIdx]; 390 } 391 } 392 break; 393 case I: 394 case S: 395 readIdx += elementLength; 396 break; 397 case D: 398 case N: 399 refIndex += elementLength; 400 break; 401 case H: 402 case P: 403 break; 404 default: 405 throw new GATKException("The " + ce.getOperator() + " cigar element is not currently supported"); 406 } 407 408 } 409 return mc; 410 } 411 412 /** 413 * Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment. 414 * This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but 415 * it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is 416 * a much more efficient alternative to r.getAlignmentBlocks.size() in the situations when this number is all that is needed. 417 * Formally, this method simply returns the number of M elements in the cigar. 418 * 419 * @param r alignment 420 * @return number of continuous alignment blocks (i.e. 'M' elements of the cigar; all indel and clipping elements are ignored). 421 */ getNumAlignmentBlocks(final GATKRead r)422 public static int getNumAlignmentBlocks(final GATKRead r) { 423 Utils.nonNull( r ); 424 final Cigar cigar = r.getCigar(); 425 if (cigar == null) return 0; 426 427 int n = 0; 428 for (final CigarElement e : cigar.getCigarElements()) { 429 if (ALIGNED_TO_GENOME_OPERATORS.contains(e.getOperator())) 430 n++; 431 } 432 433 return n; 434 } 435 436 437 /** 438 * Get the number of bases aligned to the genome, including soft clips 439 * 440 * If read is not mapped (i.e., doesn't have a cigar) returns 0 441 * 442 * @param r a non-null Read 443 * @return the number of bases aligned to the genome in R, including soft clipped bases 444 */ getNumAlignedBasesCountingSoftClips(final GATKRead r)445 public static int getNumAlignedBasesCountingSoftClips(final GATKRead r) { 446 int n = 0; 447 final Cigar cigar = r.getCigar(); 448 if (cigar == null) return 0; 449 450 for (final CigarElement e : cigar.getCigarElements()) 451 if (ALIGNED_TO_GENOME_PLUS_SOFTCLIPS.contains(e.getOperator())) 452 n += e.getLength(); 453 454 return n; 455 } 456 457 /** 458 * Count the number of bases hard clipped from read 459 * 460 * If read's cigar is null, return 0 461 * 462 * @param r a non-null read 463 * @return a positive integer 464 */ getNumHardClippedBases(final GATKRead r)465 public static int getNumHardClippedBases(final GATKRead r) { 466 if ( r == null ) throw new IllegalArgumentException("Read cannot be null"); 467 468 int n = 0; 469 final Cigar cigar = r.getCigar(); 470 if (cigar == null) return 0; 471 472 for (final CigarElement e : cigar.getCigarElements()) 473 if (e.getOperator() == CigarOperator.H) 474 n += e.getLength(); 475 476 return n; 477 } 478 479 /** 480 * Calculate the number of bases that are soft clipped in read with quality score greater than threshold 481 * 482 * Handles the case where the cigar is null (i.e., the read is unmapped), returning 0 483 * 484 * @param read a non-null read. 485 * @param qualThreshold consider bases with quals > this value as high quality. Must be >= 0 486 * @return positive integer 487 */ countHighQualitySoftClips(final GATKRead read, final byte qualThreshold )488 public static int countHighQualitySoftClips(final GATKRead read, final byte qualThreshold ) { 489 Utils.nonNull(read); 490 ParamUtils.isPositiveOrZero(qualThreshold, "Expected qualThreshold to be positive"); 491 492 final Cigar cigar = read.getCigar(); 493 if ( cigar == null ) { // the read is unmapped 494 return 0; 495 } 496 497 int numHQSoftClips = 0; 498 int alignPos = 0; 499 for ( final CigarElement elem : read.getCigarElements() ) { 500 final int elementLength = elem.getLength(); 501 final CigarOperator operator = elem.getOperator(); 502 503 if (operator == CigarOperator.SOFT_CLIP) { 504 for (int n = 0; n < elementLength; n++) { 505 if( read.getBaseQuality(alignPos++) > qualThreshold ) { 506 numHQSoftClips++; 507 } 508 } 509 } else if (operator.consumesReadBases()) { 510 alignPos += elementLength; 511 } 512 } 513 514 return numHQSoftClips; 515 } 516 calcAlignmentByteArrayOffset(final Cigar cigar, final PileupElement pileupElement, final int alignmentStart, final int refLocus)517 public static int calcAlignmentByteArrayOffset(final Cigar cigar, final PileupElement pileupElement, final int alignmentStart, final int refLocus) { 518 return calcAlignmentByteArrayOffset( cigar, pileupElement.getOffset(), pileupElement.isDeletion(), alignmentStart, refLocus ); 519 } 520 521 /** 522 * Calculate the index into the read's bases of the beginning of the encompassing cigar element for a given cigar and offset 523 * 524 * @param cigar the read's CIGAR -- cannot be null 525 * @param offset the offset to use for the calculation or -1 if in the middle of a deletion 526 * @param isDeletion are we in the middle of a deletion? 527 * @param alignmentStart the alignment start of the read 528 * @param refLocus the reference position of the offset 529 * @return a non-negative int index 530 */ calcAlignmentByteArrayOffset(final Cigar cigar, final int offset, final boolean isDeletion, final int alignmentStart, final int refLocus)531 public static int calcAlignmentByteArrayOffset(final Cigar cigar, final int offset, final boolean isDeletion, final int alignmentStart, final int refLocus) { 532 if ( cigar == null ) throw new IllegalArgumentException("attempting to find the alignment position from a CIGAR that is null"); 533 if ( offset < -1 ) throw new IllegalArgumentException("attempting to find the alignment position with an offset that is negative (and not -1)"); 534 if ( alignmentStart < 0 ) throw new IllegalArgumentException("attempting to find the alignment position from an alignment start that is negative"); 535 if ( refLocus < 0 ) throw new IllegalArgumentException("attempting to find the alignment position from a reference position that is negative"); 536 if ( offset >= cigar.getReadLength() ) throw new IllegalArgumentException("attempting to find the alignment position of an offset than is larger than the read length"); 537 538 int pileupOffset = offset; 539 540 // Reassign the offset if we are in the middle of a deletion because of the modified representation of the read bases 541 if (isDeletion) { 542 pileupOffset = refLocus - alignmentStart; 543 final CigarElement ce = cigar.getCigarElement(0); 544 if (ce.getOperator() == CigarOperator.S) { 545 pileupOffset += ce.getLength(); 546 } 547 } 548 549 int pos = 0; 550 int alignmentPos = 0; 551 552 for (int iii = 0; iii < cigar.numCigarElements(); iii++) { 553 final CigarElement ce = cigar.getCigarElement(iii); 554 final int elementLength = ce.getLength(); 555 556 switch (ce.getOperator()) { 557 case I: 558 case S: // TODO -- I don't think that soft clips should be treated the same as inserted bases here. Investigation needed. 559 pos += elementLength; 560 if (pos >= pileupOffset) { 561 return alignmentPos; 562 } 563 break; 564 case D: 565 if (!isDeletion) { 566 alignmentPos += elementLength; 567 } else { 568 if (pos + elementLength - 1 >= pileupOffset) { 569 return alignmentPos + (pileupOffset - pos); 570 } else { 571 pos += elementLength; 572 alignmentPos += elementLength; 573 } 574 } 575 break; 576 case M: 577 case EQ: 578 case X: 579 if (pos + elementLength - 1 >= pileupOffset) { 580 return alignmentPos + (pileupOffset - pos); 581 } else { 582 pos += elementLength; 583 alignmentPos += elementLength; 584 } 585 break; 586 case H: 587 case P: 588 case N: 589 break; 590 default: 591 throw new GATKException("Unsupported cigar operator: " + ce.getOperator()); 592 } 593 } 594 595 return alignmentPos; 596 } 597 598 /** 599 * Is the offset inside a deletion? 600 * 601 * @param cigar the read's CIGAR -- cannot be null 602 * @param offset the offset into the CIGAR 603 * @return true if the offset is inside a deletion, false otherwise 604 */ isInsideDeletion(final Cigar cigar, final int offset)605 public static boolean isInsideDeletion(final Cigar cigar, final int offset) { 606 Utils.nonNull(cigar); 607 if ( offset < 0 ) return false; 608 609 // pos counts read bases 610 int pos = 0; 611 int prevPos = 0; 612 613 for (final CigarElement ce : cigar.getCigarElements()) { 614 615 switch (ce.getOperator()) { 616 case I: 617 case S: 618 case D: 619 case M: 620 case EQ: 621 case X: 622 prevPos = pos; 623 pos += ce.getLength(); 624 break; 625 case H: 626 case P: 627 case N: 628 break; 629 default: 630 throw new GATKException("Unsupported cigar operator: " + ce.getOperator()); 631 } 632 633 // Is the offset inside a deletion? 634 if ( prevPos < offset && pos >= offset && ce.getOperator() == CigarOperator.D ) { 635 return true; 636 637 } 638 } 639 640 return false; 641 } 642 643 /** 644 * Generate an array of bases for just those that are aligned to the reference (i.e. no clips or insertions) 645 * 646 * @param cigar the read's CIGAR -- cannot be null 647 * @param read the read's base array 648 * @return a non-null array of bases (bytes) 649 */ 650 @SuppressWarnings("fallthrough") readToAlignmentByteArray(final Cigar cigar, final byte[] read)651 public static byte[] readToAlignmentByteArray(final Cigar cigar, final byte[] read) { 652 Utils.nonNull(cigar); 653 Utils.nonNull(read); 654 655 final int alignmentLength = cigar.getReferenceLength(); 656 final byte[] alignment = new byte[alignmentLength]; 657 int alignPos = 0; 658 int readPos = 0; 659 for (int iii = 0; iii < cigar.numCigarElements(); iii++) { 660 661 final CigarElement ce = cigar.getCigarElement(iii); 662 final int elementLength = ce.getLength(); 663 664 switch (ce.getOperator()) { 665 case I: 666 if (alignPos > 0) { 667 final int prevPos = alignPos - 1; 668 if (alignment[prevPos] == BaseUtils.Base.A.base) { 669 alignment[prevPos] = PileupElement.A_FOLLOWED_BY_INSERTION_BASE; 670 } else if (alignment[prevPos] == BaseUtils.Base.C.base) { 671 alignment[prevPos] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE; 672 } else if (alignment[prevPos] == BaseUtils.Base.T.base) { 673 alignment[prevPos] = PileupElement.T_FOLLOWED_BY_INSERTION_BASE; 674 } else if (alignment[prevPos] == BaseUtils.Base.G.base) { 675 alignment[prevPos] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE; 676 } 677 } 678 case S: 679 readPos += elementLength; 680 break; 681 case D: 682 case N: 683 for (int jjj = 0; jjj < elementLength; jjj++) { 684 alignment[alignPos++] = PileupElement.DELETION_BASE; 685 } 686 break; 687 case M: 688 case EQ: 689 case X: 690 for (int jjj = 0; jjj < elementLength; jjj++) { 691 alignment[alignPos++] = read[readPos++]; 692 } 693 break; 694 case H: 695 case P: 696 break; 697 default: 698 throw new GATKException("Unsupported cigar operator: " + ce.getOperator()); 699 } 700 } 701 return alignment; 702 } 703 lengthOnRead(final CigarElement element)704 private static int lengthOnRead(final CigarElement element) { 705 return element.getOperator().consumesReadBases() ? element.getLength() : 0; 706 } 707 lengthOnReference(final CigarElement element)708 private static int lengthOnReference(final CigarElement element) { 709 return element.getOperator().consumesReferenceBases() ? element.getLength() : 0; 710 } 711 712 /** 713 * Takes the alignment of the read sequence <code>readSeq</code> to the reference sequence <code>refSeq</code> 714 * starting at 0-based position <code>readStart</code> on the <code>ref</code> and specified by its <code>cigar</code>. 715 * <p/> 716 * If the alignment has one or more indels, this method attempts to move them left across a stretch of repetitive bases. 717 * For instance, if the original cigar specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output 718 * cigar will always mark the leftmost AT as deleted. If there is no indel in the original cigar or if the indel position 719 * is determined unambiguously (i.e. inserted/deleted sequence is not repeated), the original cigar is returned. 720 * 721 * Soft-clipped bases in the cigar are presumed to correspond to bases in the byte[] of read sequence. That is, this method 722 * assumes that inputs are precise about the distinction between hard clips (removed from the read sequence) and soft clips 723 * (kept in the read sequence but not aligned). For example, with the inputs {cigar: 2S2M2I, read sequence: TTAAAA, ref sequence: GGAA, read start: 2} 724 * the method lines up the AAAA (2M2I) of the read with the AA of the ref and left-aligns the indel to yield a cigar of 725 * 2S2I2M. 726 * 727 * @param cigar structure of the original alignment 728 * @param ref reference sequence the read is aligned to 729 * @param read read sequence 730 * @param readStart 0-based position on ref of the first aligned base in the read sequence 731 * @return a non-null cigar, in which the indels are guaranteed to be placed at the leftmost possible position across a repeat (if any) 732 */ leftAlignIndels(final Cigar cigar, final byte[] ref, final byte[] read, final int readStart)733 public static CigarBuilder.Result leftAlignIndels(final Cigar cigar, final byte[] ref, final byte[] read, final int readStart) { 734 ParamUtils.isPositiveOrZero(readStart, "read start within reference base array must be non-negative"); 735 736 if (cigar.getCigarElements().stream().noneMatch(elem -> elem.getOperator().isIndel())) { 737 return new CigarBuilder.Result(cigar, 0, 0); 738 } 739 740 // we need reference bases from the start of the read to the rightmost indel 741 final int lastIndel = IntStream.range(0, cigar.numCigarElements()).filter(n -> cigar.getCigarElement(n).getOperator().isIndel()).max().getAsInt(); 742 final int necessaryRefLength = readStart + cigar.getCigarElements().stream().limit(lastIndel + 1).mapToInt(e -> lengthOnReference(e)).sum(); 743 Utils.validateArg(necessaryRefLength <= ref.length, "read goes past end of reference"); 744 745 // at this point, we are one base past the end of the read. Now we traverse the cigar from right to left 746 final List<CigarElement> resultRightToLeft = new ArrayList<>(); 747 final int refLength = cigar.getReferenceLength(); 748 final IndexRange refIndelRange = new IndexRange(readStart + refLength, readStart + refLength); 749 final IndexRange readIndelRange = new IndexRange(read.length,read.length); 750 for (int n = cigar.numCigarElements() - 1; n >= 0; n--) { 751 final CigarElement element = cigar.getCigarElement(n); 752 // if it's an indel, just accumulate the read and ref bases consumed. We won't shift the indel until we hit an alignment 753 // block or the read start. 754 if (element.getOperator().isIndel()) { 755 refIndelRange.shiftStartLeft(lengthOnReference(element)); 756 readIndelRange.shiftStartLeft(lengthOnRead(element)); 757 } else if (refIndelRange.size() == 0 && readIndelRange.size() == 0) { // no indel, just add the cigar element to the result 758 resultRightToLeft.add(element); 759 refIndelRange.shiftLeft(lengthOnReference(element)); 760 readIndelRange.shiftLeft(lengthOnRead(element)); 761 } else { // there's an indel that we need to left-align 762 // we can left-align into match cigar elements but not into clips 763 final int maxShift = element.getOperator().isAlignment() ? element.getLength() : 0; 764 final Pair<Integer, Integer> shifts = normalizeAlleles(Arrays.asList(ref, read), Arrays.asList(refIndelRange, readIndelRange), maxShift, true); 765 766 // account for new match alignments on the right due to left-alignment 767 resultRightToLeft.add(new CigarElement(shifts.getRight(), CigarOperator.MATCH_OR_MISMATCH)); 768 769 // emit if we didn't go all the way to the start of an alignment block OR we have reached clips OR we have reached the start of the cigar 770 final boolean emitIndel = n == 0 || shifts.getLeft() < maxShift || !element.getOperator().isAlignment(); 771 final int newMatchOnLeftDueToTrimming = shifts.getLeft() < 0 ? -shifts.getLeft() : 0; // we may have actually shifted right to make the alleles parsimonious 772 final int remainingBasesOnLeft = shifts.getLeft() < 0 ? element.getLength() : (element.getLength() - shifts.getLeft()); 773 774 if (emitIndel) { // some of this alignment block remains after left-alignment -- emit the indel 775 resultRightToLeft.add(new CigarElement(refIndelRange.size(), CigarOperator.DELETION)); 776 resultRightToLeft.add(new CigarElement(readIndelRange.size(), CigarOperator.INSERTION)); 777 refIndelRange.shiftEndLeft(refIndelRange.size()); // ref indel range is now empty and points to start of left-aligned indel 778 readIndelRange.shiftEndLeft(readIndelRange.size()); // read indel range is now empty and points to start of left-aligned indel 779 780 refIndelRange.shiftLeft(newMatchOnLeftDueToTrimming + (element.getOperator().consumesReferenceBases() ? remainingBasesOnLeft : 0)); 781 readIndelRange.shiftLeft(newMatchOnLeftDueToTrimming + (element.getOperator().consumesReadBases() ? remainingBasesOnLeft : 0)); 782 // now read and ref indel ranges are empty and point to end of element preceding this block 783 } 784 resultRightToLeft.add(new CigarElement(newMatchOnLeftDueToTrimming, CigarOperator.MATCH_OR_MISMATCH)); 785 resultRightToLeft.add(new CigarElement(remainingBasesOnLeft, element.getOperator())); 786 } 787 } 788 789 // account for any indels at the start of the cigar that weren't processed because they have no adjacent non-indel element to the left 790 resultRightToLeft.add(new CigarElement(refIndelRange.size(), CigarOperator.DELETION)); 791 resultRightToLeft.add(new CigarElement(readIndelRange.size(), CigarOperator.INSERTION)); 792 793 Utils.validateArg(readIndelRange.getStart() == 0, "Given cigar does not account for all bases of the read"); 794 return new CigarBuilder().addAll(Lists.reverse(resultRightToLeft)).makeAndRecordDeletionsRemovedResult(); 795 } 796 797 /** 798 * Example usage: reference = GAAT, read = GAAAT (insertion of one A) and we initially consider the insertion of the A to occur before 799 * the T. Thus the reference range of this allele is [3,3) (no bases) and the read range is [3,4). This will be left-aligned so that 800 * the insertion occurs after the G, so that the ranges become [1,1) and [1,2) and the returned shifts are 2 bases for both the start and end 801 * of the range. 802 * 803 * If the given allele ranges are not parsimonious, for example [3,4) and [3,5) in the above example to include the common T in both alleles, 804 * the resulting ranges will be shifted by different amounts. In this case, the shifts are 2 bases in the front and 3 at the end. 805 * 806 * Note that we use the convention that the ref allele in the case of an alt insertion, or the alt allele in case of a deletion, is represented 807 * by [n,n) where n is the last aligned coordinate before the indel. This makes sense when you think in terms of alignment CIGARs: 808 * suppose for example we have a 5M5I5M read with start 100. Then the match bases are [100,105) on the ref and [0,5) on the read and the inserted bases are 809 * [5,10) on the read and [5,5) on the reference. 810 * 811 * @param sequences bases of sequences containing different alleles -- could be reference, a haplotype, a read, or subsequences thereof 812 * @param bounds initial ranges (inclusive start, exclusive end) of alleles in same order as {@code sequences} 813 * Note that this method adjusts these ranges as a side effect 814 * @param maxShift maximum allowable shift left. This may be less than the amount demanded by the array bounds. For example, when 815 * left-aligning a read with multiple indels, we don't want to realign one indel past another (if they "collide" we merge 816 * them into a single indel and continue -- see {@link AlignmentUtils::leftAlignIndels} 817 * @param trim If true, remove redundant shared bases at the start and end of alleles 818 * @return The number of bases the alleles were shifted left such that they still represented the same event. 819 */ 820 public static Pair<Integer, Integer> normalizeAlleles(final List<byte[]> sequences, final List<IndexRange> bounds, final int maxShift, final boolean trim) { 821 Utils.nonEmpty(sequences); 822 Utils.validateArg(sequences.size() == bounds.size(), "Must have one initial allele range per sequence"); 823 bounds.forEach(bound -> Utils.validateArg(maxShift <= bound.getStart(), "maxShift goes past the start of a sequence")); 824 825 int startShift = 0; 826 int endShift = 0; 827 828 // consume any redundant shared bases at the end of the alleles 829 int minSize = bounds.stream().mapToInt(IndexRange::size).min().getAsInt(); 830 while (trim && minSize > 0 && lastBaseOnRightIsSame(sequences, bounds)) { 831 bounds.forEach(bound -> bound.shiftEndLeft(1)); 832 minSize--; 833 endShift++; 834 } 835 836 while (trim && minSize > 0 && firstBaseOnLeftIsSame(sequences, bounds)) { 837 bounds.forEach(bound -> bound.shiftStart(1)); 838 minSize--; 839 startShift--; 840 } 841 842 // we shift left as long as the last bases on the right are equal among all sequences and the next bases on the left are all equal. 843 // if a sequence is empty (eg the reference relative to an insertion alt allele) the last base on the right is the next base on the left 844 while( startShift < maxShift && nextBaseOnLeftIsSame(sequences, bounds) && lastBaseOnRightIsSame(sequences, bounds)) { 845 bounds.forEach(b -> b.shiftLeft(1)); 846 startShift++; 847 endShift++; 848 } 849 850 return ImmutablePair.of(startShift, endShift); 851 } 852 853 // do all sequences share a common base at the end of the given index range lastBaseOnRightIsSame(List<byte[]> sequences, List<IndexRange> bounds)854 private static boolean lastBaseOnRightIsSame(List<byte[]> sequences, List<IndexRange> bounds) { 855 final byte lastBaseOnRight = sequences.get(0)[bounds.get(0).getEnd() - 1]; 856 for(int n = 0; n < sequences.size(); n++) { 857 if (sequences.get(n)[bounds.get(n).getEnd() - 1] != lastBaseOnRight) { 858 return false; 859 } 860 } 861 return true; 862 } 863 864 // do all sequences share a common first base firstBaseOnLeftIsSame(final List<byte[]> sequences, final List<IndexRange> bounds)865 private static boolean firstBaseOnLeftIsSame(final List<byte[]> sequences, final List<IndexRange> bounds) { 866 final byte nextBaseOnLeft = sequences.get(0)[bounds.get(0).getStart()]; 867 for(int n = 0; n < sequences.size(); n++) { 868 if (sequences.get(n)[bounds.get(n).getStart()] != nextBaseOnLeft) { 869 return false; 870 } 871 } 872 return true; 873 } 874 875 // do all sequences share a common base just before the given index ranges nextBaseOnLeftIsSame(final List<byte[]> sequences, final List<IndexRange> bounds)876 private static boolean nextBaseOnLeftIsSame(final List<byte[]> sequences, final List<IndexRange> bounds) { 877 final byte nextBaseOnLeft = sequences.get(0)[bounds.get(0).getStart() - 1]; 878 for(int n = 0; n < sequences.size(); n++) { 879 if (sequences.get(n)[bounds.get(n).getStart() - 1] != nextBaseOnLeft) { 880 return false; 881 } 882 } 883 return true; 884 } 885 886 /** 887 * Given a read's first aligned base on an alt haplotype, find the first aligned base in the reference haplotype. This 888 * method assumes that the alt haplotype and reference haplotype start at the same place. That is, the alt haplotype 889 * starts at index 0 within the reference base array. 890 * 891 * @param haplotypeVsRefCigar 892 * @param readStartOnHaplotype 893 * @return 894 */ 895 @VisibleForTesting readStartOnReferenceHaplotype(final Cigar haplotypeVsRefCigar, final int readStartOnHaplotype)896 static int readStartOnReferenceHaplotype(final Cigar haplotypeVsRefCigar, final int readStartOnHaplotype) { 897 if (readStartOnHaplotype == 0) { 898 return 0; 899 } 900 // move forward in the haplotype vs ref cigar until we have consumed enough haplotype bases to reach the read start 901 // the number of reference bases consumed during this traversal gives us the reference start 902 int refBasesConsumedBeforeReadStart = 0; 903 int haplotypeBasesConsumed = 0; 904 for (final CigarElement element : haplotypeVsRefCigar) { 905 refBasesConsumedBeforeReadStart += lengthOnReference(element); 906 haplotypeBasesConsumed += lengthOnRead(element); 907 908 if (haplotypeBasesConsumed >= readStartOnHaplotype) { 909 final int excess = element.getOperator().consumesReferenceBases() ? haplotypeBasesConsumed - readStartOnHaplotype : 0; 910 return refBasesConsumedBeforeReadStart - excess; 911 } 912 } 913 914 throw new IllegalArgumentException("Cigar doesn't reach the read start"); 915 } 916 917 /** 918 * Removing a trailing deletion from the incoming cigar if present 919 * 920 * @param c the cigar we want to update 921 * @return a non-null Cigar 922 */ removeTrailingDeletions(final Cigar c)923 public static Cigar removeTrailingDeletions(final Cigar c) { 924 925 final List<CigarElement> elements = c.getCigarElements(); 926 if ( elements.get(elements.size() - 1).getOperator() != CigarOperator.D ) 927 return c; 928 929 return new Cigar(elements.subList(0, elements.size() - 1)); 930 } 931 932 /** 933 * Does cigar start or end with a deletion operation? 934 * 935 * WARNING: there is usually no good reason to use this method, because one should handle the leading and trailing 936 * deletion via the {@link CigarBuilder} class. Do not use this method when you can instead use {@link CigarBuilder}. 937 * 938 * @param cigar a non-null cigar to test 939 * @return true if the first or last operator of cigar is a D 940 */ startsOrEndsWithInsertionOrDeletion(final Cigar cigar)941 public static boolean startsOrEndsWithInsertionOrDeletion(final Cigar cigar) { 942 Utils.nonNull(cigar); 943 944 if ( cigar.isEmpty() ) 945 return false; 946 947 final CigarOperator first = cigar.getCigarElement(0).getOperator(); 948 final CigarOperator last = cigar.getCigarElement(cigar.numCigarElements()-1).getOperator(); 949 return first == CigarOperator.D || first == CigarOperator.I || last == CigarOperator.D || last == CigarOperator.I; 950 } 951 952 953 /** 954 * Trim cigar down to one that starts at start reference on the left and extends to end on the reference 955 * 956 * @param cigar a non-null Cigar to trim down 957 * @param start Where should we start keeping bases on the reference? The first position is 0 958 * @param end Where should we stop keeping bases on the reference? The maximum value is cigar.getReferenceLength() 959 * @return a new Cigar with reference length == start - end + 1 960 */ trimCigarByReference(final Cigar cigar, final int start, final int end)961 public static CigarBuilder.Result trimCigarByReference(final Cigar cigar, final int start, final int end) { 962 return trimCigar(cigar, start, end, true); 963 } 964 965 /** 966 * Trim cigar down to one that starts at start base in the cigar and extends to (inclusive) end base 967 * 968 * @param cigar a non-null Cigar to trim down 969 * @param start Where should we start keeping bases in the cigar (inclusive)? The first position is 0 970 * @param end Where should we stop keeping bases in the cigar (inclusive)? The maximum value is cigar.getLength() - 1 971 * @return a new Cigar containing == start - end + 1 reads 972 */ trimCigarByBases(final Cigar cigar, final int start, final int end)973 public static CigarBuilder.Result trimCigarByBases(final Cigar cigar, final int start, final int end) { 974 return trimCigar(cigar, start, end, false); 975 } 976 977 978 /** 979 * Workhorse for trimCigarByBases and trimCigarByReference 980 * 981 * @param cigar a non-null Cigar to trim down 982 * @param start Where should we start keeping bases in the cigar (inclusive)? The first position is 0 983 * @param end Where should we stop keeping bases in the cigar (inclusive)? The maximum value is cigar.getLength() - 1 984 * @param byReference should start and end be interpreted as position in the reference or the read to trim to/from? 985 * @return a non-null cigar 986 */ 987 @SuppressWarnings("fallthrough") trimCigar(final Cigar cigar, final int start, final int end, final boolean byReference)988 private static CigarBuilder.Result trimCigar(final Cigar cigar, final int start, final int end, final boolean byReference) { 989 ParamUtils.isPositiveOrZero(start, "start position can't be negative"); 990 Utils.validateArg(end >= start, () -> "end " + end + " is before start " + start); 991 final CigarBuilder newElements = new CigarBuilder(); 992 993 // these variables track the inclusive start and exclusive end of the current cigar element in reference (if byReference) or read (otherwise) coordinates 994 int elementStart; // inclusive 995 int elementEnd = 0; // exclusive -- start of next element 996 for ( final CigarElement elt : cigar.getCigarElements() ) { 997 elementStart = elementEnd; 998 elementEnd = elementStart + (byReference ? lengthOnReference(elt) : lengthOnRead(elt)); 999 1000 // we are careful to include zero-length elements at both ends, that is, elements with elementStart == elementEnd == start and elementStart == elementEnd == end + 1 1001 if (elementEnd < start || (elementEnd == start && elementStart < start)) { 1002 continue; 1003 } else if (elementStart > end && elementEnd > end + 1) { 1004 break; 1005 } 1006 1007 final int overlapLength = elementEnd == elementStart ? elt.getLength() : Math.min(end + 1, elementEnd) - Math.max(start, elementStart); 1008 newElements.add(new CigarElement(overlapLength, elt.getOperator())); 1009 } 1010 Utils.validateArg(elementEnd > end, () -> "cigar elements don't reach end position (inclusive) " + end); 1011 1012 return newElements.makeAndRecordDeletionsRemovedResult(); 1013 } 1014 1015 /** 1016 * Generate a new Cigar that maps the operations of the first cigar through those in a second 1017 * 1018 * For example, if first is 5M and the second is 2M1I2M then the result is 2M1I2M. 1019 * However, if first is 1M2D3M and second is 2M1I3M this results in a cigar X 1020 * 1021 * ref : AC-GTA 1022 * hap : ACxGTA - 2M1I3M 1023 * read : A--GTA - 1M2D3M 1024 * result: A--GTA => 1M1D3M 1025 * 1026 * ref : ACxG-TA 1027 * hap : AC-G-TA - 2M1D3M 1028 * read : AC-GxTA - 3M1I2M 1029 * result: AC-GxTA => 2M1D1M1I2M 1030 * 1031 * ref : ACGTA 1032 * hap : ACGTA - 5M 1033 * read : A-GTA - 1M1I3M 1034 * result: A-GTA => 1M1I3M 1035 * 1036 * ref : ACGTAC 1037 * hap : AC---C - 2M3D1M 1038 * read : AC---C - 3M 1039 * result: AG---C => 2M3D 1040 * 1041 * The constraint here is that both cigars should imply that the result have the same number of 1042 * reference bases (i.e.g, cigar.getReferenceLength() are equals). 1043 * 1044 * @param firstToSecond the cigar mapping hap1 -> hap2 1045 * @param secondToThird the cigar mapping hap2 -> hap3 1046 * @return A cigar mapping hap1 -> hap3 1047 */ applyCigarToCigar(final Cigar firstToSecond, final Cigar secondToThird)1048 public static Cigar applyCigarToCigar(final Cigar firstToSecond, final Cigar secondToThird) { 1049 final CigarBuilder newElements = new CigarBuilder(); 1050 final int nElements12 = firstToSecond.numCigarElements(); 1051 final int nElements23 = secondToThird.numCigarElements(); 1052 1053 int cigar12I = 0, cigar23I = 0; 1054 int elt12I = 0, elt23I = 0; 1055 1056 while ( cigar12I < nElements12 && cigar23I < nElements23 ) { 1057 final CigarElement elt12 = firstToSecond.getCigarElement(cigar12I); 1058 final CigarElement elt23 = secondToThird.getCigarElement(cigar23I); 1059 1060 final CigarPairTransform transform = getTransformer(elt12.getOperator(), elt23.getOperator()); 1061 1062 if ( transform.op13 != null ) // skip no ops 1063 newElements.add(new CigarElement(1, transform.op13)); 1064 1065 elt12I += transform.advance12; 1066 elt23I += transform.advance23; 1067 1068 // if have exhausted our current element, advance to the next one 1069 if ( elt12I == elt12.getLength() ) { cigar12I++; elt12I = 0; } 1070 if ( elt23I == elt23.getLength() ) { cigar23I++; elt23I = 0; } 1071 } 1072 1073 return newElements.make(); 1074 } 1075 getTransformer(final CigarOperator op12, final CigarOperator op23)1076 private static CigarPairTransform getTransformer(final CigarOperator op12, final CigarOperator op23) { 1077 for ( final CigarPairTransform transform : cigarPairTransformers) { 1078 if ( transform.op12.contains(op12) && transform.op23.contains(op23) ) 1079 return transform; 1080 } 1081 1082 throw new IllegalStateException("No transformer for operators " + op12 + " and " + op23); 1083 } 1084 1085 /** 1086 * transformations that project one alignment state through another 1087 * 1088 * Think about this as a state machine, where we have: 1089 * 1090 * bases3 : xxx A zzz 1091 * bases2 : xxx B zzz 1092 * bases1 : xxx C zzz 1093 * 1094 * where A, B and C are alignment states of a three way alignment. We want to capture 1095 * the transition from operation mapping 1 -> 2 and an operation mapping 2 -> 3 and its 1096 * associated mapping from 1 -> 3 and the advancement of the cigar states of 1->2 and 2->3. 1097 * 1098 * Imagine that A, B, and C are all equivalent (so that op12 = M and op23 = M). This implies 1099 * a mapping of 1->3 of M, and in this case the next states to consider in the 3 way alignment 1100 * are the subsequent states in 1 and 2 (so that advance12 and advance23 are both 1). 1101 * 1102 * Obviously not all of the states and their associated transitions are so simple. Suppose instead 1103 * that op12 = I, and op23 = M. What does this look like: 1104 * 1105 * bases3 : xxx - A zzz 1106 * bases2 : xxx - B zzz 1107 * bases1 : xxx I C zzz 1108 * 1109 * It means that op13 must be an insertion (as we have an extra base in 1 thats not present in 2 and 1110 * so not present in 3). We advance the cigar in 1 by 1 (as we've consumed one base in 1 for the I) 1111 * but we haven't yet found the base corresponding to the M of op23. So we don't advance23. 1112 */ 1113 private static final class CigarPairTransform { 1114 private final EnumSet<CigarOperator> op12, op23; 1115 private final CigarOperator op13; 1116 private final int advance12, advance23; 1117 CigarPairTransform(CigarOperator op12, CigarOperator op23, CigarOperator op13, int advance12, int advance23)1118 private CigarPairTransform(CigarOperator op12, CigarOperator op23, CigarOperator op13, int advance12, int advance23) { 1119 this.op12 = getCigarSet(op12); 1120 this.op23 = getCigarSet(op23); 1121 this.op13 = op13; 1122 this.advance12 = advance12; 1123 this.advance23 = advance23; 1124 } 1125 getCigarSet(final CigarOperator masterOp)1126 private static EnumSet<CigarOperator> getCigarSet(final CigarOperator masterOp) { 1127 switch ( masterOp ) { 1128 case M: return EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X); 1129 case I: return EnumSet.of(CigarOperator.I, CigarOperator.S); 1130 case D: return EnumSet.of(CigarOperator.D); 1131 default: throw new IllegalStateException("Unexpected state " + masterOp); 1132 } 1133 } 1134 1135 @Override toString()1136 public String toString() { 1137 return "CigarPairTransform{" + 1138 "op12=" + op12 + 1139 ", op23=" + op23 + 1140 ", op13=" + op13 + 1141 ", advance12=" + advance12 + 1142 ", advance23=" + advance23 + 1143 '}'; 1144 } 1145 } 1146 1147 1148 private static final List<CigarPairTransform> cigarPairTransformers = Arrays.asList( 1149 // 1150 // op12 is a match 1151 // 1152 // 3: xxx B yyy 1153 // ^^^^^^^^^^^^ 1154 // 2: xxx M yyy 1155 // 1: xxx M yyy 1156 new CigarPairTransform(CigarOperator.M, CigarOperator.M, CigarOperator.M, 1, 1), 1157 // 3: xxx I yyy 1158 // ^^^^^^^^^^^^ 1159 // 2: xxx I yyy 1160 // 1: xxx M yyy 1161 new CigarPairTransform(CigarOperator.M, CigarOperator.I, CigarOperator.I, 1, 1), 1162 // 3: xxx D yyy 1163 // ^^^^^^^^^^^^ 1164 // 2: xxx D yyy 1165 // 1: xxx M yyy 1166 new CigarPairTransform(CigarOperator.M, CigarOperator.D, CigarOperator.D, 0, 1), 1167 1168 // 1169 // op12 is a deletion 1170 // 1171 // 3: xxx D M yyy 1172 // ^^^^^^^^^^^^ 1173 // 2: xxx M yyy 1174 // 1: xxx D yyy 1175 new CigarPairTransform(CigarOperator.D, CigarOperator.M, CigarOperator.D, 1, 1), 1176 // 3: xxx D2 D1 yyy 1177 // ^^^^^^^^^^^^ 1178 // 2: xxx D2 yyy 1179 // 1: xxx D1 yyy 1180 new CigarPairTransform(CigarOperator.D, CigarOperator.D, CigarOperator.D, 0, 1), 1181 // 3: xxx X yyy => no-op, we skip emitting anything here 1182 // ^^^^^^^^^^^^ 1183 // 2: xxx I yyy 1184 // 1: xxx D yyy 1185 new CigarPairTransform(CigarOperator.D, CigarOperator.I, null, 1, 1), 1186 1187 // 1188 // op12 is a insertion 1189 // 1190 // 3: xxx I M yyy 1191 // ^^^^^^^^^^^^ 1192 // 2: xxx M yyy 1193 // 1: xxx I yyy 1194 new CigarPairTransform(CigarOperator.I, CigarOperator.M, CigarOperator.I, 1, 0), 1195 // 3: xxx I D yyy 1196 // ^^^^^^^^^^^^ 1197 // 2: xxx D yyy 1198 // 1: xxx I yyy 1199 new CigarPairTransform(CigarOperator.I, CigarOperator.D, CigarOperator.I, 1, 0), 1200 // 3: xxx I1 I2 yyy 1201 // ^^^^^^^^^^^^ 1202 // 2: xxx I2 yyy 1203 // 1: xxx I1 yyy 1204 new CigarPairTransform(CigarOperator.I, CigarOperator.I, CigarOperator.I, 1, 0) 1205 ); 1206 }