1 package org.broadinstitute.hellbender.utils.clipping; 2 3 import htsjdk.samtools.Cigar; 4 import htsjdk.samtools.CigarElement; 5 import htsjdk.samtools.CigarOperator; 6 import org.broadinstitute.hellbender.utils.Nucleotide; 7 import org.broadinstitute.hellbender.utils.Utils; 8 import org.broadinstitute.hellbender.utils.read.CigarUtils; 9 import org.broadinstitute.hellbender.utils.read.GATKRead; 10 import org.broadinstitute.hellbender.utils.read.ReadUtils; 11 12 import java.util.Arrays; 13 import java.util.Iterator; 14 import java.util.List; 15 16 /** 17 * Represents a clip on a read. It has a type (see the enum) along with a start and stop in the bases 18 * of the read, plus an option extraInfo (useful for carrying info where needed). 19 * <p/> 20 * Also holds the critical apply function that actually execute the clipping operation on a provided read, 21 * according to the wishes of the supplied ClippingAlgorithm enum. 22 */ 23 public final class ClippingOp { 24 public final int start, stop; // inclusive 25 ClippingOp(final int start, final int stop)26 public ClippingOp(final int start, final int stop) { 27 this.start = start; 28 this.stop = stop; 29 } 30 31 getLength()32 public int getLength() { 33 return stop - start + 1; 34 } 35 36 37 /** 38 * Clips the bases in read according to this operation's start and stop. Uses the clipping 39 * representation used is the one provided by algorithm argument. 40 * @param algorithm clipping algorithm to use 41 * @param originalRead the read to be clipped 42 */ apply(final ClippingRepresentation algorithm, final GATKRead originalRead)43 GATKRead apply(final ClippingRepresentation algorithm, final GATKRead originalRead) { 44 switch (algorithm) { 45 // important note: 46 // it's not safe to call read.getBases()[i] = 'N' or read.getBaseQualities()[i] = 0 47 // because you're not guaranteed to get a pointer to the actual array of bytes in the GATKRead 48 case WRITE_NS: { 49 final GATKRead readCopied = originalRead.copy(); 50 applyWriteNs(readCopied); 51 return readCopied; 52 } 53 54 case WRITE_Q0S: { 55 final GATKRead readCopied = originalRead.copy(); 56 applyWriteQ0s(readCopied); 57 return readCopied; 58 } 59 60 case WRITE_NS_Q0S: { 61 final GATKRead readCopied = originalRead.copy(); 62 applyWriteNs(readCopied); 63 applyWriteQ0s(readCopied); 64 return readCopied; 65 } 66 67 case HARDCLIP_BASES: { 68 //Note: passing the original read here because the read is copied right away inside the method 69 return applyHardClipBases(originalRead, start, stop); 70 } 71 72 case SOFTCLIP_BASES: { 73 return applySoftClipBases(originalRead.copy()); 74 } 75 76 case REVERT_SOFTCLIPPED_BASES: { 77 return applyRevertSoftClippedBases(originalRead.copy()); 78 } 79 80 default: { 81 throw new IllegalStateException("Unexpected Clipping operator type " + algorithm); 82 } 83 } 84 } 85 applySoftClipBases(final GATKRead readCopied)86 private GATKRead applySoftClipBases(final GATKRead readCopied) { 87 Utils.validateArg(!readCopied.isUnmapped(), "Read Clipper cannot soft clip unmapped reads"); 88 89 if (readCopied.getLength() <= 2) { // see GATK issue #2022 -- we can't soft-clip all bases 90 return readCopied; 91 } 92 93 // see GATK issue #2022 -- we can't soft-clip all bases 94 final int myStop = Math.min(stop, start + readCopied.getLength() - 2); 95 Utils.validate(start <= 0 || myStop == readCopied.getLength() - 1, () -> String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", readCopied.getName(), start, myStop)); 96 97 final Cigar oldCigar = readCopied.getCigar(); 98 final Cigar newCigar = CigarUtils.clipCigar(oldCigar, start, myStop + 1, CigarOperator.SOFT_CLIP); 99 readCopied.setCigar(newCigar); 100 101 final int alignmentStartShift = start == 0 ? CigarUtils.alignmentStartShift(oldCigar, stop + 1) : 0; 102 final int newStart = readCopied.getStart() + alignmentStartShift; 103 readCopied.setPosition(readCopied.getContig(), newStart); 104 return readCopied; 105 } 106 applyWriteQ0s(final GATKRead readCopied)107 private void applyWriteQ0s(final GATKRead readCopied) { 108 final byte[] newQuals = readCopied.getBaseQualities(); //this makes a copy so we can modify in place 109 overwriteFromStartToStop(newQuals, (byte) 0); 110 readCopied.setBaseQualities(newQuals); 111 } 112 applyWriteNs(final GATKRead readCopied)113 private void applyWriteNs(final GATKRead readCopied) { 114 final byte[] newBases = readCopied.getBases(); //this makes a copy so we can modify in place 115 overwriteFromStartToStop(newBases, Nucleotide.N.encodeAsByte()); 116 readCopied.setBases(newBases); 117 } 118 overwriteFromStartToStop(final byte[] arr, final byte newVal)119 private void overwriteFromStartToStop(final byte[] arr, final byte newVal) { 120 Arrays.fill(arr, start, Math.min(arr.length, stop + 1), newVal); 121 } 122 applyRevertSoftClippedBases(final GATKRead read)123 private GATKRead applyRevertSoftClippedBases(final GATKRead read) { 124 final Cigar originalCigar = read.getCigar(); 125 final List<CigarElement> originalElements = originalCigar.getCigarElements(); 126 if (originalElements.isEmpty() || !(originalElements.get(0).getOperator().isClipping() || originalElements.get(originalElements.size() - 1).getOperator().isClipping())) { 127 return read; 128 } 129 GATKRead unclipped = read.copy(); 130 final Cigar unclippedCigar = CigarUtils.revertSoftClips(originalCigar); 131 unclipped.setCigar(unclippedCigar); 132 133 final int newStart = read.getSoftStart(); 134 135 if (newStart <= 0) { 136 // if the start of the unclipped read occurs before the contig, 137 // we must hard clip away the bases since we cannot represent reads with 138 // negative or 0 alignment start values in the SAMRecord (e.g., 0 means unaligned) 139 140 // We cannot set the read to temporarily have a negative start position, as our Read 141 // interface will not allow it. Instead, since we know that the final start position will 142 // be 1 after the hard clip operation, set it to 1 explicitly. We have to set it twice: 143 // once before the hard clip (to reset the alignment stop / read length in read implementations 144 // that cache these values, such as SAMRecord), and again after the hard clip. 145 unclipped.setPosition(unclipped.getContig(), 1); 146 unclipped = applyHardClipBases(unclipped, 0, -newStart); 147 148 // Reset the position to 1 again only if we didn't end up with an empty, unmapped read after hard clipping. 149 // See https://github.com/broadinstitute/gatk/issues/3845 150 if (!unclipped.isUnmapped()) { 151 unclipped.setPosition(unclipped.getContig(), 1); 152 } 153 154 return unclipped; 155 } else { 156 unclipped.setPosition(unclipped.getContig(), newStart); 157 return unclipped; 158 } 159 } 160 161 /** 162 * Hard clip bases from read, from start to stop in base coordinates 163 * <p> 164 * If start == 0, then we will clip from the front of the read, otherwise we clip 165 * from the right. If start == 0 and stop == 10, this would clip out the first 166 * 10 bases of the read. 167 * <p> 168 * Note that this function works with reads with negative alignment starts, in order to 169 * allow us to hardClip reads that have had their soft clips reverted and so might have 170 * negative alignment starts 171 * <p> 172 * Works properly with reduced reads and insertion/deletion base qualities 173 * <p> 174 * Note: this method does not assume that the read is directly modifiable 175 * and makes a copy of it. 176 * 177 * @param read a non-null read 178 * @param start a start >= 0 and < read.length 179 * @param stop a stop >= 0 and < read.length. 180 * @return a cloned version of read that has been properly trimmed down (Could be an empty, unmapped read) 181 */ applyHardClipBases(final GATKRead read, final int start, final int stop)182 private GATKRead applyHardClipBases(final GATKRead read, final int start, final int stop) { 183 final int newLength = read.getLength() - (stop - start + 1); 184 185 // If the new read is going to be empty, return an empty read now. This avoids initializing the new 186 // read with invalid values below in certain cases (such as a negative alignment start). 187 // See https://github.com/broadinstitute/gatk/issues/3466 188 if (newLength == 0) { 189 return ReadUtils.emptyRead(read); 190 } 191 192 // If the read is unmapped there is no Cigar string and neither should we create a new cigar string 193 194 final Cigar cigar = read.getCigar();//Get the cigar once to avoid multiple calls because each makes a copy of the cigar 195 final Cigar newCigar = read.isUnmapped() ? new Cigar() : CigarUtils.clipCigar(cigar, start, stop + 1, CigarOperator.HARD_CLIP); 196 197 final byte[] newBases = new byte[newLength]; 198 final byte[] newQuals = new byte[newLength]; 199 final int copyStart = (start == 0) ? stop + 1 : 0; 200 201 System.arraycopy(read.getBases(), copyStart, newBases, 0, newLength); 202 System.arraycopy(read.getBaseQualities(), copyStart, newQuals, 0, newLength); 203 204 final GATKRead hardClippedRead = read.copy(); 205 206 hardClippedRead.setBaseQualities(newQuals); 207 hardClippedRead.setBases(newBases); 208 hardClippedRead.setCigar(newCigar); 209 if (start == 0 && !read.isUnmapped()) { 210 hardClippedRead.setPosition(read.getContig(), read.getStart() + CigarUtils.alignmentStartShift(cigar, stop + 1)); 211 } 212 213 if (ReadUtils.hasBaseIndelQualities(read)) { 214 final byte[] newBaseInsertionQuals = new byte[newLength]; 215 final byte[] newBaseDeletionQuals = new byte[newLength]; 216 System.arraycopy(ReadUtils.getBaseInsertionQualities(read), copyStart, newBaseInsertionQuals, 0, newLength); 217 System.arraycopy(ReadUtils.getBaseDeletionQualities(read), copyStart, newBaseDeletionQuals, 0, newLength); 218 ReadUtils.setInsertionBaseQualities(hardClippedRead, newBaseInsertionQuals); 219 ReadUtils.setDeletionBaseQualities(hardClippedRead, newBaseDeletionQuals); 220 } 221 222 return hardClippedRead; 223 } 224 }