1 package org.broadinstitute.hellbender.utils.clipping;
2 
3 import htsjdk.samtools.Cigar;
4 import htsjdk.samtools.CigarElement;
5 import htsjdk.samtools.CigarOperator;
6 import org.broadinstitute.hellbender.utils.Nucleotide;
7 import org.broadinstitute.hellbender.utils.Utils;
8 import org.broadinstitute.hellbender.utils.read.CigarUtils;
9 import org.broadinstitute.hellbender.utils.read.GATKRead;
10 import org.broadinstitute.hellbender.utils.read.ReadUtils;
11 
12 import java.util.Arrays;
13 import java.util.Iterator;
14 import java.util.List;
15 
16 /**
17  * Represents a clip on a read.  It has a type (see the enum) along with a start and stop in the bases
18  * of the read, plus an option extraInfo (useful for carrying info where needed).
19  * <p/>
20  * Also holds the critical apply function that actually execute the clipping operation on a provided read,
21  * according to the wishes of the supplied ClippingAlgorithm enum.
22  */
23 public final class ClippingOp {
24     public final int start, stop; // inclusive
25 
ClippingOp(final int start, final int stop)26     public ClippingOp(final int start, final int stop) {
27         this.start = start;
28         this.stop = stop;
29     }
30 
31 
getLength()32     public int getLength() {
33         return stop - start + 1;
34     }
35 
36 
37     /**
38      * Clips the bases in read according to this operation's start and stop.  Uses the clipping
39      * representation used is the one provided by algorithm argument.
40      *  @param algorithm    clipping algorithm to use
41      * @param originalRead the read to be clipped
42      */
apply(final ClippingRepresentation algorithm, final GATKRead originalRead)43     GATKRead apply(final ClippingRepresentation algorithm, final GATKRead originalRead) {
44         switch (algorithm) {
45             // important note:
46             //   it's not safe to call read.getBases()[i] = 'N' or read.getBaseQualities()[i] = 0
47             //   because you're not guaranteed to get a pointer to the actual array of bytes in the GATKRead
48             case WRITE_NS: {
49                 final GATKRead readCopied = originalRead.copy();
50                 applyWriteNs(readCopied);
51                 return readCopied;
52             }
53 
54             case WRITE_Q0S: {
55                 final GATKRead readCopied = originalRead.copy();
56                 applyWriteQ0s(readCopied);
57                 return readCopied;
58             }
59 
60             case WRITE_NS_Q0S: {
61                 final GATKRead readCopied = originalRead.copy();
62                 applyWriteNs(readCopied);
63                 applyWriteQ0s(readCopied);
64                 return readCopied;
65             }
66 
67             case HARDCLIP_BASES: {
68                 //Note: passing the original read here because the read is copied right away inside the method
69                 return applyHardClipBases(originalRead, start, stop);
70             }
71 
72             case SOFTCLIP_BASES: {
73                 return applySoftClipBases(originalRead.copy());
74             }
75 
76             case REVERT_SOFTCLIPPED_BASES: {
77                 return applyRevertSoftClippedBases(originalRead.copy());
78             }
79 
80             default: {
81                 throw new IllegalStateException("Unexpected Clipping operator type " + algorithm);
82             }
83         }
84     }
85 
applySoftClipBases(final GATKRead readCopied)86     private GATKRead applySoftClipBases(final GATKRead readCopied) {
87         Utils.validateArg(!readCopied.isUnmapped(), "Read Clipper cannot soft clip unmapped reads");
88 
89         if (readCopied.getLength() <= 2) {  // see GATK issue #2022 -- we can't soft-clip all bases
90             return readCopied;
91         }
92 
93         // see GATK issue #2022 -- we can't soft-clip all bases
94         final int myStop = Math.min(stop, start + readCopied.getLength() - 2);
95         Utils.validate(start <= 0 || myStop == readCopied.getLength() - 1, () -> String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", readCopied.getName(), start, myStop));
96 
97         final Cigar oldCigar = readCopied.getCigar();
98         final Cigar newCigar = CigarUtils.clipCigar(oldCigar, start, myStop + 1, CigarOperator.SOFT_CLIP);
99         readCopied.setCigar(newCigar);
100 
101         final int alignmentStartShift = start == 0 ? CigarUtils.alignmentStartShift(oldCigar, stop + 1) : 0;
102         final int newStart = readCopied.getStart() + alignmentStartShift;
103         readCopied.setPosition(readCopied.getContig(), newStart);
104         return readCopied;
105     }
106 
applyWriteQ0s(final GATKRead readCopied)107     private void applyWriteQ0s(final GATKRead readCopied) {
108         final byte[] newQuals = readCopied.getBaseQualities(); //this makes a copy so we can modify in place
109         overwriteFromStartToStop(newQuals, (byte) 0);
110         readCopied.setBaseQualities(newQuals);
111     }
112 
applyWriteNs(final GATKRead readCopied)113     private void applyWriteNs(final GATKRead readCopied) {
114         final byte[] newBases = readCopied.getBases();       //this makes a copy so we can modify in place
115         overwriteFromStartToStop(newBases, Nucleotide.N.encodeAsByte());
116         readCopied.setBases(newBases);
117     }
118 
overwriteFromStartToStop(final byte[] arr, final byte newVal)119     private void overwriteFromStartToStop(final byte[] arr, final byte newVal) {
120         Arrays.fill(arr, start, Math.min(arr.length, stop + 1), newVal);
121     }
122 
applyRevertSoftClippedBases(final GATKRead read)123     private GATKRead applyRevertSoftClippedBases(final GATKRead read) {
124         final Cigar originalCigar = read.getCigar();
125         final List<CigarElement> originalElements = originalCigar.getCigarElements();
126         if (originalElements.isEmpty() || !(originalElements.get(0).getOperator().isClipping() || originalElements.get(originalElements.size() - 1).getOperator().isClipping())) {
127             return read;
128         }
129         GATKRead unclipped = read.copy();
130         final Cigar unclippedCigar = CigarUtils.revertSoftClips(originalCigar);
131         unclipped.setCigar(unclippedCigar);
132 
133         final int newStart = read.getSoftStart();
134 
135         if (newStart <= 0) {
136             // if the start of the unclipped read occurs before the contig,
137             // we must hard clip away the bases since we cannot represent reads with
138             // negative or 0 alignment start values in the SAMRecord (e.g., 0 means unaligned)
139 
140             // We cannot set the read to temporarily have a negative start position, as our Read
141             // interface will not allow it. Instead, since we know that the final start position will
142             // be 1 after the hard clip operation, set it to 1 explicitly. We have to set it twice:
143             // once before the hard clip (to reset the alignment stop / read length in read implementations
144             // that cache these values, such as SAMRecord), and again after the hard clip.
145             unclipped.setPosition(unclipped.getContig(), 1);
146             unclipped = applyHardClipBases(unclipped, 0, -newStart);
147 
148             // Reset the position to 1 again only if we didn't end up with an empty, unmapped read after hard clipping.
149             // See https://github.com/broadinstitute/gatk/issues/3845
150             if (!unclipped.isUnmapped()) {
151                 unclipped.setPosition(unclipped.getContig(), 1);
152             }
153 
154             return unclipped;
155         } else {
156             unclipped.setPosition(unclipped.getContig(), newStart);
157             return unclipped;
158         }
159     }
160 
161     /**
162      * Hard clip bases from read, from start to stop in base coordinates
163      * <p>
164      * If start == 0, then we will clip from the front of the read, otherwise we clip
165      * from the right.  If start == 0 and stop == 10, this would clip out the first
166      * 10 bases of the read.
167      * <p>
168      * Note that this function works with reads with negative alignment starts, in order to
169      * allow us to hardClip reads that have had their soft clips reverted and so might have
170      * negative alignment starts
171      * <p>
172      * Works properly with reduced reads and insertion/deletion base qualities
173      * <p>
174      * Note: this method does not assume that the read is directly modifiable
175      * and makes a copy of it.
176      *
177      * @param read  a non-null read
178      * @param start a start >= 0 and < read.length
179      * @param stop  a stop >= 0 and < read.length.
180      * @return a cloned version of read that has been properly trimmed down (Could be an empty, unmapped read)
181      */
applyHardClipBases(final GATKRead read, final int start, final int stop)182     private GATKRead applyHardClipBases(final GATKRead read, final int start, final int stop) {
183         final int newLength = read.getLength() - (stop - start + 1);
184 
185         // If the new read is going to be empty, return an empty read now. This avoids initializing the new
186         // read with invalid values below in certain cases (such as a negative alignment start).
187         // See https://github.com/broadinstitute/gatk/issues/3466
188         if (newLength == 0) {
189             return ReadUtils.emptyRead(read);
190         }
191 
192         // If the read is unmapped there is no Cigar string and neither should we create a new cigar string
193 
194         final Cigar cigar = read.getCigar();//Get the cigar once to avoid multiple calls because each makes a copy of the cigar
195         final Cigar newCigar = read.isUnmapped() ? new Cigar() : CigarUtils.clipCigar(cigar, start, stop + 1, CigarOperator.HARD_CLIP);
196 
197         final byte[] newBases = new byte[newLength];
198         final byte[] newQuals = new byte[newLength];
199         final int copyStart = (start == 0) ? stop + 1 : 0;
200 
201         System.arraycopy(read.getBases(), copyStart, newBases, 0, newLength);
202         System.arraycopy(read.getBaseQualities(), copyStart, newQuals, 0, newLength);
203 
204         final GATKRead hardClippedRead = read.copy();
205 
206         hardClippedRead.setBaseQualities(newQuals);
207         hardClippedRead.setBases(newBases);
208         hardClippedRead.setCigar(newCigar);
209         if (start == 0 && !read.isUnmapped()) {
210             hardClippedRead.setPosition(read.getContig(), read.getStart() + CigarUtils.alignmentStartShift(cigar, stop + 1));
211         }
212 
213         if (ReadUtils.hasBaseIndelQualities(read)) {
214             final byte[] newBaseInsertionQuals = new byte[newLength];
215             final byte[] newBaseDeletionQuals = new byte[newLength];
216             System.arraycopy(ReadUtils.getBaseInsertionQualities(read), copyStart, newBaseInsertionQuals, 0, newLength);
217             System.arraycopy(ReadUtils.getBaseDeletionQualities(read), copyStart, newBaseDeletionQuals, 0, newLength);
218             ReadUtils.setInsertionBaseQualities(hardClippedRead, newBaseInsertionQuals);
219             ReadUtils.setDeletionBaseQualities(hardClippedRead, newBaseDeletionQuals);
220         }
221 
222         return hardClippedRead;
223     }
224 }