1 package org.broadinstitute.hellbender.tools;
2 
3 import htsjdk.samtools.reference.ReferenceSequence;
4 import htsjdk.samtools.reference.ReferenceSequenceFile;
5 import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
6 import htsjdk.samtools.util.StringUtil;
7 import org.apache.commons.lang3.StringUtils;
8 import org.apache.commons.lang3.tuple.MutablePair;
9 import org.apache.commons.lang3.tuple.Pair;
10 import org.apache.logging.log4j.LogManager;
11 import org.apache.logging.log4j.Logger;
12 import org.broadinstitute.barclay.argparser.Argument;
13 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
14 import org.broadinstitute.barclay.argparser.WorkflowProperties;
15 import org.broadinstitute.barclay.argparser.WorkflowOutput;
16 import org.broadinstitute.barclay.help.DocumentedFeature;
17 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
18 import org.broadinstitute.hellbender.engine.GATKPath;
19 import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
20 import org.broadinstitute.hellbender.engine.FeatureContext;
21 import org.broadinstitute.hellbender.engine.ReadWalker;
22 import org.broadinstitute.hellbender.engine.ReferenceContext;
23 import org.broadinstitute.hellbender.utils.BaseUtils;
24 import org.broadinstitute.hellbender.utils.clipping.ClippingOp;
25 import org.broadinstitute.hellbender.utils.clipping.ClippingRepresentation;
26 import org.broadinstitute.hellbender.utils.clipping.ReadClipper;
27 import org.broadinstitute.hellbender.utils.read.GATKRead;
28 import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;
29 
30 import java.io.PrintStream;
31 import java.util.*;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
34 
35 /**
36  * Read clipping based on quality, position or sequence matching. This tool provides simple, powerful read clipping
37  * capabilities that allow you to remove low quality strings of bases, sections of reads, and reads containing
38  * user-provided sequences.
39  *
40  * <p>There are three arguments for clipping (quality, position and sequence), which can be used alone or in
41  * combination. In addition, you can also specify a clipping representation, which determines exactly how ClipReads
42  * applies clips to the reads (soft clips, writing Q0 base quality scores, etc.). Please note that you MUST specify at
43  * least one of the three clipping arguments, and specifying a clipping representation is not sufficient. If you do not
44  * specify a clipping argument, the program will run but it will not do anything to your reads.</p>
45  *
46  * <h4>Quality score based clipping</h4>
47  *
48  * <p>Clip bases from the read in clipper from</p>
49  *
50  * <pre>argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual)</pre>
51  *
52  * <p>to the end of the read.  This is copied from BWA.</p>
53  *
54  * <p>Walk through the read from the end (in machine cycle order) to the beginning, calculating the
55  * running sum of qTrimmingThreshold - qual.  While we do this, we track the maximum value of this
56  * sum where the delta > 0.  After the loop, clipPoint is either -1 (don't do anything) or the
57  * clipping index in the read (from the end).</p>
58  *
59  * <h4>Cycle based clipping</h4>
60  *
61  * <p>Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc.
62  * For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based values (positions).
63  * For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, and 12.</p>
64  *
65  * <h4>Sequence matching</h4>
66  *
67  * <p>Clips bases that exactly match one of a number of base sequences. This employs an exact match algorithm,
68  * filtering only bases whose sequence exactly matches SEQ.</p>
69  *
70  *
71  * <h3>Input</h3>
72  * <p>Any number of SAM/BAM/CRAM files.</p>
73  *
74  * <h3>Output</h3>
75  * <p>A new SAM/BAM/CRAM file containing all of the reads from the input SAM/BAM/CRAMs with the user-specified clipping
76  * operation applied to each read.</p>
77  *
78  * <h4>Summary output (console)</h4>
79  * <pre>
80  * Number of examined reads              13
81  * Number of clipped reads               13
82  * Percent of clipped reads              100.00
83  * Number of examined bases              988
84  * Number of clipped bases               126
85  * Percent of clipped bases              12.75
86  * Number of quality-score clipped bases 126
87  * Number of range clipped bases         0
88  * Number of sequence clipped bases      0
89  * </pre>
90  *
91  * <h3>Example usage</h3>
92  * <pre>gatk ClipReads \
93  *   -I input_reads.bam \
94  *   -O output_reads.bam \
95  *   -XF sequences.fasta \
96  *   -X CCCCC \
97  *   -CT "1-5,11-15" \
98  *   -QT 10</pre>
99  *
100  * <p>The command line shown above will apply all three arguments in combination. See the detailed examples below to see how the choice of clipping representation affects the output.</p>
101  *
102  *     <h4>Detailed clipping examples</h4>
103  *     <p>Suppose we are given this read:</p>
104  *     <pre>
105  *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
106  *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
107  *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
108  *     </pre>
109  *
110  *     <p>If we are clipping reads with -QT 10 and -CR WRITE_NS, we get:</p>
111  *
112  *     <pre>
113  *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
114  *          NNNNNNNNNNNNNNNNNTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
115  *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
116  *     </pre>
117  *
118  *     <p>Whereas with -QT 10 -CR WRITE_Q0S:</p>
119  *     <pre>
120  *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
121  *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
122  *          !!!!!!!!!!!!!!!!!4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
123  *     </pre>
124  *
125  *     <p>Or -QT 10 -CR SOFTCLIP_BASES:</p>
126  *     <pre>
127  *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3133    29      17S59M  *       *       *
128  *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
129  *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
130  *     </pre>
131  *
132  */
133 @CommandLineProgramProperties(
134         summary = "Read clipping based on quality, position or sequence matching. This tool provides simple, powerful read clipping " +
135                 "capabilities that allow you to remove low quality strings of bases, sections of reads, and reads containing " +
136                 "user-provided sequences.",
137         oneLineSummary = "Clip reads in a SAM/BAM/CRAM file",
138         programGroup = ReadDataManipulationProgramGroup.class
139 )
140 @DocumentedFeature
141 @WorkflowProperties
142 public final class ClipReads extends ReadWalker {
143 
144     private final Logger logger = LogManager.getLogger(ClipReads.class);
145 
146     public static final String OUTPUT_STATISTICS_LONG_NAME = "output-statistics";
147     public static final String OUTPUT_STATISTICS_SHORT_NAME = "os";
148     public static final String Q_TRIMMING_THRESHOLD_LONG_NAME = "q-trimming-threshold";
149     public static final String Q_TRIMMING_THRESHOLD_SHORT_NAME = "QT";
150     public static final String CYCLES_TO_TRIM_LONG_NAME = "cycles-to-trim";
151     public static final String CYCLES_TO_TRIM_SHORT_NAME = "CT";
152     public static final String CLIP_SEQUENCES_FILE_LONG_NAME = "clip-sequences-file";
153     public static final String CLIP_SEQUENCES_FILE_SHORT_NAME = "XF";
154     public static final String CLIP_SEQUENCE_LONG_NAME = "clip-sequence";
155     public static final String CLIP_SEQUENCE_SHORT_NAME = "X";
156     public static final String CLIP_REPRESENTATION_LONG_NAME = "clip-representation";
157     public static final String CLIP_REPRESENTATION_SHORT_NAME = "CR";
158     public static final String READ_LONG_NAME = "read";
159     public static final String READ_SHORT_NAME = READ_LONG_NAME;
160 
161     /**
162      * The output SAM/BAM/CRAM file will be written here
163      */
164     @Argument(doc = "BAM output file", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME)
165     @WorkflowOutput(optionalCompanions={StandardArgumentDefinitions.OUTPUT_INDEX_COMPANION})
166     GATKPath OUTPUT;
167 
168     /**
169      * If provided, ClipReads will write summary statistics about the clipping operations applied to the reads in this file.
170      */
171     @Argument(fullName = OUTPUT_STATISTICS_LONG_NAME, shortName = OUTPUT_STATISTICS_SHORT_NAME, doc = "File to output statistics", optional = true)
172     @WorkflowOutput
173     GATKPath STATSOUTPUT = null;
174 
175     /**
176      * If a value > 0 is provided, then the quality score based read clipper will be applied to the reads using this
177      * quality score threshold.
178      */
179     @Argument(fullName = Q_TRIMMING_THRESHOLD_LONG_NAME, shortName = Q_TRIMMING_THRESHOLD_SHORT_NAME, doc = "If provided, the Q-score clipper will be applied", optional = true)
180     int qTrimmingThreshold = -1;
181 
182     /**
183      * Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc.
184      * For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based
185      * values (positions). For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11,
186      * and 12.
187      */
188     @Argument(fullName = CYCLES_TO_TRIM_LONG_NAME, shortName = CYCLES_TO_TRIM_SHORT_NAME, doc = "String indicating machine cycles to clip from the reads", optional = true)
189     String cyclesToClipArg = null;
190 
191     /**
192      * Reads the sequences in the provided FASTA file, and clip any bases that exactly match any of the
193      * sequences in the file.
194      */
195     @Argument(fullName = CLIP_SEQUENCES_FILE_LONG_NAME, shortName = CLIP_SEQUENCES_FILE_SHORT_NAME, doc = "Remove sequences within reads matching the sequences in this FASTA file", optional = true)
196     GATKPath clipSequenceFile = null;
197 
198     /**
199      * Clips bases from the reads matching the provided SEQ.
200      */
201     @Argument(fullName = CLIP_SEQUENCE_LONG_NAME, shortName = CLIP_SEQUENCE_SHORT_NAME, doc = "Remove sequences within reads matching this sequence", optional = true)
202     List<String> clipSequencesArgs = null;
203 
204     /**
205      * The different values for this argument determines how ClipReads applies clips to the reads.  This can range
206      * from writing Ns over the clipped bases to hard clipping away the bases from the BAM.
207      */
208     @Argument(fullName = CLIP_REPRESENTATION_LONG_NAME, shortName = CLIP_REPRESENTATION_SHORT_NAME, doc = "How should we actually clip the bases?", optional = true)
209     ClippingRepresentation clippingRepresentation = ClippingRepresentation.WRITE_NS;
210 
211     @Argument(fullName=READ_LONG_NAME, shortName = READ_SHORT_NAME, doc="", optional = true)
212     String onlyDoRead = null;
213 
214     /**
215      * List of sequence that should be clipped from the reads
216      */
217     private final List<SeqToClip> sequencesToClip = new ArrayList<>();
218 
219     /**
220      * List of cycle start / stop pairs (0-based, stop is included in the cycle to remove) to clip from the reads
221      */
222     private List<Pair<Integer, Integer>> cyclesToClip = null;
223 
224     /**
225      * Output reads is written to this BAM.
226      */
227     private SAMFileGATKReadWriter outputBam;
228 
229     /**
230      * Accumulator for the stats.
231      */
232     private ClippingData accumulator;
233 
234     /**
235      * Output stream for the stats.
236      */
237     private PrintStream outputStats;
238 
239     /**
240      * The initialize function.
241      */
242     @Override
onTraversalStart()243     public void onTraversalStart() {
244         if (qTrimmingThreshold >= 0) {
245             logger.info(String.format("Creating Q-score clipper with threshold %d", qTrimmingThreshold));
246         }
247 
248         //
249         // Initialize the sequences to clip
250         //
251         if (clipSequencesArgs != null) {
252             int i = 0;
253             for (String toClip : clipSequencesArgs) {
254                 i++;
255                 ReferenceSequence rs = new ReferenceSequence("CMDLINE-" + i, -1, StringUtil.stringToBytes(toClip));
256                 addSeqToClip(rs.getName(), rs.getBases());
257             }
258         }
259 
260         if (clipSequenceFile != null) {
261             ReferenceSequenceFile rsf = ReferenceSequenceFileFactory.getReferenceSequenceFile(clipSequenceFile.toPath());
262 
263             while (true) {
264                 ReferenceSequence rs = rsf.nextSequence();
265                 if (rs == null)
266                     break;
267                 else {
268                     addSeqToClip(rs.getName(), rs.getBases());
269                 }
270             }
271         }
272 
273 
274         //
275         // Initialize the cycle ranges to clip
276         //
277         if (cyclesToClipArg != null) {
278             cyclesToClip = new ArrayList<>();
279             for (String range : cyclesToClipArg.split(",")) {
280                 try {
281                     String[] elts = range.split("-");
282                     int start = Integer.parseInt(elts[0]) - 1;
283                     int stop = Integer.parseInt(elts[1]) - 1;
284 
285                     if (start < 0) throw new Exception();
286                     if (stop < start) throw new Exception();
287 
288                     logger.info(String.format("Creating cycle clipper %d-%d", start, stop));
289                     cyclesToClip.add(new MutablePair<>(start, stop));
290                 } catch (Exception e) {
291                     throw new RuntimeException("Badly formatted cyclesToClip argument: " + cyclesToClipArg);
292                 }
293             }
294         }
295 
296         final boolean presorted = EnumSet.of(ClippingRepresentation.WRITE_NS, ClippingRepresentation.WRITE_NS_Q0S, ClippingRepresentation.WRITE_Q0S).contains(clippingRepresentation);
297         outputBam = createSAMWriter(OUTPUT, presorted);
298 
299         accumulator = new ClippingData(sequencesToClip);
300         outputStats = STATSOUTPUT == null ? null : new PrintStream(STATSOUTPUT.getOutputStream());
301     }
302 
303     @Override
apply( GATKRead read, ReferenceContext ref, FeatureContext featureContext )304     public void apply( GATKRead read, ReferenceContext ref, FeatureContext featureContext ) {
305         if ( onlyDoRead == null || read.getName().equals(onlyDoRead) ) {
306             if ( clippingRepresentation == ClippingRepresentation.HARDCLIP_BASES || clippingRepresentation == ClippingRepresentation.REVERT_SOFTCLIPPED_BASES )
307                 read = ReadClipper.revertSoftClippedBases(read);
308             ReadClipperWithData clipper = new ReadClipperWithData(read, sequencesToClip);
309 
310             //
311             // run all three clipping modules
312             //
313             clipBadQualityScores(clipper);
314             clipCycles(clipper);
315             clipSequences(clipper);
316             accumulate(clipper);
317         }
318     }
319 
320     @Override
onTraversalSuccess()321     public ClippingData onTraversalSuccess(){
322         if ( outputStats != null ){
323             outputStats.printf(accumulator.toString());
324         }
325         return accumulator;
326     }
327 
328     @Override
closeTool()329     public void closeTool() {
330         if ( outputStats != null ){
331             outputStats.close();
332         }
333         if ( outputBam != null ) {
334             outputBam.close();
335         }
336     }
337 
338     /**
339      * Helper function that adds a seq with name and bases (as bytes) to the list of sequences to be clipped
340      *
341      * @param name
342      * @param bases
343      */
addSeqToClip(String name, byte[] bases)344     private void addSeqToClip(String name, byte[] bases) {
345         SeqToClip clip = new SeqToClip(name, bases);
346         sequencesToClip.add(clip);
347         logger.info(String.format("Creating sequence clipper %s: %s/%s", clip.name, clip.seq, clip.revSeq));
348     }
349 
350 
351     /**
352      * clip sequences from the reads that match all of the sequences in the global sequencesToClip variable.
353      * Adds ClippingOps for each clip to clipper.
354      *
355      * @param clipper
356      */
clipSequences(ReadClipperWithData clipper)357     private void clipSequences(ReadClipperWithData clipper) {
358         if (sequencesToClip != null) {                // don't bother if we don't have any sequences to clip
359             GATKRead read = clipper.getRead();
360             ClippingData data = clipper.getData();
361 
362             for (SeqToClip stc : sequencesToClip) {
363                 // we have a pattern for both the forward and the reverse strands
364                 Pattern pattern = read.isReverseStrand() ? stc.revPat : stc.fwdPat;
365                 String bases = read.getBasesString();
366                 Matcher match = pattern.matcher(bases);
367 
368                 // keep clipping until match.find() says it can't find anything else
369                 boolean found = true;   // go through at least once
370                 while (found) {
371                     found = match.find();
372                     //System.out.printf("Matching %s against %s/%s => %b%n", bases, stc.seq, stc.revSeq, found);
373                     if (found) {
374                         int start = match.start();
375                         int stop = match.end() - 1;
376                         //ClippingOp op = new ClippingOp(ClippingOp.ClippingType.MATCHES_CLIP_SEQ, start, stop, stc.seq);
377                         ClippingOp op = new ClippingOp(start, stop);
378                         clipper.addOp(op);
379                         data.incSeqClippedBases(stc.seq, op.getLength());
380                     }
381                 }
382             }
383             clipper.setData(data);
384         }
385     }
386 
387     /**
388      * Convenence function that takes a read and the start / stop clipping positions based on the forward
389      * strand, and returns start/stop values appropriate for the strand of the read.
390      *
391      * @param read
392      * @param start
393      * @param stop
394      * @return
395      */
strandAwarePositions(GATKRead read, int start, int stop)396     private Pair<Integer, Integer> strandAwarePositions(GATKRead read, int start, int stop) {
397         if (read.isReverseStrand())
398             return new MutablePair<>(read.getLength() - stop - 1, read.getLength() - start - 1);
399         else
400             return new MutablePair<>(start, stop);
401     }
402 
403     /**
404      * clip bases at cycles between the ranges in cyclesToClip by adding appropriate ClippingOps to clipper.
405      *
406      * @param clipper
407      */
clipCycles(ReadClipperWithData clipper)408     private void clipCycles(ReadClipperWithData clipper) {
409         if (cyclesToClip != null) {
410             GATKRead read = clipper.getRead();
411             ClippingData data = clipper.getData();
412 
413             for (Pair<Integer, Integer> p : cyclesToClip) {   // iterate over each cycle range
414                 int cycleStart = p.getLeft();
415                 int cycleStop = p.getRight();
416 
417                 if (cycleStart < read.getLength()) {
418                     // only try to clip if the cycleStart is less than the read's length
419                     if (cycleStop >= read.getLength())
420                         // we do tolerate [for convenience) clipping when the stop is beyond the end of the read
421                         cycleStop = read.getLength() - 1;
422 
423                     Pair<Integer, Integer> startStop = strandAwarePositions(read, cycleStart, cycleStop);
424                     int start = startStop.getLeft();
425                     int stop = startStop.getRight();
426 
427                     //ClippingOp op = new ClippingOp(ClippingOp.ClippingType.WITHIN_CLIP_RANGE, start, stop, null);
428                     ClippingOp op = new ClippingOp(start, stop);
429                     clipper.addOp(op);
430                     data.incNRangeClippedBases(op.getLength());
431                 }
432             }
433             clipper.setData(data);
434         }
435     }
436 
437     /**
438      * Clip bases from the read in clipper from
439      * <p/>
440      * argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual)
441      * <p/>
442      * to the end of the read.  This is blatantly stolen from BWA.
443      * <p/>
444      * Walk through the read from the end (in machine cycle order) to the beginning, calculating the
445      * running sum of qTrimmingThreshold - qual.  While we do this, we track the maximum value of this
446      * sum where the delta > 0.  After the loop, clipPoint is either -1 (don't do anything) or the
447      * clipping index in the read (from the end).
448      *
449      * @param clipper
450      */
clipBadQualityScores(ReadClipperWithData clipper)451     private void clipBadQualityScores(ReadClipperWithData clipper) {
452         GATKRead read = clipper.getRead();
453         ClippingData data = clipper.getData();
454         int readLen = read.getLength();
455         byte[] quals = read.getBaseQualities();
456 
457 
458         int clipSum = 0, lastMax = -1, clipPoint = -1; // -1 means no clip
459         for (int i = readLen - 1; i >= 0; i--) {
460             int baseIndex = read.isReverseStrand() ? readLen - i - 1 : i;
461             byte qual = quals[baseIndex];
462             clipSum += (qTrimmingThreshold - qual);
463             if (clipSum >= 0 && (clipSum >= lastMax)) {
464                 lastMax = clipSum;
465                 clipPoint = baseIndex;
466             }
467         }
468 
469         if (clipPoint != -1) {
470             int start = read.isReverseStrand() ? 0 : clipPoint;
471             int stop = read.isReverseStrand() ? clipPoint : readLen - 1;
472             //clipper.addOp(new ClippingOp(ClippingOp.ClippingType.LOW_Q_SCORES, start, stop, null));
473             ClippingOp op = new ClippingOp(start, stop);
474             clipper.addOp(op);
475             data.incNQClippedBases(op.getLength());
476         }
477         clipper.setData(data);
478     }
479 
accumulate(ReadClipperWithData clipper)480     private void accumulate(ReadClipperWithData clipper) {
481         if ( clipper == null )
482             return;
483 
484         GATKRead clippedRead = clipper.clipRead(clippingRepresentation);
485         outputBam.addRead(clippedRead);
486 
487         accumulator.nTotalReads++;
488         accumulator.nTotalBases += clipper.getRead().getLength();
489         if (clipper.wasClipped()) {
490             accumulator.nClippedReads++;
491             accumulator.addData(clipper.getData());
492         }
493     }
494 
495     // --------------------------------------------------------------------------------------------------------------
496     //
497     // utility classes
498     //
499     // --------------------------------------------------------------------------------------------------------------
500 
501     private static final class SeqToClip {
502         String name;
503         String seq, revSeq;
504         Pattern fwdPat, revPat;
505 
SeqToClip(String name, byte[] bytez)506         public SeqToClip(String name, byte[] bytez) {
507             this.name = name;
508             this.seq = new String(bytez);
509             this.fwdPat = Pattern.compile(seq, Pattern.CASE_INSENSITIVE);
510             this.revSeq = new String(BaseUtils.simpleReverseComplement(bytez));
511             this.revPat = Pattern.compile(revSeq, Pattern.CASE_INSENSITIVE);
512         }
513     }
514 
515     public static final class ClippingData {
516         public long nTotalReads = 0;
517         public long nTotalBases = 0;
518         public long nClippedReads = 0;
519         public long nClippedBases = 0;
520         public long nQClippedBases = 0;
521         public long nRangeClippedBases = 0;
522         public long nSeqClippedBases = 0;
523 
524         SortedMap<String, Long> seqClipCounts = new TreeMap<>();
525 
ClippingData(List<SeqToClip> clipSeqs)526         public ClippingData(List<SeqToClip> clipSeqs) {
527             for (SeqToClip clipSeq : clipSeqs) {
528                 seqClipCounts.put(clipSeq.seq, 0L);
529             }
530         }
531 
incNQClippedBases(int n)532         public void incNQClippedBases(int n) {
533             nQClippedBases += n;
534             nClippedBases += n;
535         }
536 
incNRangeClippedBases(int n)537         public void incNRangeClippedBases(int n) {
538             nRangeClippedBases += n;
539             nClippedBases += n;
540         }
541 
incSeqClippedBases(final String seq, int n)542         public void incSeqClippedBases(final String seq, int n) {
543             nSeqClippedBases += n;
544             nClippedBases += n;
545             seqClipCounts.put(seq, seqClipCounts.get(seq) + n);
546         }
547 
addData(ClippingData data)548         public void addData (ClippingData data) {
549             nTotalReads += data.nTotalReads;
550             nTotalBases += data.nTotalBases;
551             nClippedReads += data.nClippedReads;
552             nClippedBases += data.nClippedBases;
553             nQClippedBases += data.nQClippedBases;
554             nRangeClippedBases += data.nRangeClippedBases;
555             nSeqClippedBases += data.nSeqClippedBases;
556 
557             for (String seqClip : data.seqClipCounts.keySet()) {
558                 Long count = data.seqClipCounts.get(seqClip);
559                 if (seqClipCounts.containsKey(seqClip))
560                     count += seqClipCounts.get(seqClip);
561                 seqClipCounts.put(seqClip, count);
562             }
563         }
564 
toString()565         public String toString() {
566             StringBuilder s = new StringBuilder();
567 
568             s.append(StringUtils.repeat('-', 80) + "\n")
569                     .append(String.format("Number of examined reads              %d%n", nTotalReads))
570                     .append(String.format("Number of clipped reads               %d%n", nClippedReads))
571                     .append(String.format("Percent of clipped reads              %.2f%n", (100.0 * nClippedReads) / nTotalReads))
572                     .append(String.format("Number of examined bases              %d%n", nTotalBases))
573                     .append(String.format("Number of clipped bases               %d%n", nClippedBases))
574                     .append(String.format("Percent of clipped bases              %.2f%n", (100.0 * nClippedBases) / nTotalBases))
575                     .append(String.format("Number of quality-score clipped bases %d%n", nQClippedBases))
576                     .append(String.format("Number of range clipped bases         %d%n", nRangeClippedBases))
577                     .append(String.format("Number of sequence clipped bases      %d%n", nSeqClippedBases));
578 
579             for (Map.Entry<String, Long> elt : seqClipCounts.entrySet()) {
580                 s.append(String.format("  %8d clip sites matching %s%n", elt.getValue(), elt.getKey()));
581             }
582 
583             s.append(StringUtils.repeat('-', 80) + "\n");
584             return s.toString();
585         }
586     }
587 
588     public static final class ReadClipperWithData extends ReadClipper {
589         private ClippingData data;
590 
ReadClipperWithData(GATKRead read, List<SeqToClip> clipSeqs)591         public ReadClipperWithData(GATKRead read, List<SeqToClip> clipSeqs) {
592             super(read);
593             data = new ClippingData(clipSeqs);
594         }
595 
getData()596         public ClippingData getData() {
597             return data;
598         }
599 
setData(ClippingData data)600         public void setData(ClippingData data) {
601             this.data = data;
602         }
603 
addData(ClippingData data)604         public void addData(ClippingData data) {
605             this.data.addData(data);
606         }
607     }
608 
609 
610 }
611