1 package org.broadinstitute.hellbender.tools; 2 3 import htsjdk.samtools.reference.ReferenceSequence; 4 import htsjdk.samtools.reference.ReferenceSequenceFile; 5 import htsjdk.samtools.reference.ReferenceSequenceFileFactory; 6 import htsjdk.samtools.util.StringUtil; 7 import org.apache.commons.lang3.StringUtils; 8 import org.apache.commons.lang3.tuple.MutablePair; 9 import org.apache.commons.lang3.tuple.Pair; 10 import org.apache.logging.log4j.LogManager; 11 import org.apache.logging.log4j.Logger; 12 import org.broadinstitute.barclay.argparser.Argument; 13 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; 14 import org.broadinstitute.barclay.argparser.WorkflowProperties; 15 import org.broadinstitute.barclay.argparser.WorkflowOutput; 16 import org.broadinstitute.barclay.help.DocumentedFeature; 17 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; 18 import org.broadinstitute.hellbender.engine.GATKPath; 19 import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; 20 import org.broadinstitute.hellbender.engine.FeatureContext; 21 import org.broadinstitute.hellbender.engine.ReadWalker; 22 import org.broadinstitute.hellbender.engine.ReferenceContext; 23 import org.broadinstitute.hellbender.utils.BaseUtils; 24 import org.broadinstitute.hellbender.utils.clipping.ClippingOp; 25 import org.broadinstitute.hellbender.utils.clipping.ClippingRepresentation; 26 import org.broadinstitute.hellbender.utils.clipping.ReadClipper; 27 import org.broadinstitute.hellbender.utils.read.GATKRead; 28 import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter; 29 30 import java.io.PrintStream; 31 import java.util.*; 32 import java.util.regex.Matcher; 33 import java.util.regex.Pattern; 34 35 /** 36 * Read clipping based on quality, position or sequence matching. This tool provides simple, powerful read clipping 37 * capabilities that allow you to remove low quality strings of bases, sections of reads, and reads containing 38 * user-provided sequences. 39 * 40 * <p>There are three arguments for clipping (quality, position and sequence), which can be used alone or in 41 * combination. In addition, you can also specify a clipping representation, which determines exactly how ClipReads 42 * applies clips to the reads (soft clips, writing Q0 base quality scores, etc.). Please note that you MUST specify at 43 * least one of the three clipping arguments, and specifying a clipping representation is not sufficient. If you do not 44 * specify a clipping argument, the program will run but it will not do anything to your reads.</p> 45 * 46 * <h4>Quality score based clipping</h4> 47 * 48 * <p>Clip bases from the read in clipper from</p> 49 * 50 * <pre>argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual)</pre> 51 * 52 * <p>to the end of the read. This is copied from BWA.</p> 53 * 54 * <p>Walk through the read from the end (in machine cycle order) to the beginning, calculating the 55 * running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this 56 * sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the 57 * clipping index in the read (from the end).</p> 58 * 59 * <h4>Cycle based clipping</h4> 60 * 61 * <p>Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc. 62 * For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based values (positions). 63 * For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, and 12.</p> 64 * 65 * <h4>Sequence matching</h4> 66 * 67 * <p>Clips bases that exactly match one of a number of base sequences. This employs an exact match algorithm, 68 * filtering only bases whose sequence exactly matches SEQ.</p> 69 * 70 * 71 * <h3>Input</h3> 72 * <p>Any number of SAM/BAM/CRAM files.</p> 73 * 74 * <h3>Output</h3> 75 * <p>A new SAM/BAM/CRAM file containing all of the reads from the input SAM/BAM/CRAMs with the user-specified clipping 76 * operation applied to each read.</p> 77 * 78 * <h4>Summary output (console)</h4> 79 * <pre> 80 * Number of examined reads 13 81 * Number of clipped reads 13 82 * Percent of clipped reads 100.00 83 * Number of examined bases 988 84 * Number of clipped bases 126 85 * Percent of clipped bases 12.75 86 * Number of quality-score clipped bases 126 87 * Number of range clipped bases 0 88 * Number of sequence clipped bases 0 89 * </pre> 90 * 91 * <h3>Example usage</h3> 92 * <pre>gatk ClipReads \ 93 * -I input_reads.bam \ 94 * -O output_reads.bam \ 95 * -XF sequences.fasta \ 96 * -X CCCCC \ 97 * -CT "1-5,11-15" \ 98 * -QT 10</pre> 99 * 100 * <p>The command line shown above will apply all three arguments in combination. See the detailed examples below to see how the choice of clipping representation affects the output.</p> 101 * 102 * <h4>Detailed clipping examples</h4> 103 * <p>Suppose we are given this read:</p> 104 * <pre> 105 * 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * * 106 * TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA 107 * #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA 108 * </pre> 109 * 110 * <p>If we are clipping reads with -QT 10 and -CR WRITE_NS, we get:</p> 111 * 112 * <pre> 113 * 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * * 114 * NNNNNNNNNNNNNNNNNTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA 115 * #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA 116 * </pre> 117 * 118 * <p>Whereas with -QT 10 -CR WRITE_Q0S:</p> 119 * <pre> 120 * 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * * 121 * TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA 122 * !!!!!!!!!!!!!!!!!4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA 123 * </pre> 124 * 125 * <p>Or -QT 10 -CR SOFTCLIP_BASES:</p> 126 * <pre> 127 * 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3133 29 17S59M * * * 128 * TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA 129 * #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA 130 * </pre> 131 * 132 */ 133 @CommandLineProgramProperties( 134 summary = "Read clipping based on quality, position or sequence matching. This tool provides simple, powerful read clipping " + 135 "capabilities that allow you to remove low quality strings of bases, sections of reads, and reads containing " + 136 "user-provided sequences.", 137 oneLineSummary = "Clip reads in a SAM/BAM/CRAM file", 138 programGroup = ReadDataManipulationProgramGroup.class 139 ) 140 @DocumentedFeature 141 @WorkflowProperties 142 public final class ClipReads extends ReadWalker { 143 144 private final Logger logger = LogManager.getLogger(ClipReads.class); 145 146 public static final String OUTPUT_STATISTICS_LONG_NAME = "output-statistics"; 147 public static final String OUTPUT_STATISTICS_SHORT_NAME = "os"; 148 public static final String Q_TRIMMING_THRESHOLD_LONG_NAME = "q-trimming-threshold"; 149 public static final String Q_TRIMMING_THRESHOLD_SHORT_NAME = "QT"; 150 public static final String CYCLES_TO_TRIM_LONG_NAME = "cycles-to-trim"; 151 public static final String CYCLES_TO_TRIM_SHORT_NAME = "CT"; 152 public static final String CLIP_SEQUENCES_FILE_LONG_NAME = "clip-sequences-file"; 153 public static final String CLIP_SEQUENCES_FILE_SHORT_NAME = "XF"; 154 public static final String CLIP_SEQUENCE_LONG_NAME = "clip-sequence"; 155 public static final String CLIP_SEQUENCE_SHORT_NAME = "X"; 156 public static final String CLIP_REPRESENTATION_LONG_NAME = "clip-representation"; 157 public static final String CLIP_REPRESENTATION_SHORT_NAME = "CR"; 158 public static final String READ_LONG_NAME = "read"; 159 public static final String READ_SHORT_NAME = READ_LONG_NAME; 160 161 /** 162 * The output SAM/BAM/CRAM file will be written here 163 */ 164 @Argument(doc = "BAM output file", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME) 165 @WorkflowOutput(optionalCompanions={StandardArgumentDefinitions.OUTPUT_INDEX_COMPANION}) 166 GATKPath OUTPUT; 167 168 /** 169 * If provided, ClipReads will write summary statistics about the clipping operations applied to the reads in this file. 170 */ 171 @Argument(fullName = OUTPUT_STATISTICS_LONG_NAME, shortName = OUTPUT_STATISTICS_SHORT_NAME, doc = "File to output statistics", optional = true) 172 @WorkflowOutput 173 GATKPath STATSOUTPUT = null; 174 175 /** 176 * If a value > 0 is provided, then the quality score based read clipper will be applied to the reads using this 177 * quality score threshold. 178 */ 179 @Argument(fullName = Q_TRIMMING_THRESHOLD_LONG_NAME, shortName = Q_TRIMMING_THRESHOLD_SHORT_NAME, doc = "If provided, the Q-score clipper will be applied", optional = true) 180 int qTrimmingThreshold = -1; 181 182 /** 183 * Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc. 184 * For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based 185 * values (positions). For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, 186 * and 12. 187 */ 188 @Argument(fullName = CYCLES_TO_TRIM_LONG_NAME, shortName = CYCLES_TO_TRIM_SHORT_NAME, doc = "String indicating machine cycles to clip from the reads", optional = true) 189 String cyclesToClipArg = null; 190 191 /** 192 * Reads the sequences in the provided FASTA file, and clip any bases that exactly match any of the 193 * sequences in the file. 194 */ 195 @Argument(fullName = CLIP_SEQUENCES_FILE_LONG_NAME, shortName = CLIP_SEQUENCES_FILE_SHORT_NAME, doc = "Remove sequences within reads matching the sequences in this FASTA file", optional = true) 196 GATKPath clipSequenceFile = null; 197 198 /** 199 * Clips bases from the reads matching the provided SEQ. 200 */ 201 @Argument(fullName = CLIP_SEQUENCE_LONG_NAME, shortName = CLIP_SEQUENCE_SHORT_NAME, doc = "Remove sequences within reads matching this sequence", optional = true) 202 List<String> clipSequencesArgs = null; 203 204 /** 205 * The different values for this argument determines how ClipReads applies clips to the reads. This can range 206 * from writing Ns over the clipped bases to hard clipping away the bases from the BAM. 207 */ 208 @Argument(fullName = CLIP_REPRESENTATION_LONG_NAME, shortName = CLIP_REPRESENTATION_SHORT_NAME, doc = "How should we actually clip the bases?", optional = true) 209 ClippingRepresentation clippingRepresentation = ClippingRepresentation.WRITE_NS; 210 211 @Argument(fullName=READ_LONG_NAME, shortName = READ_SHORT_NAME, doc="", optional = true) 212 String onlyDoRead = null; 213 214 /** 215 * List of sequence that should be clipped from the reads 216 */ 217 private final List<SeqToClip> sequencesToClip = new ArrayList<>(); 218 219 /** 220 * List of cycle start / stop pairs (0-based, stop is included in the cycle to remove) to clip from the reads 221 */ 222 private List<Pair<Integer, Integer>> cyclesToClip = null; 223 224 /** 225 * Output reads is written to this BAM. 226 */ 227 private SAMFileGATKReadWriter outputBam; 228 229 /** 230 * Accumulator for the stats. 231 */ 232 private ClippingData accumulator; 233 234 /** 235 * Output stream for the stats. 236 */ 237 private PrintStream outputStats; 238 239 /** 240 * The initialize function. 241 */ 242 @Override onTraversalStart()243 public void onTraversalStart() { 244 if (qTrimmingThreshold >= 0) { 245 logger.info(String.format("Creating Q-score clipper with threshold %d", qTrimmingThreshold)); 246 } 247 248 // 249 // Initialize the sequences to clip 250 // 251 if (clipSequencesArgs != null) { 252 int i = 0; 253 for (String toClip : clipSequencesArgs) { 254 i++; 255 ReferenceSequence rs = new ReferenceSequence("CMDLINE-" + i, -1, StringUtil.stringToBytes(toClip)); 256 addSeqToClip(rs.getName(), rs.getBases()); 257 } 258 } 259 260 if (clipSequenceFile != null) { 261 ReferenceSequenceFile rsf = ReferenceSequenceFileFactory.getReferenceSequenceFile(clipSequenceFile.toPath()); 262 263 while (true) { 264 ReferenceSequence rs = rsf.nextSequence(); 265 if (rs == null) 266 break; 267 else { 268 addSeqToClip(rs.getName(), rs.getBases()); 269 } 270 } 271 } 272 273 274 // 275 // Initialize the cycle ranges to clip 276 // 277 if (cyclesToClipArg != null) { 278 cyclesToClip = new ArrayList<>(); 279 for (String range : cyclesToClipArg.split(",")) { 280 try { 281 String[] elts = range.split("-"); 282 int start = Integer.parseInt(elts[0]) - 1; 283 int stop = Integer.parseInt(elts[1]) - 1; 284 285 if (start < 0) throw new Exception(); 286 if (stop < start) throw new Exception(); 287 288 logger.info(String.format("Creating cycle clipper %d-%d", start, stop)); 289 cyclesToClip.add(new MutablePair<>(start, stop)); 290 } catch (Exception e) { 291 throw new RuntimeException("Badly formatted cyclesToClip argument: " + cyclesToClipArg); 292 } 293 } 294 } 295 296 final boolean presorted = EnumSet.of(ClippingRepresentation.WRITE_NS, ClippingRepresentation.WRITE_NS_Q0S, ClippingRepresentation.WRITE_Q0S).contains(clippingRepresentation); 297 outputBam = createSAMWriter(OUTPUT, presorted); 298 299 accumulator = new ClippingData(sequencesToClip); 300 outputStats = STATSOUTPUT == null ? null : new PrintStream(STATSOUTPUT.getOutputStream()); 301 } 302 303 @Override apply( GATKRead read, ReferenceContext ref, FeatureContext featureContext )304 public void apply( GATKRead read, ReferenceContext ref, FeatureContext featureContext ) { 305 if ( onlyDoRead == null || read.getName().equals(onlyDoRead) ) { 306 if ( clippingRepresentation == ClippingRepresentation.HARDCLIP_BASES || clippingRepresentation == ClippingRepresentation.REVERT_SOFTCLIPPED_BASES ) 307 read = ReadClipper.revertSoftClippedBases(read); 308 ReadClipperWithData clipper = new ReadClipperWithData(read, sequencesToClip); 309 310 // 311 // run all three clipping modules 312 // 313 clipBadQualityScores(clipper); 314 clipCycles(clipper); 315 clipSequences(clipper); 316 accumulate(clipper); 317 } 318 } 319 320 @Override onTraversalSuccess()321 public ClippingData onTraversalSuccess(){ 322 if ( outputStats != null ){ 323 outputStats.printf(accumulator.toString()); 324 } 325 return accumulator; 326 } 327 328 @Override closeTool()329 public void closeTool() { 330 if ( outputStats != null ){ 331 outputStats.close(); 332 } 333 if ( outputBam != null ) { 334 outputBam.close(); 335 } 336 } 337 338 /** 339 * Helper function that adds a seq with name and bases (as bytes) to the list of sequences to be clipped 340 * 341 * @param name 342 * @param bases 343 */ addSeqToClip(String name, byte[] bases)344 private void addSeqToClip(String name, byte[] bases) { 345 SeqToClip clip = new SeqToClip(name, bases); 346 sequencesToClip.add(clip); 347 logger.info(String.format("Creating sequence clipper %s: %s/%s", clip.name, clip.seq, clip.revSeq)); 348 } 349 350 351 /** 352 * clip sequences from the reads that match all of the sequences in the global sequencesToClip variable. 353 * Adds ClippingOps for each clip to clipper. 354 * 355 * @param clipper 356 */ clipSequences(ReadClipperWithData clipper)357 private void clipSequences(ReadClipperWithData clipper) { 358 if (sequencesToClip != null) { // don't bother if we don't have any sequences to clip 359 GATKRead read = clipper.getRead(); 360 ClippingData data = clipper.getData(); 361 362 for (SeqToClip stc : sequencesToClip) { 363 // we have a pattern for both the forward and the reverse strands 364 Pattern pattern = read.isReverseStrand() ? stc.revPat : stc.fwdPat; 365 String bases = read.getBasesString(); 366 Matcher match = pattern.matcher(bases); 367 368 // keep clipping until match.find() says it can't find anything else 369 boolean found = true; // go through at least once 370 while (found) { 371 found = match.find(); 372 //System.out.printf("Matching %s against %s/%s => %b%n", bases, stc.seq, stc.revSeq, found); 373 if (found) { 374 int start = match.start(); 375 int stop = match.end() - 1; 376 //ClippingOp op = new ClippingOp(ClippingOp.ClippingType.MATCHES_CLIP_SEQ, start, stop, stc.seq); 377 ClippingOp op = new ClippingOp(start, stop); 378 clipper.addOp(op); 379 data.incSeqClippedBases(stc.seq, op.getLength()); 380 } 381 } 382 } 383 clipper.setData(data); 384 } 385 } 386 387 /** 388 * Convenence function that takes a read and the start / stop clipping positions based on the forward 389 * strand, and returns start/stop values appropriate for the strand of the read. 390 * 391 * @param read 392 * @param start 393 * @param stop 394 * @return 395 */ strandAwarePositions(GATKRead read, int start, int stop)396 private Pair<Integer, Integer> strandAwarePositions(GATKRead read, int start, int stop) { 397 if (read.isReverseStrand()) 398 return new MutablePair<>(read.getLength() - stop - 1, read.getLength() - start - 1); 399 else 400 return new MutablePair<>(start, stop); 401 } 402 403 /** 404 * clip bases at cycles between the ranges in cyclesToClip by adding appropriate ClippingOps to clipper. 405 * 406 * @param clipper 407 */ clipCycles(ReadClipperWithData clipper)408 private void clipCycles(ReadClipperWithData clipper) { 409 if (cyclesToClip != null) { 410 GATKRead read = clipper.getRead(); 411 ClippingData data = clipper.getData(); 412 413 for (Pair<Integer, Integer> p : cyclesToClip) { // iterate over each cycle range 414 int cycleStart = p.getLeft(); 415 int cycleStop = p.getRight(); 416 417 if (cycleStart < read.getLength()) { 418 // only try to clip if the cycleStart is less than the read's length 419 if (cycleStop >= read.getLength()) 420 // we do tolerate [for convenience) clipping when the stop is beyond the end of the read 421 cycleStop = read.getLength() - 1; 422 423 Pair<Integer, Integer> startStop = strandAwarePositions(read, cycleStart, cycleStop); 424 int start = startStop.getLeft(); 425 int stop = startStop.getRight(); 426 427 //ClippingOp op = new ClippingOp(ClippingOp.ClippingType.WITHIN_CLIP_RANGE, start, stop, null); 428 ClippingOp op = new ClippingOp(start, stop); 429 clipper.addOp(op); 430 data.incNRangeClippedBases(op.getLength()); 431 } 432 } 433 clipper.setData(data); 434 } 435 } 436 437 /** 438 * Clip bases from the read in clipper from 439 * <p/> 440 * argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual) 441 * <p/> 442 * to the end of the read. This is blatantly stolen from BWA. 443 * <p/> 444 * Walk through the read from the end (in machine cycle order) to the beginning, calculating the 445 * running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this 446 * sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the 447 * clipping index in the read (from the end). 448 * 449 * @param clipper 450 */ clipBadQualityScores(ReadClipperWithData clipper)451 private void clipBadQualityScores(ReadClipperWithData clipper) { 452 GATKRead read = clipper.getRead(); 453 ClippingData data = clipper.getData(); 454 int readLen = read.getLength(); 455 byte[] quals = read.getBaseQualities(); 456 457 458 int clipSum = 0, lastMax = -1, clipPoint = -1; // -1 means no clip 459 for (int i = readLen - 1; i >= 0; i--) { 460 int baseIndex = read.isReverseStrand() ? readLen - i - 1 : i; 461 byte qual = quals[baseIndex]; 462 clipSum += (qTrimmingThreshold - qual); 463 if (clipSum >= 0 && (clipSum >= lastMax)) { 464 lastMax = clipSum; 465 clipPoint = baseIndex; 466 } 467 } 468 469 if (clipPoint != -1) { 470 int start = read.isReverseStrand() ? 0 : clipPoint; 471 int stop = read.isReverseStrand() ? clipPoint : readLen - 1; 472 //clipper.addOp(new ClippingOp(ClippingOp.ClippingType.LOW_Q_SCORES, start, stop, null)); 473 ClippingOp op = new ClippingOp(start, stop); 474 clipper.addOp(op); 475 data.incNQClippedBases(op.getLength()); 476 } 477 clipper.setData(data); 478 } 479 accumulate(ReadClipperWithData clipper)480 private void accumulate(ReadClipperWithData clipper) { 481 if ( clipper == null ) 482 return; 483 484 GATKRead clippedRead = clipper.clipRead(clippingRepresentation); 485 outputBam.addRead(clippedRead); 486 487 accumulator.nTotalReads++; 488 accumulator.nTotalBases += clipper.getRead().getLength(); 489 if (clipper.wasClipped()) { 490 accumulator.nClippedReads++; 491 accumulator.addData(clipper.getData()); 492 } 493 } 494 495 // -------------------------------------------------------------------------------------------------------------- 496 // 497 // utility classes 498 // 499 // -------------------------------------------------------------------------------------------------------------- 500 501 private static final class SeqToClip { 502 String name; 503 String seq, revSeq; 504 Pattern fwdPat, revPat; 505 SeqToClip(String name, byte[] bytez)506 public SeqToClip(String name, byte[] bytez) { 507 this.name = name; 508 this.seq = new String(bytez); 509 this.fwdPat = Pattern.compile(seq, Pattern.CASE_INSENSITIVE); 510 this.revSeq = new String(BaseUtils.simpleReverseComplement(bytez)); 511 this.revPat = Pattern.compile(revSeq, Pattern.CASE_INSENSITIVE); 512 } 513 } 514 515 public static final class ClippingData { 516 public long nTotalReads = 0; 517 public long nTotalBases = 0; 518 public long nClippedReads = 0; 519 public long nClippedBases = 0; 520 public long nQClippedBases = 0; 521 public long nRangeClippedBases = 0; 522 public long nSeqClippedBases = 0; 523 524 SortedMap<String, Long> seqClipCounts = new TreeMap<>(); 525 ClippingData(List<SeqToClip> clipSeqs)526 public ClippingData(List<SeqToClip> clipSeqs) { 527 for (SeqToClip clipSeq : clipSeqs) { 528 seqClipCounts.put(clipSeq.seq, 0L); 529 } 530 } 531 incNQClippedBases(int n)532 public void incNQClippedBases(int n) { 533 nQClippedBases += n; 534 nClippedBases += n; 535 } 536 incNRangeClippedBases(int n)537 public void incNRangeClippedBases(int n) { 538 nRangeClippedBases += n; 539 nClippedBases += n; 540 } 541 incSeqClippedBases(final String seq, int n)542 public void incSeqClippedBases(final String seq, int n) { 543 nSeqClippedBases += n; 544 nClippedBases += n; 545 seqClipCounts.put(seq, seqClipCounts.get(seq) + n); 546 } 547 addData(ClippingData data)548 public void addData (ClippingData data) { 549 nTotalReads += data.nTotalReads; 550 nTotalBases += data.nTotalBases; 551 nClippedReads += data.nClippedReads; 552 nClippedBases += data.nClippedBases; 553 nQClippedBases += data.nQClippedBases; 554 nRangeClippedBases += data.nRangeClippedBases; 555 nSeqClippedBases += data.nSeqClippedBases; 556 557 for (String seqClip : data.seqClipCounts.keySet()) { 558 Long count = data.seqClipCounts.get(seqClip); 559 if (seqClipCounts.containsKey(seqClip)) 560 count += seqClipCounts.get(seqClip); 561 seqClipCounts.put(seqClip, count); 562 } 563 } 564 toString()565 public String toString() { 566 StringBuilder s = new StringBuilder(); 567 568 s.append(StringUtils.repeat('-', 80) + "\n") 569 .append(String.format("Number of examined reads %d%n", nTotalReads)) 570 .append(String.format("Number of clipped reads %d%n", nClippedReads)) 571 .append(String.format("Percent of clipped reads %.2f%n", (100.0 * nClippedReads) / nTotalReads)) 572 .append(String.format("Number of examined bases %d%n", nTotalBases)) 573 .append(String.format("Number of clipped bases %d%n", nClippedBases)) 574 .append(String.format("Percent of clipped bases %.2f%n", (100.0 * nClippedBases) / nTotalBases)) 575 .append(String.format("Number of quality-score clipped bases %d%n", nQClippedBases)) 576 .append(String.format("Number of range clipped bases %d%n", nRangeClippedBases)) 577 .append(String.format("Number of sequence clipped bases %d%n", nSeqClippedBases)); 578 579 for (Map.Entry<String, Long> elt : seqClipCounts.entrySet()) { 580 s.append(String.format(" %8d clip sites matching %s%n", elt.getValue(), elt.getKey())); 581 } 582 583 s.append(StringUtils.repeat('-', 80) + "\n"); 584 return s.toString(); 585 } 586 } 587 588 public static final class ReadClipperWithData extends ReadClipper { 589 private ClippingData data; 590 ReadClipperWithData(GATKRead read, List<SeqToClip> clipSeqs)591 public ReadClipperWithData(GATKRead read, List<SeqToClip> clipSeqs) { 592 super(read); 593 data = new ClippingData(clipSeqs); 594 } 595 getData()596 public ClippingData getData() { 597 return data; 598 } 599 setData(ClippingData data)600 public void setData(ClippingData data) { 601 this.data = data; 602 } 603 addData(ClippingData data)604 public void addData(ClippingData data) { 605 this.data.addData(data); 606 } 607 } 608 609 610 } 611