1 package shared; 2 3 import java.io.Serializable; 4 import java.util.ArrayList; 5 import java.util.Arrays; 6 7 import align2.QualityTools; 8 import dna.AminoAcid; 9 import stream.Read; 10 import stream.SamLine; 11 import stream.SiteScore; 12 import structures.ByteBuilder; 13 14 /** 15 * Helper class for processes that do inline quality trimming. 16 * @author Brian Bushnell 17 * @date Mar 15, 2013 18 * 19 */ 20 public final class TrimRead implements Serializable { 21 22 /** 23 * 24 */ 25 private static final long serialVersionUID = 8791743639124592480L; 26 main(String[] args)27 public static void main(String[] args){ 28 byte[] bases=args[0].getBytes(); 29 byte[] quals=(args.length<2 ? null : args[1].getBytes()); 30 if(quals!=null){ 31 for(int i=0; i<quals.length; i++){quals[i]-=32;} 32 } 33 byte[] match=(args.length<3 ? null : args[2].getBytes()); 34 float minq=(args.length<4 ? 5 : Float.parseFloat(args[3])); 35 float minE=(float)QualityTools.phredToProbError(minq); 36 Read r=new Read(bases, quals, 1); 37 r.match=match; 38 System.out.println("Before trim:\n"+r.toFastq()+(r.match==null ? "" : "\n"+new String(r.match))); 39 System.out.println(Arrays.toString(r.quality)); 40 TrimRead tr=trim(r, true, true, minq, minE, 1); 41 System.out.println("\nAfter trim:\n"+r.toFastq()+(r.match==null ? "" : "\n"+new String(r.match))); 42 if(r.match==null){ 43 r.match=new byte[r.length()]; 44 for(int i=0; i<r.length(); i++){r.match[i]='m';} 45 } 46 tr.untrim(); 47 System.out.println("\nAfter untrim:\n"+r.toFastq()+(r.match==null ? "" : "\n"+new String(r.match))); 48 } 49 50 // public static TrimRead trim(Read r, boolean trimLeft, boolean trimRight, float trimq, int minlen){ 51 // if(r==null || r.bases==null){return null;} 52 // 53 // final int a, b; 54 // if(optimalMode){ 55 // long packed=testOptimal(r.bases, r.quality, QualityTools.PROB_ERROR[trimq]); 56 // a=trimLeft ? (int)((packed>>32)&0xFFFFFFFFL) : 0; 57 // b=trimRight ? (int)((packed)&0xFFFFFFFFL) : 0; 58 // }else if(windowMode){ 59 // a=0; 60 // b=(trimRight ? testRightWindow(r.bases, r.quality, (byte)trimq, windowLength) : 0); 61 // }else{ 62 // a=(trimLeft ? testLeft(r.bases, r.quality, (byte)trimq) : 0); 63 // b=(trimRight ? testRight(r.bases, r.quality, (byte)trimq) : 0); 64 // } 65 // return (a+b==0 ? null : new TrimRead(r, a, b, trimq, minlen)); 66 // } 67 trim(Read r, boolean trimLeft, boolean trimRight, float trimq, float avgErrorRate, int minlen)68 public static TrimRead trim(Read r, boolean trimLeft, boolean trimRight, 69 float trimq, float avgErrorRate, int minlen){ 70 if(r==null || r.bases==null){return null;} 71 72 final int a, b; 73 if(optimalMode){ 74 long packed=testOptimal(r.bases, r.quality, avgErrorRate); 75 a=trimLeft ? (int)((packed>>32)&0xFFFFFFFFL) : 0; 76 b=trimRight ? (int)((packed)&0xFFFFFFFFL) : 0; 77 }else if(windowMode){ 78 a=0; 79 b=(trimRight ? testRightWindow(r.bases, r.quality, (byte)trimq, windowLength) : 0); 80 }else{ 81 a=(trimLeft ? testLeft(r.bases, r.quality, (byte)trimq) : 0); 82 b=(trimRight ? testRight(r.bases, r.quality, (byte)trimq) : 0); 83 } 84 return (a+b==0 ? null : new TrimRead(r, a, b, trimq, minlen)); 85 } 86 87 /** 88 * @param r Read to trim 89 * @param trimLeft Trim left side 90 * @param trimRight Trim right side 91 * @param trimq Maximum quality to trim 92 * @param minResult Ensure trimmed read is at least this long 93 * @return Number of bases trimmed 94 */ trimFast(Read r, boolean trimLeft, boolean trimRight, float trimq, float avgErrorRate, int minResult)95 public static int trimFast(Read r, boolean trimLeft, boolean trimRight, 96 float trimq, float avgErrorRate, int minResult){ 97 return trimFast(r, trimLeft, trimRight, trimq, avgErrorRate, minResult, false); 98 } 99 100 /** 101 * @param r Read to trim 102 * @param trimLeft Trim left side 103 * @param trimRight Trim right side 104 * @param trimq Maximum quality to trim 105 * @param minResult Ensure trimmed read is at least this long 106 * @return Number of bases trimmed 107 */ trimFast(Read r, boolean trimLeft, boolean trimRight, float trimq, float avgErrorRate, int minResult, boolean trimClip)108 public static int trimFast(Read r, boolean trimLeft, boolean trimRight, 109 float trimq, float avgErrorRate, int minResult, boolean trimClip){ 110 return trimFast(r, trimLeft, trimRight, trimq, avgErrorRate, minResult, 0, trimClip); 111 } 112 113 /** 114 * This method allows a "discardUnder" parameter, so that qtrim=r will still discard 115 * reads that if left-trimmed, would be shorter than the desired minimum length. 116 * It is not presently used. 117 * @param r Read to trim 118 * @param trimLeft Trim left side 119 * @param trimRight Trim right side 120 * @param trimq Maximum quality to trim 121 * @param minResult Ensure trimmed read is at least this long 122 * @param discardUnder Resulting reads shorter than this are not wanted 123 * @return Number of bases trimmed 124 */ trimFast(Read r, boolean trimLeft, boolean trimRight, float trimq, float avgErrorRate, int minResult, int discardUnder, boolean trimClip)125 public static int trimFast(Read r, boolean trimLeft, boolean trimRight, 126 float trimq, float avgErrorRate, int minResult, int discardUnder, boolean trimClip){ 127 // assert(avgErrorRate==(float)QualityTools.phredToProbError(trimq)) : trimq+", "+avgErrorRate+", "+(float)QualityTools.phredToProbError(trimq); 128 final byte[] bases=r.bases, qual=r.quality; 129 if(bases==null || bases.length<1){return 0;} 130 131 // assert(false) : trimLeft+", "+trimRight+", "+trimq+", "+minResult+", "+discardUnder; 132 133 final int len=r.length(); 134 final int a0, b0; 135 final int a, b; 136 if(optimalMode){ 137 long packed=testOptimal(bases, qual, avgErrorRate); 138 a0=(int)((packed>>32)&0xFFFFFFFFL); 139 b0=(int)((packed)&0xFFFFFFFFL); 140 if(trimLeft!=trimRight && discardUnder>0 && len-a0-b0<discardUnder){ 141 a=trimLeft ? len : 0; 142 b=trimRight ? len : 0; 143 }else{ 144 a=trimLeft ? a0 : 0; 145 b=trimRight ? b0 : 0; 146 } 147 // assert(false) : a0+", "+b0+" -> "+a+", "+b; 148 }else if(windowMode){ 149 a=0; 150 b=(trimRight ? testRightWindow(bases, qual, (byte)trimq, windowLength) : 0); 151 }else{ 152 a=(trimLeft ? testLeft(bases, qual, (byte)trimq) : 0); 153 b=(trimRight ? testRight(bases, qual, (byte)trimq) : 0); 154 } 155 return trimByAmount(r, a, b, minResult, trimClip); 156 } 157 untrim(Read r)158 public static boolean untrim(Read r){ 159 if(r==null || r.obj==null){return false;} 160 if(r.obj.getClass()!=TrimRead.class){return false;} 161 TrimRead tr=(TrimRead)r.obj; 162 return tr.untrim(); 163 } 164 165 // public TrimRead(Read r_, boolean trimLeft, boolean trimRight, float trimq_, int minlen_){ 166 // this(r_, (trimLeft ? testLeft(r_.bases, r_.quality, (byte)trimq_) : 0), (trimRight ? testRight(r_.bases, r_.quality, (byte)trimq_) : 0), trimq_, minlen_); 167 // } 168 TrimRead(Read r_, int trimLeft, int trimRight, float trimq_, int minlen_)169 public TrimRead(Read r_, int trimLeft, int trimRight, float trimq_, int minlen_){ 170 minlen_=Tools.max(minlen_, 0); 171 r=r_; 172 bases1=r.bases; 173 qual1=r.quality; 174 trimq=(byte)trimq_; 175 assert(bases1!=null || qual1==null) : "\n"+new String(bases1)+"\n"+new String(qual1)+"\n"; 176 assert(bases1==null || qual1==null || bases1.length==qual1.length) : "\n"+new String(bases1)+"\n"+new String(qual1)+"\n"; 177 int trimmed=trim(trimLeft, trimRight, minlen_); 178 if(trimmed>0){ 179 assert(bases2==null || bases2.length>=minlen_ || bases1.length<minlen_) : bases1.length+", "+bases2.length+", "+minlen_+", "+trimLeft+", "+trimRight; 180 r.bases=bases2; 181 r.quality=qual2; 182 r.obj=this; 183 trimMatch(r); 184 } 185 } 186 187 // /** Trim the left end of the read, from left to right */ 188 // private int trim(final boolean trimLeft, final boolean trimRight, final int minlen){ 189 // final int a, b; 190 // if(optimalMode){ 191 // long packed=testOptimal(bases1, qual1, QualityTools.PROB_ERROR[trimq]); 192 // a=trimLeft ? (int)((packed>>32)&0xFFFFFFFFL) : 0; 193 // b=trimRight ? (int)((packed)&0xFFFFFFFFL) : 0; 194 // }else{ 195 // a=(trimLeft ? testLeft(bases1, qual1, (byte)trimq) : 0); 196 // b=(trimRight ? testRight(bases1, qual1, (byte)trimq) : 0); 197 // } 198 // return trim(a, b, minlen); 199 // } 200 201 /** Trim the left end of the read, from left to right */ trim(int trimLeft, int trimRight, final int minlen)202 private int trim(int trimLeft, int trimRight, final int minlen){ 203 assert(trimLeft>=0 && trimRight>=0) : "trimLeft="+trimLeft+", trimRight="+trimRight+", minlen="+minlen+", len="+bases1.length; 204 assert(trimLeft>0 || trimRight>0) : "trimLeft="+trimLeft+", trimRight="+trimRight+", minlen="+minlen+", len="+bases1.length; 205 final int maxTrim=Tools.min(bases1.length, bases1.length-minlen); 206 if(trimLeft+trimRight>maxTrim){ 207 int excess=trimLeft+trimRight-maxTrim; 208 if(trimLeft>0 && excess>0){ 209 trimLeft=Tools.max(0, trimLeft-excess); 210 excess=trimLeft+trimRight-maxTrim; 211 } 212 if(trimRight>0 && excess>0){ 213 trimRight=Tools.max(0, trimRight-excess); 214 excess=trimLeft+trimRight-maxTrim; 215 } 216 217 } 218 219 leftTrimmed=trimLeft; 220 rightTrimmed=trimRight; 221 final int sum=leftTrimmed+rightTrimmed; 222 223 if(verbose){ 224 System.err.println("leftTrimmed="+leftTrimmed+", rightTrimmed="+rightTrimmed+", sum="+sum); 225 } 226 227 if(sum==0){ 228 bases2=bases1; 229 qual2=qual1; 230 }else{ 231 bases2=KillSwitch.copyOfRange(bases1, trimLeft, bases1.length-trimRight); 232 qual2=((qual1==null || (trimLeft+trimRight>=qual1.length)) ? null : KillSwitch.copyOfRange(qual1, trimLeft, qual1.length-trimRight)); 233 } 234 return sum; 235 } 236 237 /** Trim bases outside of leftLoc and rightLoc, excluding leftLoc and rightLoc */ trimToPosition(Read r, int leftLoc, int rightLoc, int minResultingLength)238 public static int trimToPosition(Read r, int leftLoc, int rightLoc, int minResultingLength){ 239 final int len=r.length(); 240 return trimByAmount(r, leftLoc, len-rightLoc-1, minResultingLength, false); 241 } 242 243 /** Remove non-genetic-code from reads */ trimBadSequence(Read r)244 public static int trimBadSequence(Read r){ 245 final byte[] bases=r.bases, quals=r.quality; 246 if(bases==null){return 0;} 247 final int minGenetic=20; 248 int lastNon=-1; 249 for(int i=0; i<bases.length; i++){ 250 byte b=bases[i]; 251 if(!AminoAcid.isACGTN(b)){lastNon=i;} 252 if(i-lastNon>minGenetic){break;} 253 } 254 if(lastNon>=0){ 255 r.bases=KillSwitch.copyOfRange(bases, lastNon+1, bases.length); 256 if(quals!=null){ 257 r.quality=KillSwitch.copyOfRange(quals, lastNon+1, quals.length); 258 } 259 } 260 return lastNon+1; 261 } 262 263 /** Trim this many bases from each end */ trimByAmount(Read r, int leftTrimAmount, int rightTrimAmount, int minResultingLength)264 public static int trimByAmount(Read r, int leftTrimAmount, int rightTrimAmount, int minResultingLength){ 265 return trimByAmount(r, leftTrimAmount, rightTrimAmount, minResultingLength, false); 266 } 267 268 /** Trim this many bases from each end */ trimByAmount(Read r, int leftTrimAmount, int rightTrimAmount, int minResultingLength, boolean trimClip)269 public static int trimByAmount(Read r, int leftTrimAmount, int rightTrimAmount, int minResultingLength, boolean trimClip){ 270 271 leftTrimAmount=Tools.max(leftTrimAmount, 0); 272 rightTrimAmount=Tools.max(rightTrimAmount, 0); 273 274 //These assertions are unnecessary if the mapping information will never be used or output. 275 // assert(r.match==null) : "TODO: Handle trimming of reads with match strings."; 276 assert(r.sites==null) : "TODO: Handle trimming of reads with SiteScores."; 277 278 if(r.match!=null){ 279 return trimReadWithMatch(r, r.samline, leftTrimAmount, rightTrimAmount, minResultingLength, Integer.MAX_VALUE, trimClip); 280 } 281 282 final byte[] bases=r.bases, qual=r.quality; 283 final int len=(bases==null ? 0 : bases.length), qlen=(qual==null ? 0 : qual.length); 284 if(len<1){return 0;} 285 minResultingLength=Tools.min(len, Tools.max(minResultingLength, 0)); 286 if(leftTrimAmount+rightTrimAmount+minResultingLength>len){ 287 rightTrimAmount=Tools.max(1, len-minResultingLength); 288 leftTrimAmount=0; 289 } 290 291 final int total=leftTrimAmount+rightTrimAmount; 292 if(total>0){ 293 r.bases=KillSwitch.copyOfRange(bases, leftTrimAmount, len-rightTrimAmount); 294 r.quality=(leftTrimAmount+rightTrimAmount>=qlen ? null : KillSwitch.copyOfRange(qual, leftTrimAmount, qlen-rightTrimAmount)); 295 trimMatch(r); 296 if(r.stop>r.start){ //TODO: Fixing mapped coordinates needs more work. 297 r.start+=leftTrimAmount; 298 r.stop-=rightTrimAmount; 299 } 300 } 301 302 if(verbose){ 303 System.err.println("leftTrimmed="+leftTrimAmount+", rightTrimmed="+rightTrimAmount+ 304 ", sum="+total+", final length="+r.length()); 305 } 306 return total; 307 } 308 309 /** Count number of bases that need trimming on each side, and pack into a long */ testOptimal(byte[] bases, byte[] qual, float avgErrorRate)310 public static long testOptimal(byte[] bases, byte[] qual, float avgErrorRate){ 311 if(optimalBias>=0){avgErrorRate=optimalBias;}//Override 312 assert(avgErrorRate>0 && avgErrorRate<=1) : "Average error rate ("+avgErrorRate+") must be between 0 (exclusive) and 1 (inclusive)"; 313 if(bases==null || bases.length==0){return 0;} 314 if(qual==null){return avgErrorRate>=1 ? 0 : ((((long)testLeftN(bases))<<32) | (((long)testRightN(bases))&0xFFFFFFFFL));} 315 316 float maxScore=0; 317 float score=0; 318 int maxLoc=-1; 319 int maxCount=-1; 320 int count=0; 321 322 final float nprob=Tools.max(Tools.min(avgErrorRate*1.1f, 1), NPROB); 323 324 for(int i=0; i<bases.length; i++){ 325 final byte b=bases[i]; 326 byte q=qual[i]; 327 // float probError=(b=='N' ? nprob : ADJUST_QUALITY ? CalcTrueQuality.estimateErrorProbAvg(qual, bases, i) : QualityTools.PROB_ERROR[q]); 328 // float probError=(b=='N' ? nprob : ADJUST_QUALITY ? CalcTrueQuality.estimateErrorProbGeoAvg(qual, bases, i) : QualityTools.PROB_ERROR[q]); 329 // float probError=(b=='N' ? nprob : ADJUST_QUALITY ? CalcTrueQuality.estimateErrorProb2(qual, bases, i) : QualityTools.PROB_ERROR[q]); 330 331 // float probError=(b=='N' ? nprob : q==1 ? PROB1 : QualityTools.PROB_ERROR[q]); 332 // float probError=(b=='N' ? nprob : QualityTools.PROB_ERROR[q]); 333 float probError=((b=='N' || q<1) ? nprob : QualityTools.PROB_ERROR[q]); 334 335 // assert(q>0 || b=='N') : "index "+i+": q="+q+", b="+(char)b+"\n"+new String(bases)+"\n"+Arrays.toString(qual)+"\n"; 336 337 float delta=avgErrorRate-probError; 338 score=score+delta; 339 if(score>0){ 340 count++; 341 if(score>maxScore || (score==maxScore && count>maxCount)){ 342 maxScore=score; 343 maxCount=count; 344 maxLoc=i; 345 } 346 }else{ 347 score=0; 348 count=0; 349 } 350 } 351 352 final int left, right; 353 if(maxScore>0){ 354 assert(maxLoc>=0); 355 assert(maxCount>0); 356 left=maxLoc-maxCount+1; 357 assert(left>=0 && left<=bases.length); 358 right=bases.length-maxLoc-1; 359 }else{ 360 left=0; 361 right=bases.length; 362 } 363 final long packed=((((long)left)<<32) | (((long)right)&0xFFFFFFFFL)); 364 365 if(verbose){ 366 System.err.println(Arrays.toString(qual)); 367 System.err.println("After testLocal: maxScore="+maxScore+", maxLoc="+maxLoc+", maxCount="+maxCount+ 368 ", left="+left+", right="+right+", returning "+Long.toHexString(packed)); 369 } 370 return packed; 371 } 372 373 /** Count number of bases that need trimming on left side */ testLeft(byte[] bases, byte[] qual, final byte trimq)374 private static int testLeft(byte[] bases, byte[] qual, final byte trimq){ 375 if(bases==null || bases.length==0){return 0;} 376 if(qual==null){return trimq<0 ? 0 : testLeftN(bases);} 377 int good=0; 378 int lastBad=-1; 379 int i=0; 380 for(; i<bases.length && good<minGoodInterval; i++){ 381 final byte q=qual[i]; 382 final byte b=bases[i]; 383 assert(q>0 || b=='N') : "index "+i+": q="+q+", b="+(char)b+"\n"+new String(bases)+"\n"+Arrays.toString(qual)+"\n"; 384 if(q>trimq){good++;} 385 else{good=0; lastBad=i;} 386 } 387 if(verbose){ 388 // System.err.println(Arrays.toString(qual)); 389 System.err.println("After testLeft: good="+good+", lastBad="+lastBad+", i="+i+", returning "+(lastBad+1)); 390 // assert(false); 391 } 392 return lastBad+1; 393 } 394 395 /** Count number of bases that need trimming on right side using a sliding window */ testRightWindow(byte[] bases, byte[] qual, final byte trimq, final int window)396 private static int testRightWindow(byte[] bases, byte[] qual, final byte trimq, final int window){ 397 if(bases==null || bases.length==0){return 0;} 398 if(qual==null || qual.length<window){return trimq>0 ? 0 : testRightN(bases);} 399 final int thresh=Tools.max(window*trimq, 1); 400 int sum=0; 401 for(int i=0, j=-window; i<qual.length; i++, j++){ 402 final byte q=qual[i]; 403 sum+=q; 404 if(j>=-1){ 405 if(j>=0){sum-=qual[j];} 406 if(sum<thresh){ 407 return qual.length-j-1; 408 } 409 } 410 } 411 return 0; 412 } 413 414 /** Count number of bases that need trimming on right side */ testRight(byte[] bases, byte[] qual, final byte trimq)415 private static int testRight(byte[] bases, byte[] qual, final byte trimq){ 416 if(bases==null || bases.length==0){return 0;} 417 if(qual==null){return trimq<0 ? 0 : testRightN(bases);} 418 int good=0; 419 int lastBad=bases.length; 420 int i=bases.length-1; 421 for(; i>=0 && good<minGoodInterval; i--){ 422 final byte q=qual[i]; 423 final byte b=bases[i]; 424 assert(q>0 || b=='N') : "index "+i+": q="+q+", b="+(char)b+"\n"+new String(bases)+"\n"+Arrays.toString(qual)+"\n"; 425 if(q>trimq){good++;} 426 else{good=0; lastBad=i;} 427 } 428 if(verbose){ 429 System.err.println("After trimLeft: good="+good+", lastBad="+lastBad+", i="+i+", returning "+(bases.length-lastBad)); 430 } 431 return bases.length-lastBad; 432 } 433 434 /** Count number of bases that need trimming on left side, considering only N as bad */ testLeftN(byte[] bases)435 public static int testLeftN(byte[] bases){ 436 if(bases==null || bases.length==0){return 0;} 437 int good=0; 438 int lastBad=-1; 439 for(int i=0; i<bases.length && good<minGoodInterval; i++){ 440 final byte b=bases[i]; 441 //if(dna.AminoAcid.isFullyDefined(b)){good++;} 442 if(b!=((byte)'N')){good++;} 443 else{good=0; lastBad=i;} 444 } 445 return lastBad+1; 446 } 447 448 /** Count number of bases that need trimming on right side, considering only N as bad */ testRightN(byte[] bases)449 public static int testRightN(byte[] bases){ 450 if(bases==null || bases.length==0){return 0;} 451 int good=0; 452 int lastBad=bases.length; 453 for(int i=bases.length-1; i>=0 && good<minGoodInterval; i--){ 454 final byte b=bases[i]; 455 //if(dna.AminoAcid.isFullyDefined(b)){good++;} 456 if(b!=((byte)'N')){good++;} 457 else{good=0; lastBad=i;} 458 } 459 return bases.length-lastBad; 460 } 461 untrim()462 public boolean untrim(){ 463 if(leftTrimmed==0 && rightTrimmed==0){return false;} 464 r.setPerfect(false); 465 466 final int lt, rt; 467 if(r.strand()==Shared.PLUS){ 468 lt=leftTrimmed; 469 rt=rightTrimmed; 470 }else{ 471 lt=rightTrimmed; 472 rt=leftTrimmed; 473 } 474 475 boolean returnToShort=false; 476 if(verbose){System.err.println("Untrimming");} 477 if(r.match!=null){ 478 if(r.shortmatch()){ 479 r.toLongMatchString(false); 480 returnToShort=true; 481 } 482 byte[] match2=new byte[r.match.length+lt+rt]; 483 int i=0; 484 for(; i<lt; i++){ 485 match2[i]='C'; 486 } 487 for(int j=0; j<r.match.length; i++, j++){ 488 match2[i]=r.match[j]; 489 } 490 for(; i<match2.length; i++){ 491 match2[i]='C'; 492 } 493 r.match=match2; 494 } 495 r.bases=bases1; 496 r.quality=qual1; 497 r.start-=lt; 498 r.stop+=rt; 499 if(returnToShort){ 500 r.toShortMatchString(true); 501 } 502 503 if(r.sites!=null){ 504 for(SiteScore ss : r.sites){untrim(ss);} 505 } 506 507 return true; 508 } 509 untrim(SiteScore ss)510 private boolean untrim(SiteScore ss){ 511 if(ss==null){return false;} 512 if(leftTrimmed==0 && rightTrimmed==0){return false;} 513 ss.perfect=ss.semiperfect=false; 514 515 final int lt, rt; 516 if(ss.strand==Shared.PLUS){ 517 lt=leftTrimmed; 518 rt=rightTrimmed; 519 }else{ 520 lt=rightTrimmed; 521 rt=leftTrimmed; 522 } 523 524 boolean returnToShort=false; 525 if(verbose){System.err.println("Untrimming ss "+ss);} 526 if(ss.match!=null){ 527 528 boolean shortmatch=false; 529 for(byte b : ss.match){ 530 if(Tools.isDigit(b)){shortmatch=true; break;} 531 } 532 533 if(shortmatch){ 534 ss.match=Read.toLongMatchString(ss.match); 535 returnToShort=true; 536 } 537 byte[] match2=new byte[ss.match.length+lt+rt]; 538 int i=0; 539 for(; i<lt; i++){ 540 match2[i]='C'; 541 } 542 for(int j=0; j<ss.match.length; i++, j++){ 543 match2[i]=ss.match[j]; 544 } 545 for(; i<match2.length; i++){ 546 match2[i]='C'; 547 } 548 ss.match=match2; 549 } 550 ss.setLimits(ss.start-lt, ss.stop+rt); 551 if(returnToShort){ss.match=Read.toShortMatchString(ss.match);} 552 return true; 553 } 554 trimMatch(Read r)555 private static boolean trimMatch(Read r){ 556 if(r.match==null && r.sites==null){return false;} 557 558 //Easy mode! 559 r.match=null; 560 if(r.sites!=null){ 561 for(SiteScore ss : r.sites){ 562 if(ss!=null){ss.match=null;} 563 } 564 } 565 return true; 566 567 //TODO - need to adjust read start and stop based on this information. Also check strand! 568 // byte[] match=r.match; 569 // if(r.shortmatch()){match=Read.toLongMatchString(match);} 570 // byte[] match2=new byte[match.length-leftTrimmed-rightTrimmed]; 571 // for(int mpos=0, bpos=0; bpos<leftTrimmed; mpos++){ 572 // byte m=match[mpos]; 573 // if(m=='D'){ 574 // //do nothing 575 // }else{ 576 // bpos++; 577 // } 578 // } 579 } 580 581 /** Special case of 100% match */ trimReadWithMatchFast(final Read r, final SamLine sl, final int leftTrimAmount, final int rightTrimAmount, final int minFinalLength)582 public static int trimReadWithMatchFast(final Read r, final SamLine sl, final int leftTrimAmount, final int rightTrimAmount, final int minFinalLength){ 583 assert(r.match!=null); 584 if(r.bases==null){return 0;} 585 if(leftTrimAmount<1 && rightTrimAmount<1){return 0;} 586 if(leftTrimAmount+rightTrimAmount>=r.length()){return -leftTrimAmount-rightTrimAmount;} 587 588 final boolean shortmatch=r.shortmatch(); 589 final byte[] old=r.match; 590 r.match=null; 591 final int trimmed; 592 if(sl.strand()==Shared.MINUS){ 593 trimmed=trimByAmount(r, rightTrimAmount, leftTrimAmount, minFinalLength, false); 594 }else{ 595 trimmed=trimByAmount(r, leftTrimAmount, rightTrimAmount, minFinalLength, false); 596 } 597 if(trimmed<1){ 598 r.match=old; 599 return 0; 600 } 601 602 ByteBuilder bb=new ByteBuilder(); 603 final int len=r.length(); 604 if(shortmatch){ 605 bb.append((byte)'m'); 606 if(len>1){bb.append(len);} 607 }else{ 608 for(int i=0; i<len; i++){bb.append((byte)'m');} 609 } 610 r.match=bb.toBytes(); 611 bb.clear(); 612 613 if(sl!=null){ 614 sl.pos+=leftTrimAmount; 615 if(sl.cigar!=null){ 616 bb.append(len); 617 bb.append(SamLine.VERSION>1.3 ? '=' : 'm'); 618 sl.cigar=bb.toString(); 619 } 620 sl.seq=r.bases; 621 sl.qual=r.quality; 622 if(trimmed>0 && sl.optional!=null && sl.optional.size()>0){ 623 ArrayList<String> list=new ArrayList<String>(2); 624 for(int i=0; i<sl.optional.size(); i++){ 625 String s=sl.optional.get(i); 626 if(s.startsWith("PG:") || s.startsWith("RG:") || s.startsWith("X") || s.startsWith("Y") || s.startsWith("Z")){list.add(s);} //Only keep safe flags. 627 } 628 sl.optional.clear(); 629 sl.optional.addAll(list); 630 } 631 } 632 return trimmed; 633 } 634 635 //TODO: This is slow 636 //TODO: Note, returns a negative number if the whole read is supposed to be trimmed trimReadWithMatch(final Read r, final SamLine sl, int leftTrimAmount, int rightTrimAmount, int minFinalLength, int scafLen, boolean trimClip)637 public static int trimReadWithMatch(final Read r, final SamLine sl, 638 int leftTrimAmount, int rightTrimAmount, int minFinalLength, int scafLen, boolean trimClip){ 639 if(r.bases==null || (leftTrimAmount<1 && rightTrimAmount<1 && !trimClip)){return 0;} 640 if(!r.containsNonM() && !trimClip){ 641 return trimReadWithMatchFast(r, sl, leftTrimAmount, rightTrimAmount, minFinalLength); 642 } 643 644 // if(r.match==null){ 645 // assert(sl.cigar==null); 646 // int trimmed=trimByAmount(r, leftTrimAmount, rightTrimAmount, minFinalLength, false); 647 // if(sl!=null){ 648 // assert(sl.cigar==null); 649 // int start=sl.pos-1; 650 // int stop=start+sl.seq.length; 651 // sl.seq=r.bases; 652 // sl.qual=r.quality; 653 // if(trimmed>0 && sl.optional!=null && sl.optional.size()>0){ 654 // ArrayList<String> list=new ArrayList<String>(2); 655 // for(int i=0; i<sl.optional.size(); i++){ 656 // String s=sl.optional.get(i); 657 // if(s.startsWith("PG:") || s.startsWith("RG:") || s.startsWith("X") || s.startsWith("Y") || s.startsWith("Z")){list.add(s);} //Only keep safe flags. 658 // } 659 // sl.optional.clear(); 660 // sl.optional.addAll(list); 661 // } 662 // } 663 // } 664 665 if(trimClip){ 666 r.toLongMatchString(false); 667 byte[] match=r.match; 668 int leftClip=0, rightClip=0; 669 for(int i=0; i<match.length; i++){ 670 if(match[i]=='C'){leftClip++;} 671 else{break;} 672 } 673 for(int i=match.length-1; i>=0; i--){ 674 if(match[i]=='C'){rightClip++;} 675 else{break;} 676 } 677 leftTrimAmount=Tools.max(leftTrimAmount, leftClip); 678 rightTrimAmount=Tools.max(rightTrimAmount, rightClip); 679 } 680 681 if(leftTrimAmount<1 && rightTrimAmount<1){return 0;} 682 if(leftTrimAmount+rightTrimAmount>=r.length()){return -leftTrimAmount-rightTrimAmount;} 683 684 final int oldPos=(sl==null ? 1 : sl.pos); 685 686 r.toLongMatchString(false); 687 byte[] match=r.match; 688 689 assert(!r.shortmatch()); 690 691 // System.err.println("Q: "+sl); 692 // System.err.println(new String(match)); 693 694 ByteBuilder bb=new ByteBuilder(match.length); 695 // System.err.println("leftTrimAmount="+leftTrimAmount+", rightTrimAmount="+rightTrimAmount); 696 697 int leftTrim=leftTrimAmount>0 ? r.length() : 0, rightTrim=rightTrimAmount>0 ? r.length() : 0; 698 // System.err.println("leftTrim="+leftTrim+", rightTrim="+rightTrim); 699 { 700 final int mlen=match.length; 701 boolean keep=leftTrimAmount<1; 702 int rpos=(sl==null ? 1 : sl.pos); 703 for(int mpos=0, cpos=0; mpos<mlen; mpos++){ 704 final byte m=match[mpos]; 705 if(m=='m' || m=='S' || m=='V' || m=='N'){ 706 cpos++; 707 rpos++; 708 }else if(m=='D'){ 709 rpos++; 710 }else if(m=='I' || m=='C'){ 711 cpos++; 712 }else if(m=='d'){ 713 rpos++; 714 }else if(m=='i'){ 715 cpos++; 716 }else{ 717 assert(false) : "Unknown symbol "+(char)m; 718 } 719 720 if(keep){ 721 bb.append(m); 722 }else if(cpos>=leftTrimAmount){ 723 byte next=(mpos<mlen-1 ? match[mpos+1] : (byte)'m'); 724 if(m=='m' || m=='S' || m=='V' || m=='N' || next=='m' || next=='S' || next=='V' || next=='N'){ 725 keep=true; 726 if(sl!=null){sl.pos=rpos;} 727 leftTrim=cpos; 728 } 729 } 730 } 731 } 732 // System.err.println("R: "+sl); 733 match=bb.toBytes(); 734 Tools.reverseInPlace(match); 735 bb.clear(); 736 { 737 final int mlen=match.length; 738 boolean keep=rightTrimAmount<1; 739 for(int mpos=0, cpos=0; mpos<mlen; mpos++){ 740 final byte m=match[mpos]; 741 if(m=='m' || m=='S' || m=='V' || m=='N' || m=='I' || m=='C'){ 742 cpos++; 743 }else if(m=='D'){ 744 }else if(m=='d'){ 745 }else if(m=='i'){ 746 cpos++; 747 }else{ 748 assert(false) : "Unknown symbol "+(char)m; 749 } 750 751 if(keep){ 752 bb.append(m); 753 }else if(cpos>=rightTrimAmount){ 754 byte next=(mpos<mlen-1 ? match[mpos+1] : (byte)'m'); 755 // if(m=='m' || m=='S' || m=='V' || m=='N' || next=='m' || next=='S' || next=='V' || next=='N'){ 756 if(next!='I' && next!='D' && next!='i' && next!='d'){ 757 keep=true; 758 rightTrim=cpos; 759 } 760 // } 761 } 762 } 763 } 764 // System.err.println("S: "+sl); 765 match=bb.toBytes(); 766 Tools.reverseInPlace(match); 767 768 // System.err.println("leftTrim="+leftTrim+", rightTrim="+rightTrim); 769 770 if(leftTrim+rightTrim>=r.length()){ 771 if(sl!=null){sl.pos=oldPos;} 772 return -leftTrim-rightTrim; 773 } 774 775 // System.err.println("T: "+sl); 776 r.match=null; 777 if(sl!=null && sl.strand()==Shared.MINUS){ 778 int temp=leftTrim; 779 leftTrim=rightTrim; 780 rightTrim=temp; 781 } 782 final int trimmed=trimByAmount(r, leftTrim, rightTrim, minFinalLength, false); 783 // System.err.println("tba: "+leftTrim+", "+rightTrim+", "+minFinalLength); 784 r.match=match; 785 786 if(sl!=null){ 787 int start=sl.pos-1; 788 int stop=start+Read.calcMatchLength(match)-1; 789 if(SamLine.VERSION>1.3){ 790 sl.cigar=SamLine.toCigar14(match, start, stop, scafLen, r.bases); 791 }else{ 792 sl.cigar=SamLine.toCigar13(match, start, stop, scafLen, r.bases); 793 } 794 sl.seq=r.bases; 795 sl.qual=r.quality; 796 if(trimmed>0 && sl.optional!=null && sl.optional.size()>0){ 797 ArrayList<String> list=new ArrayList<String>(2); 798 for(int i=0; i<sl.optional.size(); i++){ 799 String s=sl.optional.get(i); 800 if(s.startsWith("PG:") || s.startsWith("RG:") || s.startsWith("X") || s.startsWith("Y") || s.startsWith("Z")){list.add(s);} //Only keep safe flags. 801 } 802 sl.optional.clear(); 803 sl.optional.addAll(list); 804 } 805 } 806 return trimmed; 807 } 808 trimmed()809 public int trimmed() { 810 return leftTrimmed+rightTrimmed; 811 } 812 813 public final Read r; 814 815 /** untrimmed bases */ 816 public final byte[] bases1; 817 /** untrimmed qualities */ 818 public final byte[] qual1; 819 /** trimmed bases */ 820 public byte[] bases2; 821 /** trimmed qualities */ 822 public byte[] qual2; 823 824 825 public final float trimq; 826 public int leftTrimmed; 827 public int rightTrimmed; 828 829 /** Require this many consecutive good bases to stop trimming. Minimum is 1. 830 * This is for the old trimming mode and not really used anymore */ 831 public static int minGoodInterval=2; 832 833 public static boolean verbose=false; 834 public static boolean optimalMode=true; 835 public static boolean windowMode=false; 836 public static float optimalBias=-1f; 837 838 public static int windowLength=4; 839 840 private static final float NPROB=0.75f; 841 // public static float PROB1=QualityTools.PROB_ERROR[1]; 842 843 844 } 845