1 package jgi; 2 3 import java.util.Arrays; 4 import java.util.Locale; 5 6 import assemble.Tadpole; 7 import assemble.Tadpole1; 8 import assemble.Tadpole2; 9 import dna.AminoAcid; 10 import shared.KillSwitch; 11 import shared.Shared; 12 import shared.Tools; 13 import stream.Read; 14 import ukmer.Kmer; 15 16 /** 17 * @author Brian Bushnell 18 * @date Apr 15, 2014 19 * 20 */ 21 public final class BBMergeOverlapper { 22 23 static{ 24 if(false && Shared.USE_JNI){//TODO: Disabled! Shared.loadJNI()25 Shared.loadJNI(); 26 } 27 } 28 mateByOverlapJNI(byte[] a_bases, byte[] b_bases, byte[] a_quality, byte[] b_quality, float[] aprob, float[] bprob, int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int margin, int maxMismatches0, int maxMismatches, int minq)29 private static native final int mateByOverlapJNI(byte[] a_bases, byte[] b_bases, byte[] a_quality, byte[] b_quality, 30 float[] aprob, float[] bprob, int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int margin, 31 int maxMismatches0, int maxMismatches, int minq); 32 mateByOverlapRatioJNI_WithQualities(byte[] a_bases, byte[] b_bases, byte[] a_quality, byte[] b_quality, float[] aprob, float[] bprob, int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio, float margin, float offset)33 private static native final int mateByOverlapRatioJNI_WithQualities(byte[] a_bases, byte[] b_bases, byte[] a_quality, byte[] b_quality, 34 float[] aprob, float[] bprob, int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio, 35 float margin, float offset); 36 mateByOverlapRatioJNI(byte[] a_bases, byte[] b_bases, int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio, float margin, float offset, float gIncr, float bIncr)37 private static native final int mateByOverlapRatioJNI(byte[] a_bases, byte[] b_bases, 38 int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio, 39 float margin, float offset, float gIncr, float bIncr); 40 mateByOverlap(Read a, Read b, float[] aprob, float[] bprob, int[] rvector, int minOverlap0, final int minOverlap, final int minInsert0, int margin, final int maxMismatches0, final int maxMismatches, final int minq)41 protected static final int mateByOverlap(Read a, Read b, float[] aprob, float[] bprob, int[] rvector, 42 int minOverlap0, final int minOverlap, final int minInsert0, int margin, final int maxMismatches0, final int maxMismatches, final int minq) { 43 if(rvector==null){rvector=KillSwitch.allocInt1D(5);} 44 final int x; 45 if(false && Shared.USE_JNI){//TODO: Disabled! 46 x=mateByOverlapJNI(a.bases, b.bases, a.quality, b.quality, aprob, bprob, rvector, minOverlap0, minOverlap, minInsert0, margin, maxMismatches0, maxMismatches, minq); 47 }else{ 48 x=mateByOverlapJava_unrolled(a, b, aprob, bprob, rvector, minOverlap0, minOverlap, minInsert0, margin, maxMismatches0, maxMismatches, minq); 49 } 50 return x; 51 } 52 mateByOverlapRatio(Read a, Read b, float[] aprob, float[] bprob, int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int minInsert, final float maxRatio, final float minSecondRatio, final float margin, final float offset, final float gIncr, final float bIncr, boolean useQuality)53 protected static final int mateByOverlapRatio(Read a, Read b, float[] aprob, float[] bprob, int[] rvector, 54 int minOverlap0, int minOverlap, int minInsert0, int minInsert, final float maxRatio, final float minSecondRatio, 55 final float margin, final float offset, final float gIncr, final float bIncr, boolean useQuality) { 56 if(rvector==null){rvector=KillSwitch.allocInt1D(5);} 57 // final boolean swapped; 58 // if(a.length()>b.length()){ 59 // swapped=true; 60 // a.swapBasesWithMate(); 61 // a.reverseComplement(); 62 // b.reverseComplement(); 63 // }else{ 64 // swapped=false; 65 // } 66 67 final int x; 68 if(false && Shared.USE_JNI/* && !useQuality*/){ 69 if(useQuality && a.quality!=null && b.quality!=null){ 70 x=mateByOverlapRatioJNI_WithQualities(a.bases, b.bases, a.quality, b.quality, aprob, bprob, rvector, minOverlap0, minOverlap, minInsert0, minInsert, maxRatio, margin, offset); 71 }else{ 72 x=mateByOverlapRatioJNI(a.bases, b.bases, rvector, minOverlap0, minOverlap, minInsert0, minInsert, maxRatio, margin, offset, gIncr, bIncr); 73 } 74 }else{ 75 if(useQuality && a.quality!=null && b.quality!=null){ 76 x=mateByOverlapRatioJava_WithQualities(a, b, aprob, bprob, rvector, minOverlap0, minOverlap, minInsert0, minInsert, 77 maxRatio, minSecondRatio, margin, offset); 78 }else{ 79 x=mateByOverlapRatioJava(a, b, rvector, minOverlap0, minOverlap, minInsert0, minInsert, maxRatio, minSecondRatio, margin, offset, gIncr, bIncr); 80 } 81 } 82 // if(swapped){ 83 // a.swapBasesWithMate(); 84 // a.reverseComplement(); 85 // b.reverseComplement(); 86 // } 87 return x; 88 } 89 mateByOverlapRatioJava_WithQualities(Read a, Read b, float[] aprob, float[] bprob, int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio, final float minSecondRatio, final float margin, final float offset)90 protected static final int mateByOverlapRatioJava_WithQualities(Read a, Read b, float[] aprob, float[] bprob, int[] rvector, 91 int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio, final float minSecondRatio, 92 final float margin, final float offset) { 93 assert(rvector!=null); 94 assert(margin>=1); 95 minOverlap=Tools.max(4, minOverlap0, minOverlap); 96 minOverlap0=Tools.mid(4, minOverlap0, minOverlap); 97 if(rvector==null){rvector=KillSwitch.allocInt1D(5);} 98 99 final byte[] abases=a.bases, bbases=b.bases, aqual=a.quality, bqual=b.quality; 100 final int alen=abases.length, blen=bbases.length; 101 final int minLength=Tools.min(alen, blen); 102 103 assert(aqual!=null && bqual!=null); 104 { 105 for(int i=0; i<aqual.length; i++){aprob[i]=probCorrect3[aqual[i]];} 106 for(int i=0; i<bqual.length; i++){bprob[i]=probCorrect3[bqual[i]];} 107 } 108 109 { 110 float x=findBestRatio_WithQualities(a, b, aprob, bprob, minOverlap0, minOverlap, minInsert, maxRatio, offset); 111 if(!TAG_CUSTOM && x>maxRatio){ 112 rvector[2]=minLength; 113 rvector[4]=0; 114 return -1; 115 } 116 maxRatio=Tools.min(maxRatio, x); 117 } 118 119 // final float altBadlimit=Tools.max(maxRatio, 0.07f)*2f*alen+2; 120 final float margin2=(margin+offset)/minLength; 121 122 int bestInsert=-1; 123 int bestOverlap=-1; 124 int bestBadInt=-1; 125 float bestBad=minLength; 126 float bestRatio=1; 127 boolean ambig=false; 128 129 float secondBestBad=0; 130 float secondBestRatio=1; 131 int secondBestInsert=0; 132 int secondBestOverlap=0; 133 int secondBestBadInt=-1; 134 135 final float extraMult=(TAG_CUSTOM ? 80 : 1.2f); 136 137 final int largestInsertToTest=(alen+blen-minOverlap0); 138 final int smallestInsertToTest=minInsert0; 139 for(int insert=largestInsertToTest; insert>=smallestInsertToTest; insert--){ 140 if(verbose){System.err.println("d\nTesting read "+a.numericID+", overlap "+insert+", insert "+(alen+blen-insert));} 141 142 float good=0, bad=0; 143 int badInt=0; 144 145 final int istart=(insert<=blen ? 0 : insert-blen); 146 final int jstart=(insert>=blen ? 0 : blen-insert); 147 148 final int overlapLength=Tools.min(alen-istart, blen-jstart, insert); 149 // final float badlimit=extraMult*Tools.max(minSecondRatio, Tools.min(altBadlimit, (Tools.min(bestRatio, maxRatio)))*margin*overlapLength)+1f; 150 // final float badlimit=extraMult*Tools.max(minSecondRatio, Tools.min(bestRatio, maxRatio)*margin*overlapLength)+1f; 151 final float badlimit=extraMult*(Tools.min(bestRatio, maxRatio)*margin*overlapLength)+1f; 152 // final float badlimit=extraMult*Tools.max(minSecondRatio, Tools.min(bestRatio, maxRatio))*margin*overlapLength+1f; 153 154 final int imax=istart+overlapLength; 155 for(int i=istart, j=jstart; i<imax && bad<=badlimit; i++, j++){ 156 assert(i>=0 && i<alen && j>=0 && j<blen) : "\njstart="+jstart+", j="+j+ 157 ", istart="+istart+", i="+i+" \n"+"insert="+insert+", overlap="+overlapLength+", a.length="+a.length()+ 158 ", b.length="+b.length()+", bad="+bad+", badlimit="+badlimit+", good="+good; 159 final byte ca=abases[i], cb=bbases[j]; 160 161 final float x=aprob[i]*bprob[j]; 162 163 if(ca==cb){good+=x;} 164 else{ 165 bad+=x; 166 badInt++; 167 } 168 } 169 170 // if(verbose || true){ 171 // System.err.println("istart="+istart+", jstart="+jstart+", overlapLength="+overlapLength+", overlap="+overlap+", bestOverlap="+bestOverlap); 172 // System.err.println("overlap="+overlap+", bad="+bad+", good="+good); 173 // System.err.println("bestGood="+bestGood+", bestBad="+bestBad); 174 // System.err.println(); 175 // } 176 177 if(bad<=badlimit){ 178 if(bad==0 && good>minOverlap0 && good<minOverlap){ 179 rvector[2]=bestBadInt;//=(int)(bestBad+0.95f); 180 rvector[4]=1; 181 return -1; 182 } 183 184 float ratio=(bad+offset)/overlapLength; 185 // System.err.println("*** ratio="+ratio+", bestRatio="+bestRatio); 186 187 if(ratio<bestRatio*margin){ 188 189 ambig=(ratio*margin>=bestRatio || good<minOverlap); 190 if(ratio<bestRatio){ 191 secondBestInsert=bestInsert; 192 secondBestOverlap=bestOverlap; 193 secondBestBad=bestBad; 194 secondBestRatio=bestRatio; 195 secondBestBadInt=bestBadInt; 196 197 bestInsert=insert; 198 bestOverlap=overlapLength; 199 bestBad=bad; 200 bestRatio=ratio; 201 bestBadInt=badInt; 202 } 203 else if(ratio<secondBestRatio){ 204 secondBestInsert=insert; 205 secondBestOverlap=overlapLength; 206 secondBestBad=bad; 207 secondBestRatio=ratio; 208 secondBestBadInt=badInt; 209 } 210 211 if(!TAG_CUSTOM && ((ambig && bestRatio<margin2) || secondBestRatio<minSecondRatio)){ 212 rvector[2]=bestBadInt;//=(int)(bestBad+0.95f); 213 rvector[4]=1; 214 return -1; 215 } 216 } 217 } 218 } 219 if(secondBestRatio<minSecondRatio){ambig=true;} 220 221 if(TAG_CUSTOM){ 222 if(secondBestRatio<minSecondRatio){ambig=true;} 223 StringBuilder sb=new StringBuilder(a.id); 224 225 sb.append("_bi=").append(bestInsert); 226 sb.append("_bo=").append(bestOverlap); 227 sb.append("_bb=").append(String.format(Locale.ROOT, "%.4f", bestBad)); 228 sb.append("_br=").append(String.format(Locale.ROOT, "%.4f", bestRatio)); 229 sb.append("_bbi=").append(bestBadInt); 230 231 sb.append("_sbi=").append(secondBestInsert); 232 sb.append("_sbo=").append(secondBestOverlap); 233 sb.append("_sbb=").append(String.format(Locale.ROOT, "%.4f", secondBestBad)); 234 sb.append("_sbr=").append(String.format(Locale.ROOT, "%.4f", secondBestRatio)); 235 sb.append("_sbbi=").append(secondBestBadInt); 236 237 a.id=sb.toString(); 238 239 rvector[2]=bestBadInt;//=(int)(bestBad+0.95f); 240 rvector[4]=(ambig ? 1 : 0); 241 242 return (bestInsert<0 ? -1 : bestInsert); 243 } 244 245 if(!ambig && bestRatio>maxRatio){bestInsert=-1;} 246 247 rvector[2]=bestBadInt;//=(int)(bestBad+0.95f); 248 rvector[4]=(ambig ? 1 : 0); 249 250 // System.err.println("***C : "+bestOverlap+", "+ambig+", "+bestBad+", "+(bestOverlap<0 ? -1 : alen+blen-bestOverlap)+", "+ 251 // (bestOverlap<0 ? -1 : (bestOverlap<alen && alen>=blen) ? bestOverlap+alen-blen : bestOverlap)+", "+alen+", "+blen); 252 return (bestInsert<0 ? -1 : bestInsert); 253 } 254 mateByOverlapRatioJava(Read a, Read b, int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio, final float minSecondRatio, final float margin, final float offset, final float gIncr, final float bIncr)255 protected static final int mateByOverlapRatioJava(Read a, Read b, int[] rvector, 256 int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio, final float minSecondRatio, 257 final float margin, final float offset, final float gIncr, final float bIncr) { 258 assert(rvector!=null); 259 assert(margin>=1); 260 minOverlap=Tools.max(4, minOverlap0, minOverlap); 261 minOverlap0=Tools.mid(4, minOverlap0, minOverlap); 262 if(rvector==null){rvector=KillSwitch.allocInt1D(5);} 263 264 final byte[] abases=a.bases, bbases=b.bases; 265 final int alen=abases.length, blen=bbases.length; 266 final int minLength=Tools.min(alen, blen); 267 268 { 269 float x=findBestRatio(a, b, minOverlap0, minOverlap, minInsert, maxRatio, offset, gIncr, bIncr); 270 if(verbose){ 271 System.err.println(x+", "+maxRatio+", "+Arrays.toString(rvector)); 272 } 273 if(x>maxRatio){ 274 rvector[2]=minLength; 275 rvector[4]=0; 276 return -1; 277 } 278 maxRatio=Tools.min(maxRatio, x); 279 } 280 281 // final float altBadlimit=Tools.max(maxRatio, 0.07f)*2f*alen+1; 282 final float margin2=(margin+offset)/minLength; 283 final byte N='N'; 284 285 int bestInsert=-1; 286 int bestOverlap=-1; 287 int bestBadInt=-1; 288 float bestBad=minLength; 289 float bestRatio=1; 290 boolean ambig=false; 291 292 float secondBestBad=0; 293 float secondBestRatio=1; 294 int secondBestInsert=0; 295 int secondBestOverlap=0; 296 int secondBestBadInt=-1; 297 298 final float extraMult=(TAG_CUSTOM ? 80 : 1.2f); 299 300 final int largestInsertToTest=(alen+blen-minOverlap0); 301 final int smallestInsertToTest=minInsert0; 302 for(int insert=largestInsertToTest; insert>=smallestInsertToTest; insert--){ 303 // if(verbose){System.err.println("Testing read "+a.numericID+", overlap "+insert+", insert "+(alen+blen-insert));} 304 305 final int istart=(insert<=blen ? 0 : insert-blen); 306 final int jstart=(insert>=blen ? 0 : blen-insert); 307 final int overlapLength=Tools.min(alen-istart, blen-jstart, insert); 308 309 // final float badlimit=Tools.min(altBadlimit, Tools.min(bestRatio, maxRatio)*margin*overlapLength); 310 final float badlimit=extraMult*(Tools.min(bestRatio, maxRatio)*margin*overlapLength)+1f; 311 float good=0, bad=0; 312 int badInt=0; 313 314 final int imax=istart+overlapLength; 315 for(int i=istart, j=jstart; i<imax && bad<=badlimit; i++, j++){ 316 assert(i>=0 && i<alen && j>=0 && j<blen) : "\njstart="+jstart+", j="+j+ 317 ", istart="+istart+", i="+i+" \n"+"insert="+insert+", overlap="+overlapLength+", a.length="+a.length()+ 318 ", b.length="+b.length()+", bad="+bad+", badlimit="+badlimit+", good="+good; 319 final byte ca=abases[i], cb=bbases[j]; 320 321 if(ca==cb){ 322 if(ca!=N){good+=gIncr;} 323 }else{ 324 bad+=bIncr; 325 badInt++; 326 } 327 } 328 329 // if(verbose || true){ 330 // System.err.println("istart="+istart+", jstart="+jstart+", overlapLength="+overlapLength+", overlap="+overlap+", bestOverlap="+bestOverlap); 331 // System.err.println("overlap="+overlap+", bad="+bad+", good="+good); 332 // System.err.println("bestGood="+bestGood+", bestBad="+bestBad); 333 // System.err.println(); 334 // } 335 336 if(bad<=badlimit){ 337 if(bad==0 && good>minOverlap0 && good<minOverlap){ 338 rvector[2]=bestBadInt;//=(int)(bestBad+0.95f); 339 rvector[4]=1; 340 return -1; 341 } 342 343 float ratio=(bad+offset)/overlapLength; 344 // System.err.println("*** ratio="+ratio+", bestRatio="+bestRatio); 345 346 347 if(ratio<bestRatio*margin){ 348 349 ambig=(ratio*margin>=bestRatio || good<minOverlap); 350 351 if(ratio<bestRatio){ 352 353 if(verbose){ 354 System.err.println("A: Set ambig="+ambig+": "+ratio+"*"+margin+"="+(ratio*margin)+">="+bestRatio+" || "+good+"<"+minOverlap); 355 } 356 secondBestInsert=bestInsert; 357 secondBestOverlap=bestOverlap; 358 secondBestBad=bestBad; 359 secondBestRatio=bestRatio; 360 secondBestBadInt=bestBadInt; 361 362 bestInsert=insert; 363 bestOverlap=overlapLength; 364 bestBad=bad; 365 bestRatio=ratio; 366 bestBadInt=badInt; 367 } 368 else if(ratio<secondBestRatio){ 369 370 if(verbose){ 371 System.err.println("B: Set ambig="+ambig+": "+ratio+"*"+margin+"="+(ratio*margin)+">="+bestRatio+" || "+good+"<"+minOverlap); 372 } 373 secondBestInsert=insert; 374 secondBestOverlap=overlapLength; 375 secondBestBad=bad; 376 secondBestRatio=ratio; 377 secondBestBadInt=badInt; 378 } 379 380 if(!TAG_CUSTOM && ((ambig && bestRatio<margin2) || secondBestRatio<minSecondRatio)){ 381 rvector[2]=bestBadInt;//=(int)(bestBad+0.95f); 382 rvector[4]=1; 383 return -1; 384 } 385 } 386 } 387 } 388 389 if(verbose){ 390 System.err.println("minSecondRatio="+minSecondRatio+", secondBestRatio="+secondBestRatio+", margin="+margin+", bestRatio="+bestRatio+", bestBad="+bestBad); 391 } 392 393 if(TAG_CUSTOM){ 394 if(secondBestRatio<minSecondRatio){ambig=true;} 395 StringBuilder sb=new StringBuilder(a.id); 396 397 sb.append("_bi=").append(bestInsert); 398 sb.append("_bo=").append(bestOverlap); 399 sb.append("_bb=").append(String.format(Locale.ROOT, "%.4f", bestBad)); 400 sb.append("_br=").append(String.format(Locale.ROOT, "%.4f", bestRatio)); 401 sb.append("_bbi=").append(bestBadInt); 402 403 sb.append("_sbi=").append(secondBestInsert); 404 sb.append("_sbo=").append(secondBestOverlap); 405 sb.append("_sbb=").append(String.format(Locale.ROOT, "%.4f", secondBestBad)); 406 sb.append("_sbr=").append(String.format(Locale.ROOT, "%.4f", secondBestRatio)); 407 sb.append("_sbbi=").append(secondBestBadInt); 408 409 a.id=sb.toString(); 410 411 rvector[2]=bestBadInt;//=(int)(bestBad+0.95f); 412 rvector[4]=(ambig ? 1 : 0); 413 414 return (bestInsert<0 ? -1 : bestInsert); 415 } 416 417 if(!ambig && bestRatio>maxRatio){bestInsert=-1;} 418 419 rvector[2]=bestBadInt;//=(int)(bestBad+0.95f); 420 rvector[4]=(ambig ? 1 : 0); 421 422 // System.err.println("***C : "+bestOverlap+", "+ambig+", "+bestBad+", "+(bestOverlap<0 ? -1 : alen+blen-bestOverlap)+", "+ 423 // (bestOverlap<0 ? -1 : (bestOverlap<alen && alen>=blen) ? bestOverlap+alen-blen : bestOverlap)+", "+alen+", "+blen); 424 425 return (bestInsert<0 ? -1 : bestInsert); 426 } 427 findBestRatio_WithQualities(Read a, Read b, final float[] aprob, final float[] bprob, final int minOverlap0, final int minOverlap, final int minInsert, final float maxRatio, final float offset)428 protected static final float findBestRatio_WithQualities(Read a, Read b, final float[] aprob, final float[] bprob, 429 final int minOverlap0, final int minOverlap, final int minInsert, final float maxRatio, final float offset) { 430 final byte[] abases=a.bases, bbases=b.bases; 431 final int alen=abases.length, blen=bbases.length; 432 433 float bestRatio=maxRatio+0.0001f; 434 // final float altBadlimit=Tools.max(maxRatio, 0.07f)*2f*alen+1; 435 final float halfmax=maxRatio*0.5f; 436 437 438 final int largestInsertToTest=(alen+blen-minOverlap); //TODO: test speed with minOverlap0 439 final int smallestInsertToTest=minInsert; 440 for(int insert=largestInsertToTest; insert>=smallestInsertToTest; insert--){ 441 if(verbose){System.err.println("a\nTesting read "+a.numericID+", overlap "+insert+", insert "+(alen+blen-insert));} 442 443 final int istart=(insert<=blen ? 0 : insert-blen); 444 final int jstart=(insert>=blen ? 0 : blen-insert); 445 final int overlapLength=Tools.min(alen-istart, blen-jstart, insert); 446 447 // final float badlimit=(Tools.min(altBadlimit, bestRatio*overlapLength)); 448 final float badlimit=bestRatio*overlapLength; 449 float good=0, bad=0; 450 451 final int imax=istart+overlapLength; 452 for(int i=istart, j=jstart; i<imax && bad<=badlimit; i++, j++){ 453 assert(i>=0 && i<alen && j>=0 && j<blen) : "\njstart="+jstart+", j="+j+ 454 ", istart="+istart+", i="+i+" \n"+"insert="+insert+", overlap="+overlapLength+", a.length="+a.length()+ 455 ", b.length="+b.length()+", bad="+bad+", badlimit="+badlimit+", good="+good; 456 final byte ca=abases[i], cb=bbases[j]; 457 458 final float x=aprob[i]*bprob[j]; 459 460 if(ca==cb){good+=x;} 461 else{bad+=x;} 462 } 463 464 if(bad<=badlimit){ 465 if(bad==0 && good>minOverlap0 && good<minOverlap){ 466 return 100f; 467 } 468 469 float ratio=(bad+offset)/overlapLength; 470 471 if(ratio<bestRatio){ 472 bestRatio=ratio; 473 if(good>=minOverlap && ratio<halfmax){return bestRatio;} 474 } 475 } 476 } 477 478 return bestRatio; 479 } 480 findBestRatio(Read a, Read b, final int minOverlap0, final int minOverlap, final int minInsert, final float maxRatio, final float offset, final float gIncr, final float bIncr)481 protected static final float findBestRatio(Read a, Read b, 482 final int minOverlap0, final int minOverlap, final int minInsert, final float maxRatio, final float offset, final float gIncr, final float bIncr) { 483 final byte[] abases=a.bases, bbases=b.bases; 484 final int alen=abases.length, blen=bbases.length; 485 486 float bestRatio=maxRatio+0.0001f; 487 // final float altBadlimit=Tools.max(maxRatio, 0.07f)*2f*alen+1; 488 final float halfmax=maxRatio*0.5f; 489 final byte N='N'; 490 491 492 final int largestInsertToTest=(alen+blen-minOverlap); //TODO: test speed with minOverlap0 493 final int smallestInsertToTest=minInsert; 494 for(int insert=largestInsertToTest; insert>=smallestInsertToTest; insert--){ 495 // if(verbose){System.err.println("Testing read "+a.numericID+", overlap "+insert+", insert "+(alen+blen-insert));} 496 497 final int istart=(insert<=blen ? 0 : insert-blen); 498 final int jstart=(insert>=blen ? 0 : blen-insert); 499 final int overlapLength=Tools.min(alen-istart, blen-jstart, insert); 500 501 // final float badlimit=(Tools.min(altBadlimit, bestRatio*overlapLength)); 502 final float badlimit=bestRatio*overlapLength; 503 float good=0, bad=0; 504 505 final int imax=istart+overlapLength; 506 for(int i=istart, j=jstart; i<imax && bad<=badlimit; i++, j++){ 507 assert(i>=0 && i<alen && j>=0 && j<blen) : "\njstart="+jstart+", j="+j+ 508 ", istart="+istart+", i="+i+" \n"+"insert="+insert+", overlap="+overlapLength+", a.length="+a.length()+ 509 ", b.length="+b.length()+", bad="+bad+", badlimit="+badlimit+", good="+good; 510 final byte ca=abases[i], cb=bbases[j]; 511 512 if(ca==cb){ 513 if(ca!=N){good+=gIncr;} 514 }else{bad+=bIncr;} 515 } 516 517 if(bad<=badlimit){ 518 if(bad==0 && good>minOverlap0 && good<minOverlap){ 519 return 100f; 520 } 521 522 float ratio=(bad+offset)/overlapLength; 523 524 if(ratio<bestRatio){ 525 bestRatio=ratio; 526 if(good>=minOverlap && ratio<halfmax){return bestRatio;} 527 } 528 } 529 } 530 531 return bestRatio; 532 } 533 mateByOverlapJava_unrolled(Read a, Read b, float[] aprob, float[] bprob, int[] rvector, int minOverlap0, final int minOverlap, final int minInsert0, int margin, final int maxMismatches0, final int maxMismatches, final int minq)534 protected static final int mateByOverlapJava_unrolled(Read a, Read b, float[] aprob, float[] bprob, int[] rvector, 535 int minOverlap0, final int minOverlap, final int minInsert0, int margin, final int maxMismatches0, final int maxMismatches, final int minq) { 536 assert(rvector!=null); 537 minOverlap0=Tools.min(Tools.max(1, minOverlap0), minOverlap); 538 assert(maxMismatches<=maxMismatches0); 539 margin=Tools.max(margin, 0); 540 assert(maxMismatches>=margin); 541 542 final byte[] abases=a.bases, bbases=b.bases; 543 final byte[] aqual=a.quality, bqual=b.quality; 544 final int alen=abases.length, blen=bbases.length; 545 546 int bestOverlap=-1; 547 int bestGood=-1; 548 int bestBad=maxMismatches0; 549 550 boolean ambig=false; 551 final int maxOverlap=alen+blen-Tools.max(minOverlap, minInsert0); 552 // assert(false) : minOverlap+", "+maxOverlap; 553 554 if(aqual!=null && bqual!=null){ 555 for(int i=0; i<aqual.length; i++){aprob[i]=probCorrect3[aqual[i]];} 556 for(int i=0; i<bqual.length; i++){bprob[i]=probCorrect3[bqual[i]];} 557 }else{ 558 for(int i=0; i<alen; i++){aprob[i]=0.98f;} 559 for(int i=0; i<blen; i++){bprob[i]=0.98f;} 560 } 561 562 final float minprob=probCorrect3[Tools.mid(1, minq, 41)]; 563 564 for(int overlap=Tools.max(minOverlap0, 0); overlap<maxOverlap; overlap++){ 565 if(verbose){System.err.println("c\nTesting read "+a.numericID+", overlap "+overlap+", insert "+(alen+blen-overlap));} 566 567 int good=0, bad=0; 568 569 int istart=(overlap<=alen ? 0 : overlap-alen); 570 int jstart=(overlap<=alen ? alen-overlap : 0); 571 572 { 573 final int iters=Tools.min(overlap-istart, blen-istart, alen-jstart); 574 final int imax=istart+iters; 575 final int badlim=bestBad+margin; 576 577 for(int i=istart, j=jstart; i<imax && bad<=badlim; i++, j++){ 578 assert(j>=0 && j<=alen && i>=0 && i<=blen) : "\njstart="+jstart+", j="+j+ 579 ", istart="+istart+", i="+i+" \n"+"overlap="+overlap+", a.length="+alen+ 580 ", b.length="+blen+", bad="+bad+", badlim="+badlim+", good="+good; 581 final byte ca1=abases[j], cb1=bbases[i]; 582 final float pc=aprob[j]*bprob[i]; 583 584 if(pc<=minprob){//do nothing 585 }else if(ca1==cb1){good++;} 586 else{bad++;} 587 } 588 589 if(verbose){ 590 final int overlapLen=(imax-istart); 591 System.err.println("overlapLen="+overlapLen+"; coordinates ("+jstart+"-"+(jstart+overlapLen)+"), ("+istart+"-"+imax+")"); 592 System.err.println(new String(abases, jstart, overlapLen)); 593 System.err.println(new String(bbases, istart, overlapLen)); 594 } 595 596 if(verbose){System.err.println("overlap="+overlap+", bad="+bad+", good="+good+", badlim="+badlim+", bestOverlap="+ 597 bestOverlap+", bestGood="+bestGood+", bestBad="+bestBad+", ambig="+ambig+", mino="+minOverlap+", mino0="+minOverlap0+ 598 ", margin="+margin+", maxMismatches="+maxMismatches);} 599 } 600 601 if(bad*2<good){ 602 if(verbose){System.err.print("a");} 603 if(good>minOverlap){//Candidate 604 if(verbose){System.err.print("b");} 605 if(bad<=bestBad){ 606 607 if(verbose){System.err.print("c");} 608 if(bad<bestBad || (bad==bestBad && good>bestGood)){//Current winner 609 if(verbose){System.err.print("d");} 610 if(bestBad-bad<margin){ambig=true;} 611 bestOverlap=overlap; 612 bestBad=bad; 613 bestGood=good; 614 }else if(bad==bestBad){ 615 if(verbose){System.err.print("e");} 616 ambig=true; 617 } 618 619 if(ambig && bestBad<margin){ 620 if(verbose){System.err.print("f");} 621 rvector[2]=bestBad; 622 rvector[4]=(ambig ? 1 : 0); 623 return -1; 624 } 625 } 626 }else if(bad<margin){ 627 if(verbose){System.err.print("g");} 628 ambig=true; 629 rvector[2]=bestBad; 630 rvector[4]=(ambig ? 1 : 0); 631 return -1; 632 }else{ 633 if(verbose){System.err.print("h");} 634 } 635 } 636 } 637 638 if(!ambig && bestBad>maxMismatches-margin){bestOverlap=-1;} 639 640 rvector[2]=bestBad; 641 rvector[4]=(ambig ? 1 : 0); 642 643 if(verbose){System.err.println("bestOverlap="+ 644 bestOverlap+", bestGood="+bestGood+", bestBad="+bestBad+", ambig="+ambig+", mino="+minOverlap+", mino0="+minOverlap0+ 645 ", margin="+margin+", maxMismatches="+maxMismatches);} 646 647 return (bestOverlap<0 ? -1 : alen+blen-bestOverlap); 648 } 649 650 651 /** 652 * TODO Use this 653 * @param a 654 * @param b 655 * @param overlap 656 * @return 657 */ expectedMismatches(Read a, Read b, int overlap)658 protected static final float expectedMismatches(Read a, Read b, int overlap) { 659 660 final byte[] abases=a.bases, bbases=b.bases, aqual=a.quality, bqual=b.quality; 661 final int alen=abases.length, blen=bbases.length; 662 final int istart=(overlap<=blen ? 0 : overlap-blen); 663 final int jstart=(overlap<=alen ? alen-overlap : 0); 664 665 float expected=0; 666 float actual=0; 667 668 if(aqual==null || bqual==null){return (overlap+0)/16;} 669 670 // System.err.println(istart); 671 // System.err.println(jstart); 672 // System.err.println(); 673 // 674 // System.err.println(a.id); 675 // System.err.println(overlap); 676 // System.err.println(new String(a.bases)); 677 // System.err.println(new String(b.bases)); 678 // System.err.println(); 679 // for(int i=istart, j=jstart; i<overlap && i<alen && j<blen; i++, j++){ 680 // final byte ca=abases[i]; 681 // System.err.print((char)ca); 682 // } 683 // System.err.println(); 684 // for(int i=istart, j=jstart; i<overlap && i<alen && j<blen; i++, j++){ 685 // final byte cb=bbases[j]; 686 // System.err.print((char)cb); 687 // } 688 // System.err.println(); 689 690 for(int i=istart, j=jstart; i<overlap && i<alen && j<blen; i++, j++){ 691 final byte ca=abases[i], cb=bbases[j]; 692 final byte qa=aqual[i], qb=bqual[j]; 693 694 if(ca=='N' || cb=='N'){ 695 //do nothing 696 }else{ 697 assert(AminoAcid.isFullyDefined(ca) && AminoAcid.isFullyDefined(cb)) : 698 "A non-ACGTN base was detected. Please rerun with the flag 'itn'.\n"+(char)ca+", "+(char)cb+"\n"; 699 float probC=probCorrect4[qa]*probCorrect4[qb]; 700 float probE=1-probC; 701 // expected+=Tools.max(0.0005f, probE); 702 expected+=probE; 703 actual+=(ca==cb ? 0 : probC); 704 // assert((probE==1) == (ca=='N' || cb=='N')) : ((char)ca)+", "+((char)cb)+", "+qa+", "+qb+", "+probC+", "+probE; 705 } 706 } 707 708 // System.err.println("*expected: \t"+expected); 709 // System.err.println("*Actual: \t"+actual); 710 // System.err.println(); 711 // 712 // assert(a.id.equals("insert=142 /1") || a.id.equals("insert=263 /1")) : a.id; 713 714 return expected; 715 } 716 717 /** Attempt at quantifying probability of an event like this. 718 * TODO: This returns an incorrect answer if reads are unequal lengths. */ probability(Read a, Read b, int insert)719 protected static final float probability(Read a, Read b, int insert) { 720 final byte[] abases=a.bases, bbases=b.bases, aqual=a.quality, bqual=b.quality; 721 final int alen=abases.length, blen=bbases.length; 722 final int istart=(insert<=blen ? 0 : insert-blen); 723 final int jstart=(insert>=blen ? 0 : blen-insert); 724 725 if(aqual==null || bqual==null){return 1;} 726 727 float probActual=1; 728 float probCommon=1; 729 730 // float expected=0; 731 // float actual=0; 732 // int measuredOverlap=0; 733 734 // assert(false) : "\n"+a.toFastq()+"\n"+b.toFastq()+"\n"+"istart="+istart+", jstart="+jstart+", insert="+insert+", alen="+alen+", blen="+blen; 735 736 for(int i=istart, j=jstart; i<insert && i<alen && j<blen; i++, j++){ 737 final byte ca=abases[i], cb=bbases[j]; 738 final byte qa=aqual[i], qb=bqual[j]; 739 740 if(ca=='N' || cb=='N'){ 741 //do nothing 742 }else{ 743 744 // System.err.println(((char)ca)+", "+((char)cb)+", "+i+", "+j); 745 746 assert(AminoAcid.isFullyDefined(ca) && AminoAcid.isFullyDefined(cb)) : 747 "A non-ACGTN base was detected. Please rerun with the flag 'itn'.\n"+(char)ca+", "+(char)cb+"\n"; 748 float probC=probCorrect4[qa]*probCorrect4[qb]; 749 float probM=probC+(1-probC)*0.25f; //probability of matching 750 float probE=1-probM; 751 752 assert(probM>0) : qa+", "+qb+", "+probC+", "+probM+", "+probE; 753 assert(probE>0) : qa+", "+qb+", "+probC+", "+probM+", "+probE; 754 755 probCommon*=Tools.max(probM, probE); 756 probActual*=(ca==cb ? probM : probE); 757 758 // expected+=probE; 759 // actual+=(ca==cb ? 0 : probM); 760 // measuredOverlap++; 761 } 762 } 763 764 // if(probActual>probCommon){ 765 // System.err.println("expected: \t"+expected); 766 // System.err.println("Actual: \t"+actual); 767 // System.err.println("probCommon: \t"+probCommon); 768 // System.err.println("probActual: \t"+probActual); 769 // System.err.println(); 770 // assert(false) : "\n"+a.toFastq()+"\n"+b.toFastq()+"\n"; 771 // } 772 773 assert(probActual<=probCommon); 774 775 return (float)Math.sqrt(probActual/probCommon); //sqrt is just so people don't need to type so many zeros. 776 } 777 minCoverage(final Read r, final Tadpole tadpole, final int k, int cutoff)778 protected static int minCoverage(final Read r, final Tadpole tadpole, final int k, int cutoff){ 779 if(k<32){ 780 return minCoverage(r, (Tadpole1)tadpole, k, cutoff); 781 }else{ 782 return minCoverage(r, (Tadpole2)tadpole, k, cutoff); 783 } 784 } 785 minCoverage(final Read r, final Tadpole1 tadpole, final int k, int cutoff)786 protected static int minCoverage(final Read r, final Tadpole1 tadpole, final int k, int cutoff){ 787 final byte[] bases=r.bases; 788 if(bases==null || bases.length<k){return cutoff;} 789 790 final int shift=2*k; 791 final int shift2=shift-2; 792 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 793 long kmer=0, rkmer=0; 794 int len=0; 795 int min=cutoff; 796 797 for(int i=0; i<bases.length; i++){ 798 final byte b=bases[i]; 799 final long x=AminoAcid.baseToNumber[b]; 800 final long x2=AminoAcid.baseToComplementNumber[b]; 801 802 //Update kmers 803 kmer=((kmer<<2)|x)&mask; 804 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 805 806 //Handle Ns 807 if(x<0){ 808 len=0; 809 kmer=rkmer=0; 810 }else{len++;} 811 812 if(len>=k){ 813 int cov=tadpole.getCount(kmer, rkmer); 814 min=Tools.min(min, cov); 815 if(min<cutoff){return min;} 816 } 817 } 818 819 return min; 820 } 821 minCoverage(final Read r, final Tadpole2 tadpole, final int k, int cutoff)822 protected static int minCoverage(final Read r, final Tadpole2 tadpole, final int k, int cutoff){ 823 final byte[] bases=r.bases; 824 if(bases==null || bases.length<k){return cutoff;} 825 826 Kmer kmer=new Kmer(k); 827 assert(kmer!=null); 828 int min=cutoff; 829 830 for(int i=0; i<bases.length; i++){ 831 final byte b=bases[i]; 832 833 //Update kmers 834 kmer.addRight(b); 835 836 if(kmer.len>=k){ 837 int cov=tadpole.getCount(kmer); 838 min=Tools.min(min, cov); 839 if(min<cutoff){return min;} 840 } 841 } 842 843 return min; 844 } 845 calcMinOverlapByEntropy(byte[] bases, int k, short[] counts, int minscore)846 protected static int calcMinOverlapByEntropy(byte[] bases, int k, short[] counts, int minscore){ 847 return Tools.max(calcMinOverlapByEntropyTail(bases, k, counts, minscore), calcMinOverlapByEntropyHead(bases, k, counts, minscore)); 848 // return calcMinOverlapByEntropyTail(bases, k, counts, minscore); 849 } 850 calcMinOverlapByEntropyTail(byte[] bases, int k, short[] counts, int minscore)851 protected static int calcMinOverlapByEntropyTail(byte[] bases, int k, short[] counts, int minscore){ 852 final int bits=2*k; 853 final int mask=~((-1)<<(bits)); 854 int kmer=0, len=0, ones=0, twos=0; 855 856 if(counts==null){ 857 counts=localKmerCounts.get(); 858 if(counts==null){ 859 counts=new short[1<<(bits)]; 860 localKmerCounts.set(counts); 861 } 862 } 863 864 Arrays.fill(counts, (short)0); 865 866 for(int i=0, j=bases.length-1; i<bases.length; i++, j--){ 867 if(i<bases.length){ 868 final byte b=bases[j]; 869 if(!AminoAcid.isFullyDefined(b)){ 870 len=0; 871 kmer=0; 872 }else{ 873 len++; 874 final int n=Dedupe.baseToNumber[b]; 875 kmer=((kmer<<2)|n)&mask; 876 877 if(len>=k){ 878 counts[kmer]++; 879 if(counts[kmer]==1){ones++;} 880 else if(counts[kmer]==2){twos++;} 881 if(ones*4+twos>=minscore){return i;} 882 } 883 } 884 } 885 } 886 return bases.length+1; 887 } 888 calcMinOverlapByEntropyHead(byte[] bases, int k, short[] counts, int minscore)889 protected static int calcMinOverlapByEntropyHead(byte[] bases, int k, short[] counts, int minscore){ 890 final int bits=2*k; 891 final int mask=~((-1)<<(bits)); 892 int kmer=0, len=0, ones=0, twos=0; 893 894 if(counts==null){ 895 counts=localKmerCounts.get(); 896 if(counts==null){ 897 counts=new short[1<<(bits)]; 898 localKmerCounts.set(counts); 899 } 900 } 901 902 Arrays.fill(counts, (short)0); 903 904 for(int i=0; i<bases.length; i++){ 905 if(i<bases.length){ 906 final byte b=bases[i]; 907 if(!AminoAcid.isFullyDefined(b)){ 908 len=0; 909 kmer=0; 910 }else{ 911 len++; 912 final int n=Dedupe.baseToNumber[b]; 913 kmer=((kmer<<2)|n)&mask; 914 915 if(len>=k){ 916 counts[kmer]++; 917 if(counts[kmer]==1){ones++;} 918 else if(counts[kmer]==2){twos++;} 919 if(ones*4+twos>=minscore){return i;} 920 } 921 } 922 } 923 } 924 return bases.length+1; 925 } 926 927 private static ThreadLocal<short[]> localKmerCounts=new ThreadLocal<short[]>(); 928 929 private static final int BAD_MULT=6; 930 private static final int GOOD_MULT_1=8; 931 private static final int GOOD_MULT_2=400; 932 933 private static final boolean TAG_CUSTOM=BBMerge.TAG_CUSTOM; 934 935 protected static final boolean verbose=false; 936 937 private static final float[] probCorrect3= 938 {0.000f, 0.251f, 0.369f, 0.499f, 0.602f, 0.684f, 0.749f, 0.800f, 0.842f, 0.874f, 939 0.900f, 0.921f, 0.937f, 0.950f, 0.960f, 0.968f, 0.975f, 0.980f, 0.984f, 0.987f, 940 0.990f, 0.992f, 0.994f, 0.995f, 0.996f, 0.997f, 0.997f, 0.998f, 0.998f, 0.999f, 941 0.999f, 0.999f, 0.999f, 0.999f, 1, 1, 1, 1, 1, 1, 942 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 943 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 944 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 945 946 private static final float[] probCorrect4= 947 {0.0000f, 0.2501f, 0.3690f, 0.4988f, 0.6019f, 0.6838f, 0.7488f, 0.8005f, 0.8415f, 0.8741f, 948 0.9000f, 0.9206f, 0.9369f, 0.9499f, 0.9602f, 0.9684f, 0.9749f, 0.9800f, 0.9842f, 0.9874f, 949 0.9900f, 0.9921f, 0.9937f, 0.9950f, 0.9960f, 0.9968f, 0.9975f, 0.9980f, 0.9984f, 0.9987f, 950 0.9990f, 0.9992f, 0.9994f, 0.9995f, 0.9996f, 0.9997f, 0.9997f, 0.9998f, 0.9998f, 0.9999f, 951 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 952 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f}; 953 954 private static final float[] probCorrect5= 955 {0.20000f, 0.20567f, 0.36904f, 0.49881f, 0.60189f, 0.68377f, 0.74881f, 0.80047f, 0.84151f, 0.87411f, 956 0.90000f, 0.92057f, 0.93690f, 0.94988f, 0.96019f, 0.96838f, 0.97488f, 0.98005f, 0.98415f, 0.98741f, 957 0.99000f, 0.99206f, 0.99369f, 0.99499f, 0.99602f, 0.99684f, 0.99749f, 0.99800f, 0.99842f, 0.99874f, 958 0.99900f, 0.99921f, 0.99937f, 0.99950f, 0.99960f, 0.99968f, 0.99975f, 0.99980f, 0.99984f, 0.99987f, 959 0.99990f, 0.99992f, 0.99994f, 0.99995f, 0.99996f, 0.99997f, 0.99997f, 0.99998f, 0.99998f, 0.99999f, 960 0.99999f, 0.99999f, 0.99999f, 0.99999f, 0.99999f, 0.99999f, 0.99999f, 0.99999f, 0.99999f, 0.99999f}; 961 962 } 963