1 package align2; 2 3 import java.util.ArrayList; 4 import java.util.Arrays; 5 import java.util.HashMap; 6 7 import dna.AminoAcid; 8 import dna.Data; 9 import shared.KillSwitch; 10 import shared.Shared; 11 import shared.Tools; 12 import stream.Read; 13 import stream.SiteScore; 14 import structures.LongM; 15 16 17 /** 18 * Based on Index11f 19 * 20 * 21 * 22 * 23 * @author Brian Bushnell 24 * @date Jul 11, 2012 25 * 26 */ 27 public final class BBIndexPacBio extends AbstractIndex { 28 29 main(String[] args)30 public static void main(String[] args){ 31 32 int k=12; 33 34 for(int i=0; i<args.length; i++){ 35 String s=args[i].toLowerCase(); 36 if(s.contains("=")){ 37 String[] split=s.split("="); 38 String a=split[0]; 39 String b=split[1]; 40 if(a.equals("build") || a.equals("b")){ 41 Data.setGenome(Integer.parseInt(b)); 42 }else if(a.equals("minchrom")){ 43 MINCHROM=Integer.parseInt(b); 44 }else if(a.equals("maxchrom")){ 45 MAXCHROM=Integer.parseInt(b); 46 }else if(a.equals("keylen") || a.equals("k")){ 47 k=Integer.parseInt(b); 48 } 49 } 50 } 51 52 if(MINCHROM==-1){MINCHROM=1;} 53 if(MAXCHROM==-1){ 54 assert(Data.numChroms<=Byte.MAX_VALUE) : "TODO"; 55 MAXCHROM=Data.numChroms; 56 } 57 58 59 System.err.println("Writing build "+Data.GENOME_BUILD+" "+ 60 "BASESPACE index, keylen="+k+", chrom bits="+NUM_CHROM_BITS); 61 62 63 int first=(NUM_CHROM_BITS==0 ? 1 : 0); 64 65 66 Data.sysout.println("Loading index for chunk "+first+"-"+MAXCHROM+", build "+Data.GENOME_BUILD); 67 index=IndexMaker4.makeIndex(Data.GENOME_BUILD, first, MAXCHROM, 68 k, NUM_CHROM_BITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, true, false, index); 69 70 71 System.err.println("Finished all chroms, may still be writing."); 72 } 73 74 BBIndexPacBio(int k_, int minChrom_, int maxChrom_, int kfilter_, MSA msa_)75 public BBIndexPacBio(int k_, int minChrom_, int maxChrom_, int kfilter_, MSA msa_){ 76 super(k_, kfilter_, BASE_HIT_SCORE, minChrom_, maxChrom_, msa_); 77 INV_BASE_KEY_HIT_SCORE=1f/BASE_KEY_HIT_SCORE; 78 INDEL_PENALTY=(BASE_KEY_HIT_SCORE/8)-1; //default (HIT_SCORE/2)-1 79 INDEL_PENALTY_MULT=25; //default 20; penalty for indel length 80 MAX_PENALTY_FOR_MISALIGNED_HIT=BASE_KEY_HIT_SCORE-(1+BASE_KEY_HIT_SCORE/8); 81 SCOREZ_1KEY=Z_SCORE_MULT*KEYLEN; 82 { 83 int cyc=0; 84 for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){cyc+=2;} 85 cycles=cyc; 86 } 87 prescoreArray=new int[cycles]; 88 precountArray=new int[cycles]; 89 } 90 91 /** Load or generate index from minChrom to maxChrom, inclusive, with keylength k. 92 * This range can encompass multiple blocks. 93 * Should only be called once in a process. */ loadIndex(int minChrom, int maxChrom, int k, boolean writeToDisk, boolean diskInvalid)94 public static final synchronized void loadIndex(int minChrom, int maxChrom, int k, boolean writeToDisk, boolean diskInvalid){ 95 if(minChrom<1){minChrom=1;} 96 if(maxChrom>Data.numChroms){maxChrom=Data.numChroms;} 97 assert(minChrom<=maxChrom); 98 Data.sysout.println("Loading index for chunk "+minChrom+"-"+maxChrom+", build "+Data.GENOME_BUILD); 99 index=IndexMaker4.makeIndex(Data.GENOME_BUILD, minChrom, maxChrom, 100 k, NUM_CHROM_BITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, writeToDisk, diskInvalid, index); 101 } 102 103 /** Calculate statistics of index, such as list lengths, and find clumpy keys */ analyzeIndex(int minChrom, int maxChrom, float fractionToExclude, int k)104 public static final synchronized void analyzeIndex(int minChrom, int maxChrom, float fractionToExclude, int k){ 105 assert(lengthHistogram==null); 106 assert(COUNTS==null); 107 108 int KEYSPACE=1<<(2*k); 109 COUNTS=new int[KEYSPACE]; 110 maxChrom=maxChrom(maxChrom); 111 112 HashMap<Integer, LongM> cmap=new HashMap<Integer, LongM>(); 113 114 for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ 115 Block b=index[chrom]; 116 final int[] sites=b.sites; 117 final int[] starts=b.starts; 118 119 for(int key=0; key<KEYSPACE; key++){ 120 121 long clumps=0; 122 123 final int start1=starts[key]; 124 final int stop1=starts[key+1]; 125 final int len1=stop1-start1; 126 COUNTS[key]=(int)Tools.min(Integer.MAX_VALUE, COUNTS[key]+len1); 127 128 if(REMOVE_CLUMPY){ 129 for(int i=start1+1; i<stop1; i++){ 130 int dif=sites[i]-sites[i-1]; 131 assert(dif!=0); 132 if(dif>0 && dif<=CLUMPY_MAX_DIST){ 133 clumps++; 134 } 135 } 136 if(clumps>0){ 137 final int x=Tools.min(key, AminoAcid.reverseComplementBinaryFast(key, k)); 138 final Integer ko=x; 139 LongM lm=cmap.get(ko); 140 if(lm==null){ 141 lm=new LongM(0); 142 cmap.put(ko, lm); 143 } 144 lm.increment(clumps); 145 } 146 } 147 } 148 } 149 150 for(int key=0; key<COUNTS.length; key++){ 151 int rkey=AminoAcid.reverseComplementBinaryFast(key, k); 152 if(key<rkey){ 153 int x=(int)Tools.min(Integer.MAX_VALUE, COUNTS[key]+(long)COUNTS[rkey]); 154 COUNTS[key]=COUNTS[rkey]=x; 155 } 156 } 157 158 if(REMOVE_CLUMPY){ 159 Integer[] keys=cmap.keySet().toArray(new Integer[cmap.size()]); 160 Arrays.sort(keys); 161 162 for(Integer key : keys){ 163 long clumps=cmap.get(key).value(); 164 long len=COUNTS[key]; 165 if((len>CLUMPY_MIN_LENGTH_INDEX && clumps>CLUMPY_FRACTION*len)/* || (len>8*CLUMPY_MIN_LENGTH_INDEX && clumps>.75f*CLUMPY_FRACTION*len)*/){ 166 int rkey=AminoAcid.reverseComplementBinaryFast(key, k); 167 assert(key<=rkey); 168 assert(key==KeyRing.reverseComplementKey(rkey, k)); 169 COUNTS[key]=0; 170 COUNTS[rkey]=0; 171 } 172 } 173 } 174 175 lengthHistogram=Tools.makeLengthHistogram3(COUNTS, 1000, verbose2); 176 177 //if(verbose2){System.err.println("lengthHistogram: "+Arrays.toString(lengthHistogram));} 178 179 if(REMOVE_FREQUENT_GENOME_FRACTION){ 180 181 int lengthLimitIndex=(int)((1-fractionToExclude)*(lengthHistogram.length-1)); 182 int lengthLimitIndex2=(int)((1-fractionToExclude*DOUBLE_SEARCH_THRESH_MULT)*(lengthHistogram.length-1)); 183 184 MAX_USABLE_LENGTH=Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex]); 185 MAX_USABLE_LENGTH2=Tools.max(6*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex2]); 186 187 if(verbose2){System.err.println("MAX_USABLE_LENGTH: "+MAX_USABLE_LENGTH+"\nMAX_USABLE_LENGTH2: "+MAX_USABLE_LENGTH2);} 188 } 189 190 Solver.POINTS_PER_SITE=(int)Math.floor((Solver.BASE_POINTS_PER_SITE*4000f)/Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])); 191 if(Solver.POINTS_PER_SITE==0){Solver.POINTS_PER_SITE=-1;} 192 if(verbose2){System.err.println("POINTS_PER_SITE: "+Solver.POINTS_PER_SITE);} 193 assert(Solver.POINTS_PER_SITE<0) : Solver.POINTS_PER_SITE; 194 } 195 196 197 /** Returns the filename for the block holding this chrom */ 198 public static final String fname(int chrom, int k){ 199 return IndexMaker4.fname(minChrom(chrom), maxChrom(chrom), k, NUM_CHROM_BITS); 200 } 201 202 /** Ensure key offsets are strictly ascending. */ 203 private static boolean checkOffsets(int[] offsets){ 204 for(int i=1; i<offsets.length; i++){ 205 if(offsets[i]<=offsets[i-1]){return false;} 206 } 207 return true; 208 } 209 210 @Deprecated 211 private final static int trimExcessHitLists(int[] keys, int[][] hits){ 212 213 assert(false) : "Needs to be redone because hits are no longer sorted by length."; 214 215 assert(hits.length==keys.length); 216 // assert(false) : "modify this function so that it gives more weight to trimming lists over highly covered baits"; 217 //And also, incorporate the "remove the longest list" function 218 219 final int limit=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])*keys.length; 220 final int limit2=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH2]); 221 final int limit3=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_SHORTEST_LIST_TO_SEARCH]); 222 223 int sum=0; 224 int initialHitCount=0; 225 226 int shortest=Integer.MAX_VALUE-1; 227 int shortest2=Integer.MAX_VALUE; 228 229 for(int i=0; i<keys.length; i++){ 230 int key=keys[i]; 231 int x=COUNTS[key]; 232 sum+=x; 233 initialHitCount+=(x==0 ? 0 : 1); 234 if(x>0){ 235 if(x<shortest2){ 236 shortest2=x; 237 if(shortest2<shortest){ 238 shortest2=shortest; 239 shortest=x; 240 } 241 } 242 } 243 } 244 assert(shortest2>=shortest); 245 if(initialHitCount<MIN_APPROX_HITS_TO_KEEP){return initialHitCount;} 246 if(shortest>limit3 && !SLOW){ 247 for(int i=0; i<hits.length; i++){hits[i]=null;} 248 return 0; 249 } 250 if(sum<=limit && sum/initialHitCount<=limit2){return initialHitCount;} 251 252 Pointer[] ptrs=Pointer.loadMatrix(hits); 253 // ptrs[0].value/=2; 254 // ptrs[ptrs.length-1].value/=2; 255 Arrays.sort(ptrs); 256 257 int finalHitCount=initialHitCount; 258 for(int i=ptrs.length-1; sum>limit || sum/finalHitCount>limit2; i--){ 259 Pointer p=ptrs[i]; 260 sum-=hits[p.key].length; 261 hits[p.key]=null; 262 finalHitCount--; 263 } 264 265 return finalHitCount; 266 } 267 268 /** Remove least useful keys to accelerate search */ 269 public final int trimExcessHitListsByGreedy(int[] offsets, int[] keyScores, int maxHitLists, int[] keys){ 270 271 float[] keyWeights=getKeyWeightArray(keyScores.length); 272 for(int i=0; i<keyScores.length; i++){ 273 keyWeights[i]=keyScores[i]*INV_BASE_KEY_HIT_SCORE; 274 } 275 276 // assert(false) : "modify this function so that it gives more weight to trimming lists over highly covered baits"; 277 //And also, incorporate the "remove the longest list" function 278 279 final int limit=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])*keys.length; 280 final int limit2=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH2]); 281 final int limit3=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_SHORTEST_LIST_TO_SEARCH]); 282 // final int limitS=lengthHistogram[chrom][MAX_SINGLE_LIST_TO_SEARCH]; 283 284 int sum=0; 285 int initialHitCount=0; 286 287 int shortest=Integer.MAX_VALUE-1; 288 int shortest2=Integer.MAX_VALUE; 289 290 // for(int i=0; i<hits.length; i++){ 291 // if(hits[i]!=null && hits[i].length>limitS){hits[i]=null;} 292 // } 293 294 final int[] lengths=getGenericArray(keys.length); 295 296 for(int i=0; i<keys.length; i++){ 297 int key=keys[i]; 298 int x=count(key); 299 lengths[i]=x; 300 sum+=x; 301 initialHitCount+=(x==0 ? 0 : 1); 302 if(x>0){ 303 if(x<shortest2){ 304 shortest2=x; 305 if(shortest2<shortest){ 306 shortest2=shortest; 307 shortest=x; 308 } 309 } 310 } 311 } 312 assert(shortest2>=shortest); 313 if(initialHitCount<MIN_APPROX_HITS_TO_KEEP){return initialHitCount;} 314 if(shortest>limit3 && !SLOW){ 315 for(int i=0; i<keys.length; i++){keys[i]=-1;} 316 return 0; 317 } 318 319 int hitsCount=initialHitCount; 320 int worstValue=Integer.MIN_VALUE; 321 322 while(hitsCount>=MIN_APPROX_HITS_TO_KEEP && (sum>limit || sum/initialHitCount>limit2 || hitsCount>maxHitLists/* || worstValue<0*/)){ 323 final int[] lists=getGreedyListArray(hitsCount); 324 for(int i=0, j=0; j<lists.length; i++){ 325 if(lengths[i]>0){ 326 lists[j]=i; 327 j++; 328 } 329 } 330 331 Solver.findWorstGreedy(offsets, lengths, keyWeights, KEYLEN, lists, greedyReturn); 332 int worstIndex=greedyReturn[0]; 333 int worst=lists[worstIndex]; 334 worstValue=greedyReturn[1]; 335 sum-=lengths[worst]; 336 337 // if(worstValue>0 && (hitsCount<=maxHitLists || lengths[worst]<excessListLimit)){return hitsCount;} 338 if(worstValue>0 || lengths[worst]<SMALL_GENOME_LIST){return hitsCount;} //This line increases accuracy at expense of speed. Lower constant = more accurate, default 0. 339 hitsCount--; 340 lengths[worst]=0; 341 keys[worst]=-1; 342 } 343 return hitsCount; 344 } 345 346 347 private final int getHits(final int[] keys, final int chrom, final int maxLen, final int[] starts, final int[] stops){ 348 int numHits=0; 349 final Block b=index[chrom]; 350 for(int i=0; i<keys.length; i++){ 351 final int key=keys[i]; 352 starts[i]=-1; 353 stops[i]=-1; 354 if(key>=0){ 355 final int len=count(key); 356 if(len>0 && len<maxLen){ 357 final int len2=b.length(key); 358 if(len2>0){ 359 starts[i]=b.starts[key]; 360 stops[i]=starts[i]+len2; 361 numHits++; 362 } 363 } 364 } 365 } 366 return numHits; 367 } 368 369 370 private final int countHits(final int[] keys, final int maxLen, boolean clearBadKeys){ 371 int numHits=0; 372 for(int i=0; i<keys.length; i++){ 373 final int key=keys[i]; 374 if(key>=0){ 375 final int len=count(key); 376 if(len>0 && len<maxLen){ 377 numHits++; 378 }else if(clearBadKeys){ 379 keys[i]=-1; 380 } 381 } 382 } 383 return numHits; 384 } 385 386 387 @Override 388 public final ArrayList<SiteScore> findAdvanced(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsets, long id){ 389 assert(minChrom<=maxChrom && minChrom>=0); 390 ArrayList<SiteScore> result=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, true, id); 391 if(DOUBLE_SEARCH_NO_HIT && (result==null || result.isEmpty())){result=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, false, id);} 392 393 return result; 394 } 395 396 397 public final ArrayList<SiteScore> find(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsetsP, boolean obeyLimits, long id){ 398 399 assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); 400 final int[] keysOriginal=KeyRing.makeKeys(basesP, offsetsP, KEYLEN); 401 int[] keysP=Arrays.copyOf(keysOriginal, keysOriginal.length); 402 403 initialKeys+=offsetsP.length; 404 initialKeyIterations++; 405 406 final int maxLen=(obeyLimits ? MAX_USABLE_LENGTH : MAX_USABLE_LENGTH2); 407 408 int numHits=0; 409 numHits=countHits(keysP, maxLen, true); 410 if(numHits>0){ //TODO: Change these to higher numbers 411 int trigger=(3*keysP.length)/4; 412 if(numHits<20 && numHits<trigger){ 413 for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];} 414 numHits=countHits(keysP, (maxLen*3)/2, true); 415 } 416 if(numHits<18 && numHits<trigger){ 417 for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];} 418 numHits=countHits(keysP, maxLen*2, true); 419 } 420 if(numHits<16 && numHits<trigger){ 421 for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];} 422 numHits=countHits(keysP, maxLen*3, true); 423 } 424 if(numHits<14 && numHits<trigger){ 425 for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];} 426 numHits=countHits(keysP, maxLen*5, true); 427 } 428 } 429 // assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); 430 if(numHits<keysP.length){ 431 int[][] r=shrink2(offsetsP, keysP, keyScoresP); 432 assert(r!=null); 433 if(r!=null){ 434 offsetsP=r[0]; 435 keysP=r[1]; 436 keyScoresP=r[2]; 437 } 438 }else{ 439 assert(shrink2(offsetsP, keysP, keyScoresP)==null); 440 } 441 initialKeys2+=numHits; 442 //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); 443 444 // assert(checkOffsets(offsets)) : Arrays.toString(offsets); 445 if(TRIM_BY_GREEDY && obeyLimits){ 446 int maxLists=Tools.max((int)(HIT_FRACTION_TO_RETAIN*keysP.length), MIN_HIT_LISTS_TO_RETAIN); 447 numHits=trimExcessHitListsByGreedy(offsetsP, keyScoresP, maxLists, keysP); 448 } 449 // System.out.println("After greedy: numHits = "+numHits); 450 451 if(TRIM_BY_TOTAL_SITE_COUNT && obeyLimits){ 452 throw new RuntimeException("Needs to be redone."); 453 // numHits=trimExcessHitLists(keys, hits); 454 } 455 456 if(TRIM_LONG_HIT_LISTS && obeyLimits && numHits>MIN_APPROX_HITS_TO_KEEP){ 457 int cutoffIndex=((int) (HIT_FRACTION_TO_RETAIN*(keysP.length)-0.01f))+(keysP.length-numHits); 458 459 int zeroes=keysP.length-numHits; 460 int altMinIndex=(zeroes+(MIN_HIT_LISTS_TO_RETAIN-1)); 461 cutoffIndex=Tools.max(cutoffIndex, altMinIndex); 462 463 assert(cutoffIndex>0) : cutoffIndex+"\n"+numHits; 464 465 if(cutoffIndex<(keysP.length-1)){ 466 int[] lens=getGenericArray(keysP.length); 467 for(int i=0; i<keysP.length; i++){lens[i]=count(keysP[i]);} 468 Arrays.sort(lens); 469 int cutoff=lens[cutoffIndex]; 470 471 cutoff=Tools.max(lengthHistogram[MIN_INDEX_TO_DROP_LONG_HIT_LIST], cutoff); 472 473 int removed=0; 474 475 for(int i=0; i<keysP.length; i++){ 476 int key=keysP[i]; 477 if(count(key)>cutoff){ 478 keysP[i]=-1; 479 removed++; 480 numHits--; 481 } 482 } 483 } 484 } 485 // assert(checkOffsets(offsets)) : Arrays.toString(offsets); 486 487 final ArrayList<SiteScore> result=new ArrayList<SiteScore>(8); 488 if(numHits<MIN_APPROX_HITS_TO_KEEP){return result;} 489 //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); 490 if(numHits<keysP.length){ 491 int[][] r=shrink2(offsetsP, keysP, keyScoresP); 492 assert(r!=null); 493 if(r!=null){ 494 offsetsP=r[0]; 495 keysP=r[1]; 496 keyScoresP=r[2]; 497 } 498 }else{ 499 assert(shrink2(offsetsP, keysP, keyScoresP)==null); 500 } 501 assert(keysP.length==numHits); 502 //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); 503 //Reverse the offsets for minus-strand mapping, since they are generated based on quality 504 int[] offsetsM=KeyRing.reverseOffsets(offsetsP, KEYLEN, basesP.length); 505 final int[] keysM=KeyRing.reverseComplementKeys(keysP, KEYLEN); 506 507 // assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); 508 // assert(checkOffsets(offsetsM)) : Arrays.toString(offsetsM); 509 510 assert(!USE_EXTENDED_SCORE || (baseScoresP!=null && (qual==null || baseScoresP.length==qual.length))); 511 assert(keyScoresP!=null); 512 assert(keyScoresP.length==offsetsP.length) : keyScoresP.length+", "+offsetsP.length+", "+Arrays.toString(keyScoresP); 513 final byte[] baseScoresM=Tools.reverseAndCopy(baseScoresP, getBaseScoreArray(baseScoresP.length, 1)); 514 final int[] keyScoresM=Tools.reverseAndCopy(keyScoresP, getKeyScoreArray(keyScoresP.length, 1)); 515 final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP); 516 517 assert(offsetsM.length==offsetsP.length); 518 assert(maxQuickScore==maxQuickScore(offsetsM, keyScoresM)); 519 520 /* 521 * bestScores: 522 * 523 * bestScores[0] currentTopScore 524 * bestScores[1] maxHits 525 * bestScores[2] qcutoff 526 * bestScores[3] bestqscore 527 * bestScores[4] maxQuickScore 528 * bestScores[5] perfectsFound 529 */ 530 final int[] bestScores=KillSwitch.allocInt1D(6); 531 532 //This prevents filtering by qscore when a low-quality read only uses a few keys. 533 //In that case, extending is more important. 534 final boolean prescan_qscore=(PRESCAN_QSCORE && numHits>=5); 535 536 int[][] prescanResults=null; 537 int[] precounts=null; 538 int[] prescores=null; 539 540 int hitsCutoff=0; 541 int qscoreCutoff=(int)(MIN_QSCORE_MULT*maxQuickScore); 542 543 boolean allBasesCovered=true; 544 { 545 if(offsetsP[0]!=0){allBasesCovered=false;} 546 else if(offsetsP[offsetsP.length-1]!=(basesP.length-KEYLEN)){allBasesCovered=false;} 547 else{ 548 for(int i=1; i<offsetsP.length; i++){ 549 if(offsetsP[i]>offsetsP[i-1]+KEYLEN){ 550 allBasesCovered=false; 551 break; 552 } 553 } 554 } 555 } 556 557 //TODO I don't understand this logic 558 final boolean pretendAllBasesAreCovered=(allBasesCovered || 559 keysP.length>=keysOriginal.length-4 || 560 (keysP.length>=9 && (offsetsP[offsetsP.length-1]-offsetsP[0]+KEYLEN)>Tools.max(40, (int)(basesP.length*.75f)))); 561 562 // System.err.println(allBasesCovered+"\t"+Arrays.toString(offsetsP)); 563 // assert(allBasesCovered); 564 565 if(prescan_qscore){ 566 prescanResults=prescanAllBlocks(bestScores, 567 keysP, keyScoresP, offsetsP, 568 keysM, keyScoresM, offsetsM, 569 pretendAllBasesAreCovered); 570 571 if(prescanResults!=null){ 572 precounts=prescanResults[0]; 573 prescores=prescanResults[1]; 574 } 575 576 if(bestScores[1]<MIN_APPROX_HITS_TO_KEEP){return result;} 577 if(bestScores[3]<maxQuickScore*MIN_QSCORE_MULT2){return result;} 578 579 if(bestScores[3]>=maxQuickScore && pretendAllBasesAreCovered){ 580 assert(bestScores[3]==maxQuickScore); 581 assert(bestScores[1]==numHits); 582 583 hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, true); 584 qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*DYNAMIC_QSCORE_THRESH_PERFECT)); 585 }else{ 586 hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, false); 587 qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*PRESCAN_QSCORE_THRESH)); 588 } 589 } 590 591 final int maxScore=maxScore(offsetsP, baseScoresP, keyScoresP, basesP.length, true); 592 final boolean fullyDefined=AminoAcid.isFullyDefined(basesP); 593 assert(bestScores[2]<=0) : Arrays.toString(bestScores); 594 595 int cycle=0; 596 for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ 597 if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){ 598 find(keysP, basesP, baseScoresP, keyScoresP, chrom, Shared.PLUS, 599 offsetsP, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined); 600 } 601 if(QUIT_AFTER_TWO_PERFECTS){ 602 if(bestScores[5]>=2){break;} 603 } 604 cycle++; 605 if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){ 606 find(keysM, basesM, baseScoresM, keyScoresM, chrom, Shared.MINUS, 607 offsetsM, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined); 608 } 609 if(QUIT_AFTER_TWO_PERFECTS){ 610 if(bestScores[5]>=2){break;} 611 } 612 cycle++; 613 } 614 615 assert(Read.CHECKSITES(result, basesP, basesM, id, false)); //TODO: Comment out once checked 616 617 return result; 618 } 619 620 /** Search blocks rapidly to find max hits, and perfect sites. May indicate some blocks can be skipped. */ 621 private final int[][] prescanAllBlocks(int[] bestScores, 622 int[] keysP, int[] keyScoresP, int[] offsetsP, 623 int[] keysM, int[] keyScoresM, int[] offsetsM, 624 final boolean allBasesCovered){ 625 626 int[][][] pm=new int[][][] {{keysP, keyScoresP, offsetsP}, {keysM, keyScoresM, offsetsM}}; 627 628 int bestqscore=0; 629 int maxHits=0; 630 int minHitsToScore=MIN_APPROX_HITS_TO_KEEP; 631 632 final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP); 633 634 final int[] counts=precountArray; 635 final int[] scores=prescoreArray; 636 final int[][] ret=prescanReturn; 637 Arrays.fill(counts, keysP.length); 638 Arrays.fill(scores, maxQuickScore); 639 ret[0]=counts; 640 ret[1]=scores; 641 642 int cycle=0; 643 for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ 644 final int baseChrom=baseChrom(chrom); 645 for(int pmi=0; pmi<2; pmi++, cycle++){ 646 647 int[] keys=pm[pmi][0]; 648 int[] keyScores=pm[pmi][1]; 649 int[] offsets=pm[pmi][2]; 650 // int[][] hits=getHitArray(offsets.length); 651 652 int[] starts=startArray; 653 int[] stops=stopArray; 654 int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops); 655 656 if(numHits<minHitsToScore){ 657 scores[cycle]=-9999; 658 counts[cycle]=0; 659 }else{ 660 661 // final int maxQuickScore=maxQuickScore(offsets, keyScores); 662 // System.err.println("maxScore = "+maxScore); 663 664 if(numHits<keys.length){ 665 int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length); 666 if(r!=null){ 667 starts=r[0]; 668 stops=r[1]; 669 offsets=r[2]; 670 keyScores=r[4]; 671 } 672 } 673 674 assert(numHits==offsets.length); 675 assert(numHits==keyScores.length); 676 heap.clear(); 677 final Quad[] triples=tripleStorage; 678 final int[] values=valueArray; 679 680 int[] temp=findMaxQscore2(starts, stops, offsets, keyScores, baseChrom, triples, values, minHitsToScore, true, 681 bestqscore>=maxQuickScore && allBasesCovered); 682 683 scores[cycle]=temp[0]; 684 counts[cycle]=temp[1]; 685 686 bestqscore=Tools.max(temp[0], bestqscore); 687 maxHits=Tools.max(maxHits, temp[1]); 688 if(bestqscore>=maxQuickScore && allBasesCovered){ 689 assert(bestqscore==maxQuickScore); 690 assert(maxHits==keysP.length) : 691 "\nTemp: \t"+Arrays.toString(temp)+", cycle="+cycle+"\n" + 692 "Scores: \t"+Arrays.toString(scores)+ 693 "Counts: \t"+Arrays.toString(counts)+ 694 "bestqscore: \t"+bestqscore+ 695 "maxHits: \t"+maxHits+ 696 "maxQuickScore: \t"+maxQuickScore+ 697 "numHits: \t"+numHits+ 698 "minHitsToScore: \t"+minHitsToScore+ 699 "keys.length: \t"+keys.length; 700 701 minHitsToScore=Tools.max(minHitsToScore, maxHits); 702 703 { 704 //This early exit is optional. Does not seem to impact speed much either way. 705 bestScores[1]=Tools.max(bestScores[1], maxHits); 706 bestScores[3]=Tools.max(bestScores[3], bestqscore); 707 return ret; 708 } 709 } 710 } 711 } 712 } 713 714 bestScores[1]=Tools.max(bestScores[1], maxHits); 715 bestScores[3]=Tools.max(bestScores[3], bestqscore); 716 717 if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;} 718 719 return ret; 720 } 721 722 723 /** Search a single block and strand */ 724 public final ArrayList<SiteScore> find(int[] keys, final byte[] bases, final byte[] baseScores, int[] keyScores, 725 final int chrom, final byte strand, 726 int[] offsets, final boolean obeyLimits, ArrayList<SiteScore> ssl, int[] bestScores, 727 final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){ 728 729 assert(checkOffsets(offsets)) : Arrays.toString(offsets); 730 731 int[] starts=startArray; 732 int[] stops=stopArray; 733 734 int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops); 735 if(numHits<MIN_APPROX_HITS_TO_KEEP){return ssl;} 736 if(USE_SLOWALK3){ 737 if(!RETAIN_BEST_SCORES){Arrays.fill(bestScores, 0);} 738 ssl=slowWalk3(starts, stops, bases, baseScores, keyScores, offsets, chrom, strand, obeyLimits, ssl, bestScores, allBasesCovered, maxScore, fullyDefined); 739 }else{ 740 ssl=slowWalk2(starts, stops, bases, baseScores, keyScores, offsets, chrom, strand, obeyLimits, ssl, fullyDefined); 741 } 742 743 return ssl; 744 } 745 746 /** Compress arrays by removing null/empty lists */ 747 private final int[][] shrink(int[] starts, int[] stops, int[] offsets, int[] keyScores, final int len){ 748 int numHits=0; 749 for(int i=0; i<len; i++){ 750 if(starts[i]>=0){numHits++;} 751 } 752 753 if(numHits==offsets.length){ 754 return null; 755 }else{ 756 int[][] r=shrinkReturn3; 757 int[] starts2=startArray; 758 int[] stops2=stopArray; 759 int[] offsets2=getOffsetArray(numHits); 760 int[] keyScores2=new int[numHits]; 761 762 for(int i=0, j=0; i<len; i++){ 763 if(starts[i]>=0){ 764 starts2[j]=starts[i]; 765 stops2[j]=stops[i]; 766 offsets2[j]=offsets[i]; 767 keyScores2[j]=keyScores[i]; 768 j++; 769 } 770 } 771 r[0]=starts2; 772 r[1]=stops2; 773 r[2]=offsets2; 774 r[4]=keyScores2; 775 return r; 776 } 777 } 778 779 /** Removes "-1" keys. */ 780 private final int[][] shrink2(int[] offsets, int[] keys, int[] keyScores){ 781 782 783 int numHits=0; 784 for(int i=0; i<keys.length; i++){ 785 if(keys[i]>=0){numHits++;} 786 } 787 788 789 assert(checkOffsets(offsets)) : Arrays.toString(offsets); 790 if(numHits==keys.length){ 791 return null; 792 }else{ 793 int[][] r=shrinkReturn2; 794 int[] offsets2=getOffsetArray(numHits); 795 assert(offsets2!=offsets); 796 assert(offsets2.length<offsets.length); 797 int[] keys2=new int[numHits]; 798 int[] keyScores2=new int[numHits]; 799 800 for(int i=0, j=0; i<keys.length; i++){ 801 if(keys[i]>=0){ 802 offsets2[j]=offsets[i]; 803 keys2[j]=keys[i]; 804 keyScores2[j]=keyScores[i]; 805 j++; 806 } 807 } 808 assert(checkOffsets(offsets2)) : "\nnumHits="+numHits+"\n"+Arrays.toString(offsets)+" -> \n"+Arrays.toString(offsets2)+"\n"+ 809 "\n"+Arrays.toString(keys)+" -> \n"+Arrays.toString(keys2)+"\n"; 810 r[0]=offsets2; 811 r[1]=keys2; 812 r[2]=keyScores2; 813 return r; 814 } 815 } 816 817 818 /** This uses a heap to track next column to increment */ 819 private final ArrayList<SiteScore> slowWalk2(int[] starts, int[] stops, final byte[] bases, 820 final byte[] baseScores, int[] keyScores, int[] offsets, 821 final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList<SiteScore> ssl, final boolean fullyDefined){ 822 823 if(SHRINK_BEFORE_WALK){ 824 int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length); 825 if(r!=null){ 826 starts=r[0]; 827 stops=r[1]; 828 offsets=r[2]; 829 keyScores=r[4]; 830 } 831 } 832 833 final int numHits=offsets.length; //After shrink 834 835 assert(numHits==offsets.length); 836 assert(numHits==keyScores.length); 837 838 //// System.out.println("After SHRINK_BEFORE_WALK: numHits = "+hits.length); 839 // Block b=index[baseChrom_]; 840 // int[][] hits=b.getHitLists(starts, stops); 841 // if(SHRINK_BEFORE_WALK){ 842 // Object[] r=shrink(hits, offsets, keyScores); 843 // if(r!=null){ 844 // hits=(int[][])r[0]; 845 // offsets=(int[])r[1]; 846 // keyScores=(int[])r[3]; 847 // } 848 // } 849 // 850 // final int numHits=hits.length; //After shrink 851 852 853 assert(numHits==offsets.length); 854 assert(numHits==keyScores.length); 855 856 assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z)); 857 858 //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc. 859 final int maxScore=maxScore(offsets, baseScores, keyScores, bases.length, true); 860 // final int maxQuickScore=(!USE_EXTENDED_SCORE ? maxScore : maxQuickScore(offsets)); 861 // System.err.println("maxScore = "+maxScore); 862 863 // final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore)); 864 final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore)); 865 // final int minQuickScore=(!USE_EXTENDED_SCORE ? minScore : (int)(maxQuickScore*0.15f)); 866 // final int minScore=(int)(MIN_SCORE_MULT*maxScore); 867 // System.err.println("minScore = "+minScore); 868 869 final int baseChrom=baseChrom(baseChrom_); 870 871 872 // final PriorityQueue<Quad> heap=new PriorityQueue<Quad>(numHits); 873 heap.clear(); 874 // final Quad[] triples=new Quad[numHits]; 875 final Quad[] triples=tripleStorage; 876 877 final Block b=index[baseChrom]; 878 final int[] values=valueArray; 879 final int[] sizes=sizeArray; 880 final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null); 881 882 for(int i=0; i<numHits; i++){ 883 final int[] sites=b.sites; 884 final int start=starts[i]; 885 sizes[i]=b.length(start, stops[i]); 886 assert(sizes[i]>0); 887 888 int a=sites[start]; 889 int a2; 890 if((a&SITE_MASK)>=offsets[i]){ 891 a2=a-offsets[i]; 892 }else{ 893 int ch=numberToChrom(a, baseChrom); 894 int st=numberToSite(a); 895 int st2=Tools.max(st-offsets[i], 0); 896 a2=toNumber(st2, ch); 897 } 898 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); 899 900 Quad t=triples[i]; 901 assert(t!=null) : "Should be using tripleStorage"; 902 assert(i==t.column); 903 t.row=start; 904 t.site=a2; 905 t.list=sites; 906 values[i]=a2; 907 908 heap.add(t); 909 } 910 911 if(ssl==null){ssl=new ArrayList<SiteScore>(8);} 912 913 int currentTopScore=-999999999; 914 915 int cutoff=minScore; 916 917 int maxHits=0; 918 int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP; 919 920 // System.out.println("\nEntering SS loop:"); 921 // System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" + 922 // "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff); 923 // System.out.println(); 924 925 SiteScore prevSS=null; 926 while(!heap.isEmpty()){ 927 Quad t=heap.peek(); 928 final int site=t.site; 929 final int centerIndex=t.column; 930 931 int maxNearbySite=site; 932 933 934 int approxHits=0; 935 936 {//Inner loop 937 final int minsite=site-MAX_INDEL, maxsite=site+MAX_INDEL2; 938 for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){ 939 final int x=values[column]; 940 assert(x==triples[column].site); 941 if(x>=minsite && x<=maxsite){ 942 maxNearbySite=(x>maxNearbySite ? x : maxNearbySite); 943 approxHits++; 944 }else{chances--;} 945 } 946 } 947 948 assert(centerIndex>=0) : centerIndex; 949 950 //I don't remember what this assertion was for or why, but it's causing trouble. 951 //assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; 952 if(approxHits>=approxHitsCutoff){ 953 954 int score; 955 956 int mapStart=site, mapStop=maxNearbySite; 957 958 if(USE_EXTENDED_SCORE){ 959 final int chrom=numberToChrom(site, baseChrom); 960 score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits); 961 if(true/*USE_AFFINE_SCORE*/){ 962 //Correct begin and end positions if they changed. 963 int min=Integer.MAX_VALUE; 964 int max=Integer.MIN_VALUE; 965 for(int i=0; i<locArray.length; i++){ 966 int x=locArray[i]; 967 if(x>-1){ 968 if(x<min){min=x;} 969 if(x>max){max=x;} 970 } 971 } 972 973 // assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger? 974 if(min<0 || max<0){ 975 //Note: This error can trigger if minChrom and maxChrom do not align to block boundaries 976 if(!Shared.anomaly){ 977 Shared.anomaly=true; 978 System.err.println("Anomaly in "+getClass().getName()+".slowWalk: "+ 979 chrom+", "+mapStart+", "+mapStop+", "+centerIndex+", "+ 980 Arrays.toString(locArray)+"\n"+ 981 Arrays.toString(values)+"\n"+ 982 new String(bases)+"\nstrand="+strand+"\n"); 983 System.err.println(); 984 } 985 score=-99999; 986 } 987 988 989 mapStart=toNumber(min, chrom); 990 mapStop=toNumber(max, chrom); 991 992 993 994 // System.err.println("site="+site+", maxNearbySite="+maxNearbySite+", min="+min+", max="+max+ 995 // ", mapStart="+mapStart+", mapStop="+mapStop); 996 997 // if(chrom==17 && absdif(min, 30354420)<2000){ 998 // System.err.println("\n*****\n"); 999 // System.err.println("site="+site+" ("+numberToSite(site)+"), maxNearbySite="+maxNearbySite+ 1000 // " ("+numberToSite(maxNearbySite)+"), min="+min+", max="+max+ 1001 // ", mapStart="+mapStart+", mapStop="+mapStop); 1002 // System.err.println(); 1003 // System.err.println(Arrays.toString(locArray)); 1004 // System.err.println(); 1005 // System.err.println("chrom="+chrom); 1006 // System.err.println("score="+score); 1007 // } 1008 } 1009 }else{ 1010 score=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits); 1011 if(ADD_SCORE_Z){ 1012 int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); 1013 score+=scoreZ; 1014 } 1015 } 1016 1017 1018 // score=score(values, centerIndex, offsets, hits); 1019 // if(ADD_SCORE_Z){ 1020 // int scoreZ=scoreZ2(values, centerIndex, offsets); 1021 // score+=scoreZ; 1022 // } 1023 // 1024 // if(USE_EXTENDED_SCORE){ 1025 // if(score>minQuickScore){ 1026 //// System.out.println(score+" > "+minQuickScore); 1027 // score=extendScore(read, offsets, values, numberToChrom(site, baseChrom), centerIndex, locArray); 1028 // }else{ 1029 //// System.out.print("."); 1030 // score=-1; 1031 // } 1032 // } 1033 1034 1035 // System.err.println("maxScore = "+maxScore); 1036 // System.err.println("hits = "+approxHits+" / "+approxHitsCutoff); 1037 // System.err.println("score = "+score+" / "+cutoff); 1038 1039 if(score>=cutoff){ 1040 1041 // System.err.println("Passed!"); 1042 1043 // System.out.println("approxHits="+approxHits+" / "+approxHitsCutoff); 1044 // System.out.println("score="+score+" / "+cutoff); 1045 // System.out.println("strand="+Gene.strandCodes[strand]); 1046 // System.out.println("center="+values[centerIndex]); 1047 // System.out.println("values="+Arrays.toString(values)); 1048 // extendScore(read, offsets, values, numberToChrom(site, baseChrom), centerIndex); 1049 // System.out.println(); 1050 1051 if(score>currentTopScore){ 1052 // System.err.println("New top score!"); 1053 1054 if(DYNAMICALLY_TRIM_LOW_SCORES){ 1055 1056 maxHits=Tools.max(approxHits, maxHits); 1057 // approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits); 1058 approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits-1); //More sensitive, but slower 1059 1060 cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH)); 1061 1062 // cutoff=Tools.max(cutoff, minScore+(int)((score-minScore)*DYNAMIC_SCORE_THRESH)); 1063 if(USE_EXTENDED_SCORE && score>=maxScore){ 1064 cutoff=Tools.max(cutoff, (int)(score*0.95f)); 1065 } 1066 } 1067 1068 currentTopScore=score; 1069 1070 // System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")"); 1071 } 1072 1073 // final int chrom=numberToChrom(site, baseChrom); 1074 // final int site2=numberToSite(site); 1075 // final int site3=numberToSite(maxNearbySite)+read.length; 1076 1077 final int chrom=numberToChrom(mapStart, baseChrom); 1078 final int site2=numberToSite(mapStart); 1079 final int site3=numberToSite(mapStop)+bases.length-1; 1080 1081 assert(NUM_CHROM_BITS==0 || site2<SITE_MASK-1000) : "chrom="+chrom+", strand="+strand+", site="+site+ 1082 ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length+ 1083 "\n\n"+Arrays.toString(b.getHitList(centerIndex)); 1084 assert(site2<site3) : "chrom="+chrom+", strand="+strand+", site="+site+ 1085 ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length; 1086 1087 1088 //Note: I could also do this as soon as score is calculated. 1089 // if(ADD_SCORE_Z){ 1090 // int scoreZ=scoreZ2(values, centerIndex, offsets); 1091 // score+=scoreZ; 1092 // } 1093 1094 //This block is optional, but tries to eliminate multiple identical alignments 1095 // SiteScore prevSS=(ssl.size()<1 ? null : ssl.get(ssl.size()-1)); 1096 1097 SiteScore ss=null; 1098 final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined; 1099 1100 int[] gapArray=null; 1101 if(site3-site2>=MINGAP+bases.length){ 1102 gapArray=makeGapArray(locArray, mapStart, MINGAP); 1103 if(gapArray!=null){ 1104 int sub=site2-mapStart;//thus site2=mapStart+sub 1105 for(int i=0; i<gapArray.length; i++){ 1106 gapArray[i]+=sub; 1107 } 1108 assert(gapArray[0]==mapStart); 1109 assert(gapArray[gapArray.length-1]==mapStop); 1110 } 1111 assert(false) : Arrays.toString(locArray); 1112 } 1113 1114 if(gapArray==null && prevSS!=null && prevSS.gaps==null && 1115 prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){ 1116 1117 int betterScore=Tools.max(score, prevSS.score); 1118 int minStart=Tools.min(prevSS.start, site2); 1119 int maxStop=Tools.max(prevSS.stop, site3); 1120 final boolean perfect2=USE_EXTENDED_SCORE && prevSS.score==maxScore && fullyDefined; 1121 assert(!USE_EXTENDED_SCORE || perfect2==prevSS.perfect); 1122 1123 boolean shortEnough=(!LIMIT_SUBSUMPTION_LENGTH_TO_2X || (maxStop-minStart<2*bases.length)); 1124 1125 if(prevSS.start==site2 && prevSS.stop==site3){ 1126 prevSS.score=prevSS.quickScore=betterScore; 1127 }else if(SUBSUME_SAME_START_SITES && shortEnough && prevSS.start==site2){ 1128 if(perfect2){ 1129 //do nothing 1130 }else if(perfect1){ 1131 prevSS.setStop(site3); 1132 prevSS.setPerfect(); 1133 }else{ 1134 prevSS.setStop(maxStop); 1135 } 1136 prevSS.score=prevSS.quickScore=betterScore; 1137 }else if(SUBSUME_SAME_STOP_SITES && shortEnough && prevSS.stop==site3){ 1138 if(perfect2){ 1139 //do nothing 1140 }else if(perfect1){ 1141 prevSS.setStart(site2); 1142 prevSS.setPerfect(); 1143 }else{ 1144 prevSS.setStart(minStart); 1145 } 1146 prevSS.score=prevSS.quickScore=betterScore; 1147 }else if(SUBSUME_OVERLAPPING_SITES && shortEnough && (maxStop-minStart<=bases.length+MAX_SUBSUMPTION_LENGTH) 1148 && !perfect1 && !perfect2){ 1149 prevSS.setLimits(minStart, maxStop); 1150 prevSS.score=prevSS.quickScore=betterScore; 1151 }else{ 1152 ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1); 1153 if(!perfect1){ss.setPerfect(bases);} 1154 assert(!perfect1 || ss.stop-ss.start==bases.length-1); 1155 } 1156 assert(!perfect2 || prevSS.stop-prevSS.start==bases.length-1); 1157 }else{ 1158 ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1); 1159 if(!perfect1){ss.setPerfect(bases);} 1160 ss.gaps=gapArray; 1161 if(gapArray!=null){ 1162 System.err.println(ss.toText()+"\t"+Arrays.toString(gapArray)+"\n"+Arrays.toString(locArray)+"\n"); 1163 } 1164 } 1165 1166 if(ss!=null){ 1167 // System.out.println("Added site "+ss.toText()); 1168 ssl.add(ss); 1169 prevSS=ss; 1170 }else{ 1171 // System.out.println("Subsumed site "+new SiteScore(chrom, strand, site2, site3, score).toText()); 1172 } 1173 1174 } 1175 } 1176 1177 while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements 1178 final Quad t2=heap.poll(); 1179 final int row=t2.row+1, col=t2.column; 1180 if(row<stops[col]){ 1181 t2.row=row; 1182 1183 int a=t2.list[row]; 1184 int a2; 1185 if((a&SITE_MASK)>=offsets[col]){ 1186 a2=a-offsets[col]; 1187 1188 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 1189 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ 1190 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 1191 }else{ 1192 int ch=numberToChrom(a, baseChrom); 1193 int st=numberToSite(a); 1194 int st2=Tools.max(st-offsets[col], 0); 1195 a2=toNumber(st2, ch); 1196 1197 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 1198 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ 1199 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 1200 } 1201 1202 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 1203 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ 1204 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 1205 1206 t2.site=a2; 1207 values[col]=a2; 1208 heap.add(t2); 1209 } 1210 if(heap.isEmpty()){break;} 1211 } 1212 1213 } 1214 1215 return ssl; 1216 } 1217 1218 /** This uses a heap to track next column to increment */ 1219 private final ArrayList<SiteScore> slowWalk3(int[] starts, int[] stops, final byte[] bases, 1220 final byte[] baseScores, int[] keyScores, int[] offsets, 1221 final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList<SiteScore> ssl, 1222 int[] bestScores, final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){ 1223 assert(USE_EXTENDED_SCORE); 1224 1225 final int numKeys=offsets.length; //Before shrink 1226 1227 //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc. 1228 final int maxQuickScore=maxQuickScore(offsets, keyScores); 1229 1230 if(SHRINK_BEFORE_WALK){ 1231 int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length); 1232 if(r!=null){ 1233 starts=r[0]; 1234 stops=r[1]; 1235 offsets=r[2]; 1236 keyScores=r[4]; 1237 } 1238 } 1239 1240 final int numHits=offsets.length; //After shrink 1241 1242 1243 assert(numHits==offsets.length); 1244 assert(numHits==keyScores.length); 1245 1246 usedKeys+=numHits; 1247 usedKeyIterations++; 1248 1249 final boolean filter_by_qscore=(FILTER_BY_QSCORE && numKeys>=5); 1250 1251 assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z)); 1252 1253 1254 // final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore)); 1255 final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore)); 1256 final int minQuickScore=(int)(MIN_QSCORE_MULT*maxQuickScore); 1257 1258 final int baseChrom=baseChrom(baseChrom_); 1259 1260 heap.clear(); 1261 1262 final Quad[] triples=tripleStorage; 1263 1264 final int[] values=valueArray; 1265 final int[] sizes=sizeArray; 1266 final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null); 1267 final Block b=index[baseChrom]; 1268 1269 if(ssl==null){ssl=new ArrayList<SiteScore>(8);} 1270 1271 int currentTopScore=bestScores[0]; 1272 int cutoff=Tools.max(minScore, (int)(currentTopScore*DYNAMIC_SCORE_THRESH)); 1273 1274 int qcutoff=Tools.max(bestScores[2], minQuickScore); 1275 int bestqscore=bestScores[3]; 1276 int maxHits=bestScores[1]; 1277 int perfectsFound=bestScores[5]; 1278 assert((currentTopScore>=maxScore) == (perfectsFound>0)) : currentTopScore+", "+maxScore+", "+perfectsFound+", "+maxHits+", "+numHits; 1279 int approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, currentTopScore>=maxScore); 1280 if(approxHitsCutoff>numHits){return ssl;} 1281 1282 final boolean shortCircuit=(allBasesCovered && numKeys==numHits && filter_by_qscore); 1283 1284 if(currentTopScore>=maxScore){ 1285 assert(currentTopScore==maxScore); 1286 qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT)); 1287 } 1288 1289 1290 // assert(false) : "numHits="+numHits+", maxHits="+maxHits+", MIN_APPROX_HITS_TO_KEEP="+MIN_APPROX_HITS_TO_KEEP+", approxHitsCutoff="+approxHitsCutoff+", maxHits="+maxHits; 1291 1292 1293 for(int i=0; i<numHits; i++){ 1294 final int[] sites=b.sites; 1295 final int start=starts[i]; 1296 sizes[i]=b.length(start, stops[i]); 1297 assert(sizes[i]>0); 1298 1299 int a=sites[start]; 1300 int a2; 1301 if((a&SITE_MASK)>=offsets[i]){ 1302 a2=a-offsets[i]; 1303 }else{ 1304 int ch=numberToChrom(a, baseChrom); 1305 int st=numberToSite(a); 1306 int st2=Tools.max(st-offsets[i], 0); 1307 a2=toNumber(st2, ch); 1308 } 1309 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); 1310 1311 Quad t=triples[i]; 1312 assert(t!=null) : "Should be using tripleStorage"; 1313 assert(i==t.column); 1314 t.row=start; 1315 t.site=a2; 1316 t.list=sites; 1317 values[i]=a2; 1318 1319 heap.add(t); 1320 } 1321 1322 // System.out.println("\nEntering SS loop:"); 1323 // System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" + 1324 // "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff); 1325 // System.out.println("maxQuickScore="+maxQuickScore+"\tminQuickScore="+minQuickScore+"\tqcutoff="+qcutoff); 1326 1327 1328 SiteScore prevSS=null; 1329 while(!heap.isEmpty()){ 1330 Quad t=heap.peek(); 1331 final int site=t.site; 1332 final int centerIndex=t.column; 1333 1334 int maxNearbySite=site; 1335 1336 1337 int approxHits=0; 1338 1339 {//Inner loop 1340 final int minsite=site-MAX_INDEL, maxsite=site+MAX_INDEL2; 1341 for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){ 1342 final int x=values[column]; 1343 assert(x==triples[column].site); 1344 if(x>=minsite && x<=maxsite){ 1345 maxNearbySite=(x>maxNearbySite ? x : maxNearbySite); 1346 approxHits++; 1347 }else{chances--;} 1348 } 1349 } 1350 1351 assert(centerIndex>=0) : centerIndex; 1352 1353 //I don't remember what this assertion was for or why, but it's causing trouble. 1354 //assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; 1355 if(approxHits>=approxHitsCutoff){ 1356 1357 int score; 1358 int qscore=(filter_by_qscore ? quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits) : qcutoff); 1359 if(ADD_SCORE_Z){ 1360 int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); 1361 qscore+=scoreZ; 1362 } 1363 1364 int mapStart=site, mapStop=maxNearbySite; 1365 1366 assert(USE_EXTENDED_SCORE); 1367 1368 boolean locArrayValid=false; 1369 if(qscore<qcutoff){ 1370 score=-1; 1371 }else{ 1372 1373 final int chrom=numberToChrom(site, baseChrom); 1374 1375 //TODO Note that disabling the shortCircuit code seems to make things run 2% faster (with identical results). 1376 //However, theoretically, shortCircuit should be far more efficient. Test both ways on cluster and on a larger run. 1377 //May have something to do with compiler loop optimizations. 1378 if(shortCircuit && qscore==maxQuickScore){ 1379 assert(approxHits==numKeys); 1380 score=maxScore; 1381 }else{ 1382 if(verbose){ 1383 System.err.println("Extending "+Arrays.toString(values)); 1384 } 1385 score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits); 1386 locArrayValid=true; 1387 1388 if(verbose){ 1389 System.err.println("score: "+score); 1390 System.err.println("locArray: "+Arrays.toString(locArray)); 1391 } 1392 1393 //Correct begin and end positions if they changed. 1394 int min=Integer.MAX_VALUE; 1395 int max=Integer.MIN_VALUE; 1396 for(int i=0; i<locArray.length; i++){ 1397 int x=locArray[i]; 1398 if(x>-1){ 1399 if(x<min){min=x;} 1400 if(x>max){max=x;} 1401 } 1402 } 1403 1404 if(score>=maxScore){ 1405 assert(min==max && min>-1) : "\n"+score+", "+maxScore+", "+min+", "+max+ 1406 ", "+(max-min)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n"; 1407 } 1408 1409 // assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger? 1410 if(min<0 || max<0){ 1411 if(!Shared.anomaly){ 1412 Shared.anomaly=true; 1413 System.err.println("Anomaly in "+getClass().getName()+".slowWalk: "+ 1414 chrom+", "+mapStart+", "+mapStop+", "+centerIndex+", "+ 1415 Arrays.toString(locArray)+"\n"+ 1416 Arrays.toString(values)+"\n"+ 1417 new String(bases)+"\nstrand="+strand+"\n"); 1418 System.err.println(); 1419 } 1420 score=-99999; 1421 } 1422 1423 //mapStart and mapStop are indices 1424 mapStart=toNumber(min, chrom); 1425 mapStop=toNumber(max, chrom); 1426 1427 if(score>=maxScore){ 1428 assert(mapStop-mapStart==0) : "\n"+score+", "+maxScore+", "+min+", "+max+ 1429 ", "+(max-min)+", "+(mapStop-mapStart)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n"; 1430 } 1431 } 1432 1433 if(score==maxScore){ 1434 qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT)); 1435 approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, true); 1436 } 1437 1438 if(score>=cutoff){ 1439 qcutoff=Tools.max(qcutoff, (int)(qscore*DYNAMIC_QSCORE_THRESH)); 1440 bestqscore=Tools.max(qscore, bestqscore); 1441 } 1442 } 1443 1444 if(score>=cutoff){ 1445 1446 if(score>currentTopScore){ 1447 // System.err.println("New top score!"); 1448 1449 if(DYNAMICALLY_TRIM_LOW_SCORES){ 1450 1451 maxHits=Tools.max(approxHits, maxHits); 1452 approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, approxHitsCutoff, currentTopScore>=maxScore); 1453 1454 cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH)); 1455 if(score>=maxScore){ 1456 assert(USE_EXTENDED_SCORE); 1457 cutoff=Tools.max(cutoff, (int)(score*0.95f)); 1458 } 1459 } 1460 1461 currentTopScore=score; 1462 1463 // System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")"); 1464 } 1465 1466 final int chrom=numberToChrom(mapStart, baseChrom); 1467 final int site2=numberToSite(mapStart); 1468 final int site3=numberToSite(mapStop)+bases.length-1; 1469 1470 assert(NUM_CHROM_BITS==0 || site2<SITE_MASK-1000) : "chrom="+chrom+", strand="+strand+", site="+site+ 1471 ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length+ 1472 "\n\n"+Arrays.toString(b.getHitList(centerIndex)); assert(site2<site3) : R+chrom+R+strand+R+site+ R+maxNearbySite+R+site2+R+site3+R+bases.length; int[] gapArray=null; if(site3-site2>=MINGAP+bases.length)1473 assert(site2<site3) : "chrom="+chrom+", strand="+strand+", site="+site+ 1474 ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length; 1475 1476 1477 int[] gapArray=null; 1478 if(site3-site2>=MINGAP+bases.length){ 1479 assert(locArrayValid) : "Loc array was not filled."; 1480 // System.err.println("****\n"+Arrays.toString(locArray)+"\n"); 1481 // int[] clone=locArray.clone(); 1482 gapArray=makeGapArray(locArray, site2, MINGAP); 1483 if(gapArray!=null){ 1484 // System.err.println(Arrays.toString(locArray)+"\n"); 1485 // System.err.println(Arrays.toString(gapArray)); 1486 // 1487 //// int sub=site2-mapStart;//thus site2=mapStart+sub 1488 //// for(int i=0; i<gapArray.length; i++){ 1489 //// gapArray[i]+=sub; 1490 //// } 1491 //// System.err.println(Arrays.toString(gapArray)); 1492 // 1493 // System.err.println(mapStart+" -> "+site2); 1494 // System.err.println(mapStop+" -> "+site3); 1495 1496 assert(gapArray[0]>=site2 && gapArray[0]-site2<bases.length); 1497 assert(gapArray[gapArray.length-1]<=site3 && site3-gapArray[gapArray.length-1]<bases.length) : "\n"+ 1498 mapStart+" -> "+site2+"\n"+ 1499 mapStop+" -> "+site3+"\n\n"+ 1500 Arrays.toString(gapArray)+"\n\n"+ 1501 // Arrays.toString(clone)+"\n\n"+ 1502 Arrays.toString(locArray)+"\n"+ 1503 "numHits="+numHits+", "+ 1504 "heap.size="+heap.size()+", "+ 1505 "numHits="+numHits+", "+ 1506 "approxHits="+approxHits+"\n"; 1507 gapArray[0]=Tools.min(gapArray[0], site2); 1508 gapArray[gapArray.length-1]=Tools.max(gapArray[gapArray.length-1], site3); 1509 } 1510 if(verbose){System.err.println("@ site "+site2+", made gap array: "+Arrays.toString(gapArray));} 1511 // assert(false) : Arrays.toString(locArray); 1512 } 1513 1514 1515 //This block is optional, but tries to eliminate multiple identical alignments 1516 1517 SiteScore ss=null; 1518 final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined; 1519 final boolean inbounds=(site2>=0 && site3<Data.chromLengths[chrom]); 1520 // if(!inbounds){System.err.println("Index tossed out-of-bounds site chr"+chrom+", "+site2+"-"+site3);} 1521 1522 if(inbounds && !SEMIPERFECTMODE && !PERFECTMODE && gapArray==null && prevSS!=null && 1523 prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){ 1524 1525 final int betterScore=Tools.max(score, prevSS.score); 1526 final int minStart=Tools.min(prevSS.start, site2); 1527 final int maxStop=Tools.max(prevSS.stop, site3); 1528 final boolean perfect2=USE_EXTENDED_SCORE && prevSS.score==maxScore && fullyDefined; 1529 assert(!USE_EXTENDED_SCORE || perfect2==prevSS.perfect); 1530 1531 final boolean shortEnough=(!LIMIT_SUBSUMPTION_LENGTH_TO_2X || (maxStop-minStart<2*bases.length)); 1532 1533 if(prevSS.start==site2 && prevSS.stop==site3){ 1534 prevSS.score=prevSS.quickScore=betterScore; 1535 prevSS.perfect=(prevSS.perfect || perfect1 || perfect2); 1536 if(prevSS.perfect){prevSS.semiperfect=true;} 1537 }else if(SUBSUME_SAME_START_SITES && shortEnough && prevSS.start==site2 && !prevSS.semiperfect){ 1538 if(perfect2){ 1539 //do nothing 1540 }else if(perfect1){ 1541 prevSS.setStop(site3); 1542 if(!prevSS.perfect){perfectsFound++;}//***$ 1543 prevSS.perfect=prevSS.semiperfect=true; 1544 }else{ 1545 prevSS.setStop(maxStop); 1546 prevSS.setPerfect(bases); 1547 } 1548 prevSS.score=prevSS.quickScore=betterScore; 1549 }else if(SUBSUME_SAME_STOP_SITES && shortEnough && prevSS.stop==site3 && !prevSS.semiperfect){ 1550 if(perfect2){ 1551 //do nothing 1552 }else if(perfect1){ 1553 prevSS.setStart(site2); 1554 if(!prevSS.perfect){perfectsFound++;}//***$ 1555 prevSS.perfect=prevSS.semiperfect=true; 1556 }else{ 1557 prevSS.setStart(minStart); 1558 prevSS.setPerfect(bases); 1559 } 1560 prevSS.score=prevSS.quickScore=betterScore; 1561 }else if(SUBSUME_OVERLAPPING_SITES && shortEnough && (maxStop-minStart<=bases.length+MAX_SUBSUMPTION_LENGTH) 1562 && !perfect1 && !perfect2 && !prevSS.semiperfect){ 1563 prevSS.setLimits(minStart, maxStop); 1564 prevSS.score=prevSS.quickScore=betterScore; 1565 prevSS.setPerfect(bases); 1566 }else{ 1567 ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1); 1568 if(!perfect1){ss.setPerfect(bases);} 1569 if(verbose){System.err.println("A) Index made SiteScore "+ss.toText()+", "+Arrays.toString(ss.gaps));} 1570 assert(!perfect1 || ss.stop-ss.start==bases.length-1); 1571 } 1572 assert(!perfect2 || prevSS.stop-prevSS.start==bases.length-1); 1573 }else if(inbounds){ 1574 ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1); 1575 if(!perfect1){ss.setPerfect(bases);} 1576 ss.gaps=gapArray; 1577 if(verbose){System.err.println("B) Index made SiteScore "+ss.toText()+", "+Arrays.toString(ss.gaps));} 1578 } 1579 1580 assert(ss==null || !ss.perfect || ss.semiperfect) : ss; 1581 assert(prevSS==null || !prevSS.perfect || prevSS.semiperfect) : "\n"+SiteScore.header()+"\n"+ss+"\n"+prevSS; 1582 if(ss!=null && ((SEMIPERFECTMODE && !ss.semiperfect) || (PERFECTMODE && !ss.perfect))){ss=null;} 1583 1584 if(ss!=null){ 1585 // System.out.println("Added site "+ss.toText()+", qscore="+qscore); 1586 ssl.add(ss); 1587 if(ss.perfect){ 1588 1589 if(prevSS==null || !prevSS.perfect || !ss.overlaps(prevSS)){ 1590 if(prevSS==null){assert ssl.size()<2 || !ss.overlaps(ssl.get(ssl.size()-2));} 1591 perfectsFound++; 1592 1593 //Human-specific code 1594 // if(QUIT_AFTER_TWO_PERFECTS){ 1595 // if(perfectsFound>=3 || (perfectsFound>=2 && chrom<24)){break;} 1596 // } 1597 1598 if(QUIT_AFTER_TWO_PERFECTS && perfectsFound>=2){break;} 1599 } 1600 } 1601 1602 prevSS=ss; 1603 }else{ 1604 // System.out.println("Subsumed site "+new SiteScore(chrom, strand, site2, site3, score).toText()); 1605 } 1606 } 1607 } 1608 1609 while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements 1610 final Quad t2=heap.poll(); 1611 final int row=t2.row+1, col=t2.column; 1612 if(row<stops[col]){ 1613 t2.row=row; 1614 1615 int a=t2.list[row]; 1616 int a2; 1617 if((a&SITE_MASK)>=offsets[col]){ 1618 a2=a-offsets[col]; 1619 1620 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 1621 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ 1622 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 1623 }else{ 1624 int ch=numberToChrom(a, baseChrom); 1625 int st=numberToSite(a); 1626 int st2=Tools.max(st-offsets[col], 0); 1627 a2=toNumber(st2, ch); 1628 1629 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 1630 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ 1631 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 1632 } 1633 1634 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 1635 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ 1636 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 1637 1638 t2.site=a2; 1639 values[col]=a2; 1640 heap.add(t2); 1641 }else if(heap.size()<approxHitsCutoff || PERFECTMODE){ 1642 assert(USE_EXTENDED_SCORE); 1643 bestScores[0]=Tools.max(bestScores[0], currentTopScore); 1644 bestScores[1]=Tools.max(bestScores[1], maxHits); 1645 bestScores[2]=Tools.max(bestScores[2], qcutoff); 1646 bestScores[3]=Tools.max(bestScores[3], bestqscore); 1647 1648 bestScores[4]=maxQuickScore; 1649 bestScores[5]=perfectsFound; //***$ fixed by adding this line 1650 if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;} 1651 1652 return ssl; 1653 } 1654 if(heap.isEmpty()){ 1655 assert(false) : heap.size()+", "+approxHitsCutoff; 1656 break; 1657 } 1658 } 1659 1660 } 1661 1662 assert(USE_EXTENDED_SCORE); 1663 bestScores[0]=Tools.max(bestScores[0], currentTopScore); 1664 bestScores[1]=Tools.max(bestScores[1], maxHits); 1665 bestScores[2]=Tools.max(bestScores[2], qcutoff); 1666 bestScores[3]=Tools.max(bestScores[3], bestqscore); 1667 1668 bestScores[4]=maxQuickScore; 1669 bestScores[5]=perfectsFound; 1670 if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;} 1671 1672 return ssl; 1673 } 1674 1675 findMaxQscore2(final int[] starts, final int[] stops, final int[] offsets, final int[] keyScores, final int baseChrom_, final Quad[] triples, final int[] values, final int prevMaxHits, boolean earlyExit, boolean perfectOnly)1676 private final int[] findMaxQscore2(final int[] starts, final int[] stops, final int[] offsets, final int[] keyScores, 1677 final int baseChrom_, final Quad[] triples, final int[] values, final int prevMaxHits, 1678 boolean earlyExit, boolean perfectOnly){ 1679 1680 final int numHits=offsets.length; 1681 assert(numHits>=prevMaxHits); 1682 1683 final int baseChrom=baseChrom(baseChrom_); 1684 final Block b=index[baseChrom]; 1685 final int[] sizes=sizeArray; 1686 1687 heap.clear(); 1688 for(int i=0; i<numHits; i++){ 1689 final int[] sites=b.sites; 1690 final int start=starts[i]; 1691 sizes[i]=b.length(start, stops[i]); 1692 assert(sizes[i]>0); 1693 1694 int a=sites[start]; 1695 int a2; 1696 if((a&SITE_MASK)>=offsets[i]){ 1697 a2=a-offsets[i]; 1698 }else{ 1699 int ch=numberToChrom(a, baseChrom); 1700 int st=numberToSite(a); 1701 int st2=Tools.max(st-offsets[i], 0); 1702 a2=toNumber(st2, ch); 1703 } 1704 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); 1705 1706 Quad t=triples[i]; 1707 assert(t!=null) : "Should be using tripleStorage"; 1708 assert(i==t.column); 1709 t.row=start; 1710 t.site=a2; 1711 t.list=sites; 1712 values[i]=a2; 1713 1714 heap.add(t); 1715 } 1716 1717 final int maxQuickScore=maxQuickScore(offsets, keyScores); 1718 1719 int topQscore=-999999999; 1720 1721 int maxHits=0; 1722 // int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP; 1723 1724 1725 int approxHitsCutoff; 1726 final int indelCutoff; 1727 if(perfectOnly){ 1728 approxHitsCutoff=numHits; 1729 indelCutoff=0; 1730 }else{ 1731 approxHitsCutoff=Tools.max(prevMaxHits, Tools.min(MIN_APPROX_HITS_TO_KEEP, numHits-1)); //Faster, same accuracy 1732 indelCutoff=MAX_INDEL2; 1733 } 1734 1735 1736 while(!heap.isEmpty()){ 1737 Quad t=heap.peek(); 1738 final int site=t.site; 1739 final int centerIndex=t.column; 1740 1741 int maxNearbySite=site; 1742 1743 1744 int approxHits=0; 1745 {//Inner loop 1746 final int minsite=site-Tools.min(MAX_INDEL, indelCutoff), maxsite=site+MAX_INDEL2; 1747 for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){ 1748 final int x=values[column]; 1749 assert(x==triples[column].site); 1750 if(x>=minsite && x<=maxsite){ 1751 maxNearbySite=(x>maxNearbySite ? x : maxNearbySite); 1752 approxHits++; 1753 }else{chances--;} 1754 } 1755 } 1756 hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++; 1757 1758 assert(centerIndex>=0) : centerIndex; 1759 1760 //I don't remember what this assertion was for or why, but it's causing trouble. 1761 //assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; 1762 if(approxHits>=approxHitsCutoff){ 1763 1764 int qscore=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits); 1765 1766 if(ADD_SCORE_Z){ 1767 int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); 1768 qscore+=scoreZ; 1769 } 1770 1771 if(qscore>topQscore){ 1772 1773 // maxHits=Tools.max(approxHits, maxHits); 1774 // approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits); //Best setting for pre-scan 1775 1776 maxHits=Tools.max(approxHits, maxHits); 1777 approxHitsCutoff=Tools.max(approxHitsCutoff, approxHits-1); //Best setting for pre-scan 1778 1779 topQscore=qscore; 1780 1781 if(qscore>=maxQuickScore){ 1782 assert(qscore==maxQuickScore); 1783 assert(approxHits==numHits); 1784 if(earlyExit){ 1785 return new int[] {topQscore, maxHits}; 1786 } 1787 } 1788 } 1789 } 1790 1791 while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements 1792 final Quad t2=heap.poll(); 1793 final int row=t2.row+1, col=t2.column; 1794 if(row<stops[col]){ 1795 t2.row=row; 1796 1797 int a=t2.list[row]; 1798 int a2; 1799 if((a&SITE_MASK)>=offsets[col]){ 1800 a2=a-offsets[col]; 1801 1802 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 1803 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+ 1804 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 1805 }else{ 1806 int ch=numberToChrom(a, baseChrom); 1807 int st=numberToSite(a); 1808 int st2=Tools.max(st-offsets[col], 0); 1809 a2=toNumber(st2, ch); 1810 1811 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 1812 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+ 1813 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 1814 } 1815 1816 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 1817 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+ 1818 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 1819 1820 t2.site=a2; 1821 values[col]=a2; 1822 heap.add(t2); 1823 }else if(earlyExit && (perfectOnly || heap.size()<approxHitsCutoff)){ 1824 return new int[] {topQscore, maxHits}; 1825 } 1826 if(heap.isEmpty()){break;} 1827 } 1828 1829 } 1830 1831 1832 1833 return new int[] {topQscore, maxHits}; 1834 } 1835 1836 absdif(int a, int b)1837 private static final int absdif(int a, int b){ 1838 return a>b ? a-b : b-a; 1839 } 1840 1841 1842 @Override maxScore(int[] offsets, byte[] baseScores, int[] keyScores, int readlen, boolean useQuality)1843 final int maxScore(int[] offsets, byte[] baseScores, int[] keyScores, int readlen, boolean useQuality){ 1844 1845 if(useQuality){ 1846 //These lines apparently MUST be used if quality is used later on for slow align. 1847 if(USE_AFFINE_SCORE){return msa.maxQuality(baseScores);} 1848 if(USE_EXTENDED_SCORE){return readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5)+Tools.sumInt(baseScores);} 1849 }else{ 1850 if(USE_AFFINE_SCORE){return msa.maxQuality(readlen);} 1851 if(USE_EXTENDED_SCORE){return readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5);} 1852 } 1853 1854 return maxQuickScore(offsets, keyScores); 1855 } 1856 1857 maxQuickScore(int[] offsets, int[] keyScores)1858 public final int maxQuickScore(int[] offsets, int[] keyScores){ 1859 1860 // int x=offsets.length*BASE_KEY_HIT_SCORE; 1861 int x=Tools.intSum(keyScores); 1862 int y=Y_SCORE_MULT*(offsets[offsets.length-1]-offsets[0]); 1863 // if(ADD_LIST_SIZE_BONUS){x+=(LIST_SIZE_BONUS[1]*offsets.length);} 1864 // assert(!ADD_SCORE_Z) : "Need to make sure this is correct..."; 1865 1866 // if(ADD_SCORE_Z){x+=((offsets[offsets.length-1]+CHUNKSIZE)*Z_SCORE_MULT);} 1867 if(ADD_SCORE_Z){x+=maxScoreZ(offsets);} 1868 1869 return x+y; 1870 // int bonus=(2*(HIT_SCORE/2)); //For matching both ends 1871 // return x+y+bonus; 1872 } 1873 1874 quickScore(final int[] locs, final int[] keyScores, final int centerIndex, final int offsets[], int[] sizes, final boolean penalizeIndels, final int numApproxHits, final int numHits)1875 private final int quickScore(final int[] locs, final int[] keyScores, final int centerIndex, final int offsets[], 1876 int[] sizes, final boolean penalizeIndels, final int numApproxHits, final int numHits){ 1877 1878 hist_hits_score[Tools.min(HIT_HIST_LEN, numApproxHits)]++; 1879 if(numApproxHits==1){return keyScores[centerIndex];} 1880 1881 //Done! 1882 //Correct way to calculate score: 1883 //Find the first chunk that exactly hits the center. 1884 //Then, align leftward of it, and align rightward of it, and sum the scores. 1885 1886 //"-centerIndex" is a disambiguating term that, given otherwise identical match patterns 1887 //(for example, a small indel will generate two valid site candidates), choose the lower site. 1888 1889 int x=keyScores[centerIndex]+scoreLeft(locs, keyScores, centerIndex, sizes, penalizeIndels)+ 1890 scoreRight(locs, keyScores, centerIndex, sizes, penalizeIndels, numHits)-centerIndex; 1891 1892 int y=Y_SCORE_MULT*scoreY(locs, centerIndex, offsets); 1893 if(ADD_LIST_SIZE_BONUS){x+=calcListSizeBonus(sizes[centerIndex]);} 1894 // int z=scoreZ(locs, hits); 1895 return x+y; 1896 } 1897 1898 1899 // /** Generates a term that increases score with how many bases in the read match the ref. */ 1900 // public static final int scoreZ(int[] locs, int centerIndex, int offsets[]){ 1901 // final int center=locs[centerIndex]; 1902 // 1903 // final int[] refLoc=new int[offsets[offsets.length-1]+CHUNKSIZE]; 1904 // 1905 // final int maxLoc=center+MAX_INDEL2; 1906 // final int minLoc=Tools.max(0, center-MAX_INDEL); 1907 // 1908 // int score=0; 1909 // 1910 // for(int i=0; i<locs.length; i++){ 1911 // int loc=locs[i]; 1912 //// int dif=absdif(loc, center); 1913 // if(loc>=minLoc && loc<=maxLoc){ 1914 //// assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+ 1915 //// "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets); 1916 // 1917 // int offset=offsets[i]; 1918 // int max=CHUNKSIZE+offset; 1919 // 1920 // for(int j=offset; j<max; j++){ 1921 // int old=refLoc[j]; 1922 // if(old==0){ 1923 // refLoc[j]=loc; 1924 // score+=4; 1925 // }else if(old>loc){ 1926 // refLoc[j]=loc; 1927 // score-=2; 1928 // }else if(old==loc){ 1929 // score-=1; 1930 // //do nothing, perhaps, or add 1? 1931 // }else{ 1932 // score-=2; 1933 // assert(old<loc); 1934 // } 1935 // } 1936 // } 1937 // } 1938 // return score; 1939 // } 1940 1941 1942 extendScore(final byte[] bases, final byte[] baseScores, final int[] offsets, final int[] values, final int chrom, final int centerIndex, final int[] locArray, final int numHits, final int numApproxHits)1943 private final int extendScore(final byte[] bases, final byte[] baseScores, final int[] offsets, final int[] values, 1944 final int chrom, final int centerIndex, final int[] locArray, final int numHits, final int numApproxHits){ 1945 callsToExtendScore++; 1946 hist_hits_extend[Tools.min(HIT_HIST_LEN, numApproxHits)]++; 1947 1948 final int centerVal=values[centerIndex]; 1949 final int centerLoc=numberToSite(centerVal); 1950 1951 final int minLoc=Tools.max(0, centerLoc-MAX_INDEL); //Legacy, for assertions 1952 final int maxLoc=centerLoc+MAX_INDEL2; //Legacy, for assertions 1953 1954 final int minVal=centerVal-MAX_INDEL; 1955 final int maxVal=centerVal+MAX_INDEL2; 1956 1957 final byte[] ref=Data.getChromosome(chrom).array; 1958 1959 if(verbose){ 1960 System.err.println("\n"); 1961 System.err.println("minLoc="+minLoc+", maxLoc="+ maxLoc+", centerIndex="+centerIndex+", centerVal="+centerVal+", centerLoc="+centerLoc); 1962 System.err.println("minVal="+minVal+", maxVal="+ maxVal+", numHits="+numHits+", numApproxHits="+numApproxHits); 1963 System.err.println("offsets:\t"+Arrays.toString(offsets)); 1964 System.err.println("values:\t"+Arrays.toString(values)); 1965 System.err.println(); 1966 int centerOffset=offsets[centerIndex]; 1967 1968 for(int i=0; i<centerOffset; i++){System.err.print(" ");} 1969 for(int i=centerOffset; i<centerOffset+KEYLEN; i++){System.err.print((char)bases[i]);} 1970 System.err.println(); 1971 1972 System.err.println(new String(bases)); 1973 System.err.println(new String(KillSwitch.copyOfRange(ref, centerLoc, centerLoc+bases.length))); 1974 System.err.println(); 1975 } 1976 1977 // int[] locArray=new int[bases.length]; 1978 Arrays.fill(locArray, -1); 1979 1980 1981 //First fill in reverse 1982 for(int i=0, keynum=0; i<numHits; i++){ 1983 final int value=values[i]; 1984 1985 if(value>=minVal && value<=maxVal){ 1986 final int refbase=numberToSite(value); 1987 assert(refbase>=minLoc && refbase<=maxLoc) : refbase+", "+minLoc+", "+maxLoc+", "+value+", "+maxVal+"\n"+new String(bases)+"\n"; 1988 1989 // System.out.println("numApproxHits="+numApproxHits+", numHits="+numHits+", i="+i+", minVal="+minVal+", value="+value+", maxVal="+maxVal+ 1990 // ", refbase="+refbase+", minLoc="+minLoc+", maxLoc="+maxLoc+", keynum="+keynum); 1991 // System.out.println("Reverse: Trying key "+refbase+" @ "+offsets[i]); 1992 // System.out.println("Passed!"); 1993 // 1994 // System.out.println("Number: \t"+Long.toHexString(value|(1l<<63))); 1995 // System.out.println("Mask: \t"+Long.toHexString(SITE_MASK|(1l<<63))); 1996 // System.out.println("Both: \t"+Long.toHexString((value&SITE_MASK)|(1l<<63))); 1997 1998 keynum++; 1999 final int callbase=offsets[i]; 2000 2001 int misses=0; 2002 for(int cloc=callbase+KEYLEN-1, rloc=refbase+cloc; cloc>=0 && rloc>=0 && rloc<ref.length; cloc--, rloc--){ 2003 int old=locArray[cloc]; 2004 if(old==refbase){ 2005 // System.out.println("Broke because old="+old+", refbase="+refbase); 2006 break; 2007 } //Already filled with present value 2008 if(misses>0 && old>=0){ 2009 // System.out.println("Broke because old="+old+", misses="+misses); 2010 break; 2011 } //Already filled with something that has no errors 2012 byte c=bases[cloc]; 2013 byte r=ref[rloc]; 2014 2015 if(c==r){ 2016 if(old<0 || refbase==centerLoc){ //If the cell is empty or this key corresponds to center 2017 locArray[cloc]=refbase; 2018 } 2019 }else{ 2020 misses++; 2021 //Only extends first key all the way back. Others stop at the first error. 2022 if(old>=0 || keynum>1){ 2023 // System.out.println("Broke because old="+old+", keynum="+keynum); 2024 break; 2025 } 2026 } 2027 } 2028 } 2029 } 2030 2031 2032 2033 //Then fill forward 2034 for(int i=0; i<numHits; i++){ 2035 final int value=values[i]; 2036 2037 if(value>=minVal && value<=maxVal){ 2038 final int refbase=numberToSite(value); 2039 assert(refbase>=minLoc && refbase<=maxLoc); 2040 final int callbase=offsets[i]; 2041 2042 int misses=0; 2043 for(int cloc=callbase+KEYLEN, rloc=refbase+cloc; cloc<bases.length && rloc<ref.length; cloc++, rloc++){ 2044 int old=locArray[cloc]; 2045 if(old==refbase){break;} //Already filled with present value 2046 if(misses>0 && old>=0){break;} //Already filled with something that has no errors 2047 byte c=bases[cloc]; 2048 byte r=ref[rloc]; 2049 2050 if(c==r){ 2051 if(old<0 || refbase==centerLoc){ //If the cell is empty or this key corresponds to center 2052 locArray[cloc]=refbase; 2053 } 2054 }else{ 2055 misses++; 2056 if(old>=0){break;} //Already filled with something that has no errors 2057 } 2058 } 2059 } 2060 } 2061 2062 // //Change 'N' to -2. A bit slow. 2063 // { 2064 // int firstMatch=0; 2065 // while(firstMatch<locArray.length && locArray[firstMatch]<0){firstMatch++;} 2066 // assert(firstMatch<locArray.length) : new String(bases); 2067 // int last=locArray[firstMatch]; 2068 // for(int i=firstMatch-1; i>=0; i--){ 2069 // final byte c=bases[i]; 2070 // if(c=='N'){locArray[i]=-2;} 2071 // else{ 2072 // assert(locArray[i]==-1); 2073 // final int rloc=last+i; 2074 // byte r=ref[rloc]; 2075 // if(r=='N'){locArray[i]=-2;} 2076 // } 2077 // } 2078 // for(int i=firstMatch; i<locArray.length; i++){ 2079 // final int loc=locArray[i]; 2080 // if(last<1){last=loc;} 2081 // final byte c=bases[i]; 2082 // if(c=='N'){locArray[i]=-2;} 2083 // else if(loc==-1 && last>0){ 2084 // final int rloc=last+i; 2085 // byte r=ref[rloc]; 2086 // if(r=='N'){locArray[i]=-2;} 2087 // } 2088 // } 2089 // } 2090 2091 // System.out.println("$$$\n"+Arrays.toString(locArray)); 2092 // assert(false) : "hits="+numHits+", centerIndex="+centerIndex+", centerVal="+centerVal+", centerLoc="+centerLoc+ 2093 // ", minLoc="+minLoc+", maxLoc="+maxLoc+", minVal="+minVal+", maxVal="+maxVal; 2094 2095 //Change 'N' to -2, but only for nocalls, not norefs. Much faster. 2096 { 2097 final byte nb=(byte)'N'; 2098 for(int i=0; i<bases.length; i++){ 2099 if(bases[i]==nb){locArray[i]=-2;} 2100 } 2101 } 2102 2103 if(USE_AFFINE_SCORE){ 2104 /* TODO - sometimes returns a higher score than actual alignment. This should never happen. */ 2105 int score=(KFILTER<2 ? msa.calcAffineScore(locArray, baseScores, bases) : 2106 msa.calcAffineScore(locArray, baseScores, bases, KFILTER)); 2107 return score; 2108 } 2109 2110 int score=0; 2111 int lastLoc=-1; 2112 int centerBonus=BASE_HIT_SCORE/5; 2113 for(int i=0; i<locArray.length; i++){ 2114 int loc=locArray[i]; 2115 if(loc>=0){ 2116 score+=BASE_HIT_SCORE+baseScores[i]; 2117 if(loc==centerLoc){score+=centerBonus;} 2118 if(loc!=lastLoc && lastLoc>=0){ 2119 int dif=absdif(loc, lastLoc); 2120 int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*dif, MAX_PENALTY_FOR_MISALIGNED_HIT); 2121 score-=penalty; 2122 } 2123 lastLoc=loc; 2124 } 2125 } 2126 2127 // System.err.println("Extended score: "+score); 2128 // System.err.println(Arrays.toString(locArray)); 2129 2130 2131 return score; 2132 } 2133 2134 2135 /** NOTE! This destroys the locArray, so use a copy if needed. */ makeGapArray(int[] locArray, int minLoc, int minGap)2136 private static final int[] makeGapArray(int[] locArray, int minLoc, int minGap){ 2137 int gaps=0; 2138 boolean doSort=false; 2139 2140 if(locArray[0]<0){locArray[0]=minLoc;} 2141 for(int i=1; i<locArray.length; i++){ 2142 if(locArray[i]<0){locArray[i]=locArray[i-1]+1;} 2143 else{locArray[i]+=i;} 2144 if(locArray[i]<locArray[i-1]){doSort=true;} 2145 } 2146 2147 // System.err.println(Arrays.toString(locArray)+"\n"); 2148 2149 if(doSort){ 2150 // System.err.println("*"); 2151 Arrays.sort(locArray); 2152 } 2153 // System.err.println(Arrays.toString(locArray)+"\n"); 2154 2155 for(int i=1; i<locArray.length; i++){ 2156 int dif=locArray[i]-locArray[i-1]; 2157 assert(dif>=0); 2158 if(dif>minGap){ 2159 gaps++; 2160 } 2161 } 2162 if(gaps<1){return null;} 2163 int[] out=new int[2+gaps*2]; 2164 out[0]=locArray[0]; 2165 out[out.length-1]=locArray[locArray.length-1]; 2166 2167 for(int i=1, j=1; i<locArray.length; i++){ 2168 int dif=locArray[i]-locArray[i-1]; 2169 assert(dif>=0); 2170 if(dif>minGap){ 2171 out[j]=locArray[i-1]; 2172 out[j+1]=locArray[i]; 2173 j+=2; 2174 } 2175 } 2176 return out; 2177 } 2178 2179 2180 /** Generates a term that increases score with how many bases in the read match the ref. */ scoreZ2(int[] locs, int centerIndex, int offsets[], int numApproxHits, int numHits)2181 private final int scoreZ2(int[] locs, int centerIndex, int offsets[], int numApproxHits, int numHits){ 2182 2183 if(numApproxHits==1){return SCOREZ_1KEY;} 2184 2185 final int center=locs[centerIndex]; 2186 2187 final int maxLoc=center+MAX_INDEL2; 2188 final int minLoc=Tools.max(0, center-MAX_INDEL); 2189 2190 int score=0; 2191 2192 int a0=-1, b0=-1; 2193 2194 for(int i=0; i<numHits; i++){ 2195 int loc=locs[i]; 2196 // int dif=absdif(loc, center); 2197 if(loc>=minLoc && loc<=maxLoc){ 2198 // assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+ 2199 // "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets); 2200 int a=offsets[i]; 2201 2202 if(b0<a){ 2203 score+=b0-a0; 2204 a0=a; 2205 } 2206 b0=a+KEYLEN; 2207 } 2208 } 2209 score+=b0-a0; 2210 score=score*Z_SCORE_MULT; 2211 // assert(score==scoreZslow(locs, centerIndex, offsets, false)) : scoreZslow(locs, centerIndex, offsets, true)+" != "+score; 2212 return score; 2213 } 2214 2215 @Deprecated 2216 /** This was just to verify scoreZ2. */ scoreZslow(int[] locs, int centerIndex, int offsets[], boolean display)2217 private final int scoreZslow(int[] locs, int centerIndex, int offsets[], boolean display){ 2218 final int center=locs[centerIndex]; 2219 2220 final int maxLoc=center+MAX_INDEL2; 2221 final int minLoc=Tools.max(0, center-MAX_INDEL); 2222 2223 byte[] array=new byte[offsets[offsets.length-1]+KEYLEN]; 2224 int score=0; 2225 2226 for(int i=0; i<locs.length; i++){ 2227 int loc=locs[i]; 2228 // int dif=absdif(loc, center); 2229 if(loc>=minLoc && loc<=maxLoc){ 2230 int pos=offsets[i]; 2231 // if(true){ 2232 // System.err.println("\ni="+i+", pos="+pos+", array=["+array.length+"], limit="+(pos+CHUNKSIZE-1)); 2233 // } 2234 for(int j=pos; j<pos+KEYLEN; j++){ 2235 if(array[j]==0){score++;} 2236 array[j]=1; 2237 } 2238 } 2239 } 2240 2241 if(display){System.err.println("\n"+Arrays.toString(array)+"\n");} 2242 2243 return score*Z_SCORE_MULT; 2244 } 2245 2246 /** Generates a term that increases score with how many bases in the read match the ref. */ maxScoreZ(int offsets[])2247 private final int maxScoreZ(int offsets[]){ 2248 int score=0; 2249 int a0=-1, b0=-1; 2250 2251 for(int i=0; i<offsets.length; i++){ 2252 int a=offsets[i]; 2253 2254 if(b0<a){ 2255 score+=b0-a0; 2256 a0=a; 2257 } 2258 b0=a+KEYLEN; 2259 2260 } 2261 score+=b0-a0; 2262 return score*Z_SCORE_MULT; 2263 } 2264 2265 scoreRight(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels, int numHits)2266 private final int scoreRight(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels, int numHits){ 2267 2268 int score=0; 2269 2270 int prev, loc=locs[centerIndex]; 2271 2272 for(int i=centerIndex+1; i<numHits; i++){ 2273 2274 if(locs[i]>=0){ 2275 prev=loc; 2276 loc=locs[i]; 2277 2278 int offset=absdif(loc, prev); 2279 2280 if(offset<=MAX_INDEL){ 2281 score+=keyScores[i]; 2282 if(ADD_LIST_SIZE_BONUS){score+=calcListSizeBonus(sizes[i]);} 2283 2284 // if(i==locs.length-1){score+=HIT_SCORE/2;} //Adds a bonus for matching the first or last key 2285 2286 if(penalizeIndels && offset!=0){ 2287 int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*offset, MAX_PENALTY_FOR_MISALIGNED_HIT); 2288 score-=penalty; 2289 // score-=(INDEL_PENALTY+Tools.min(INDEL_PENALTY_MULT*offset, 1+HIT_SCORE/4)); 2290 } 2291 }else{ 2292 loc=prev; 2293 } 2294 } 2295 2296 } 2297 return score; 2298 2299 } 2300 scoreLeft(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels)2301 private final int scoreLeft(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels){ 2302 2303 callsToScore++; 2304 2305 int score=0; 2306 2307 int prev, loc=locs[centerIndex]; 2308 2309 for(int i=centerIndex-1; i>=0; i--){ 2310 2311 if(locs[i]>=0){ 2312 prev=loc; 2313 loc=locs[i]; 2314 2315 int offset=absdif(loc, prev); 2316 2317 if(offset<=MAX_INDEL){ 2318 score+=keyScores[i]; 2319 if(ADD_LIST_SIZE_BONUS){score+=calcListSizeBonus(sizes[i]);} 2320 2321 // if(i==0){score+=HIT_SCORE/2;} //Adds a bonus for matching the first or last key 2322 if(penalizeIndels && offset!=0){ 2323 int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*offset, MAX_PENALTY_FOR_MISALIGNED_HIT); 2324 score-=penalty; 2325 } 2326 }else{ 2327 loc=prev; 2328 } 2329 } 2330 2331 } 2332 return score; 2333 2334 } 2335 2336 /** Encode a (location, chrom) pair to an index */ toNumber(int site, int chrom)2337 private static final int toNumber(int site, int chrom){ 2338 int out=(chrom&CHROM_MASK_LOW); 2339 out=out<<SHIFT_LENGTH; 2340 out=(out|site); 2341 return out; 2342 } 2343 2344 /** Decode an (index, baseChrom) pair to a chromosome */ numberToChrom(int number, int baseChrom)2345 private static final int numberToChrom(int number, int baseChrom){ 2346 assert((baseChrom&CHROM_MASK_LOW)==0) : Integer.toHexString(number)+", baseChrom="+baseChrom; 2347 assert(baseChrom>=0) : Integer.toHexString(number)+", baseChrom="+baseChrom; 2348 int out=(number>>>SHIFT_LENGTH); 2349 out=out+(baseChrom&CHROM_MASK_HIGH); 2350 return out; 2351 } 2352 2353 /** Decode an index to a location */ numberToSite(int number)2354 private static final int numberToSite(int number){ 2355 return (number&SITE_MASK); 2356 } 2357 minChrom(int chrom)2358 public static final int minChrom(int chrom){return Tools.max(MINCHROM, chrom&CHROM_MASK_HIGH);} baseChrom(int chrom)2359 public static final int baseChrom(int chrom){return Tools.max(0, chrom&CHROM_MASK_HIGH);} maxChrom(int chrom)2360 public static final int maxChrom(int chrom){return Tools.max(MINCHROM, Tools.min(MAXCHROM, chrom|CHROM_MASK_LOW));} 2361 2362 getOffsetArray(int len)2363 private final int[] getOffsetArray(int len){ 2364 if(len>=offsetArrays.length){return new int[len];} 2365 if(offsetArrays[len]==null){offsetArrays[len]=new int[len];} 2366 return offsetArrays[len]; 2367 } getLocArray(int len)2368 private final int[] getLocArray(int len){ 2369 if(len>=locArrays.length){return new int[len];} 2370 if(locArrays[len]==null){locArrays[len]=new int[len];} 2371 return locArrays[len]; 2372 } getGreedyListArray(int len)2373 private final int[] getGreedyListArray(int len){ 2374 if(len>=greedyListArrays.length){return new int[len];} 2375 if(greedyListArrays[len]==null){greedyListArrays[len]=new int[len];} 2376 return greedyListArrays[len]; 2377 } getGenericArray(int len)2378 private final int[] getGenericArray(int len){ 2379 if(len>=genericArrays.length){return new int[len];} 2380 if(genericArrays[len]==null){genericArrays[len]=new int[len];} 2381 return genericArrays[len]; 2382 } 2383 2384 @Override getBaseScoreArray(int len, int strand)2385 final byte[] getBaseScoreArray(int len, int strand){ 2386 if(len>=baseScoreArrays[0].length){return new byte[len];} 2387 if(baseScoreArrays[strand][len]==null){baseScoreArrays[strand][len]=new byte[len];} 2388 return baseScoreArrays[strand][len]; 2389 } 2390 @Override getKeyScoreArray(int len, int strand)2391 final int[] getKeyScoreArray(int len, int strand){ 2392 if(len>=keyScoreArrays.length){return new int[len];} 2393 if(keyScoreArrays[strand][len]==null){keyScoreArrays[strand][len]=new int[len];} 2394 return keyScoreArrays[strand][len]; 2395 } getKeyWeightArray(int len)2396 private final float[] getKeyWeightArray(int len){ 2397 if(len>=keyWeightArrays.length){return new float[len];} 2398 if(keyWeightArrays[len]==null){keyWeightArrays[len]=new float[len];} 2399 return keyWeightArrays[len]; 2400 } 2401 @Override keyProbArray()2402 float[] keyProbArray() { 2403 return keyProbArray; 2404 } 2405 2406 public static final int KMER_ARRAY_LENGTH=1201; 2407 public static final int HEAP_LENGTH=2047; 2408 public static final int BASE_ARRAY_LENGTH=6001; 2409 2410 private final int[][] locArrays=new int[BASE_ARRAY_LENGTH][]; 2411 private final int[] valueArray=new int[HEAP_LENGTH]; 2412 private final int[] sizeArray=new int[HEAP_LENGTH]; 2413 private final int[][] offsetArrays=new int[KMER_ARRAY_LENGTH][]; 2414 private final int[][] greedyListArrays=new int[KMER_ARRAY_LENGTH][]; 2415 private final int[][] genericArrays=new int[KMER_ARRAY_LENGTH][]; 2416 private final int[] startArray=new int[HEAP_LENGTH]; 2417 private final int[] stopArray=new int[HEAP_LENGTH]; 2418 private final Quad[] tripleStorage=makeQuadStorage(HEAP_LENGTH); 2419 private final int[] greedyReturn=KillSwitch.allocInt1D(2); 2420 private final int[][] shrinkReturn2=new int[3][]; 2421 private final int[][] shrinkReturn3=new int[5][]; 2422 private final int[][] prescanReturn=new int[2][]; 2423 private final int[] prescoreArray; 2424 private final int[] precountArray; 2425 2426 private final byte[][][] baseScoreArrays=new byte[2][BASE_ARRAY_LENGTH][]; 2427 private final int[][][] keyScoreArrays=new int[2][KMER_ARRAY_LENGTH][]; 2428 final float[] keyProbArray=new float[BASE_ARRAY_LENGTH]; 2429 private final float[][] keyWeightArrays=new float[KMER_ARRAY_LENGTH][]; 2430 2431 makeQuadStorage(int number)2432 private final static Quad[] makeQuadStorage(int number){ 2433 Quad[] r=new Quad[number]; 2434 for(int i=0; i<number; i++){r[i]=new Quad(i, 0, 0);} 2435 return r; 2436 } 2437 2438 2439 private final QuadHeap heap=new QuadHeap(HEAP_LENGTH); 2440 2441 static int SHIFT_LENGTH=(32-1-NUM_CHROM_BITS); 2442 static int MAX_ALLOWED_CHROM_INDEX=~((-1)<<SHIFT_LENGTH); 2443 2444 /** Mask the number to get the site, which is in the lower bits */ 2445 static int SITE_MASK=((-1)>>>(NUM_CHROM_BITS+1)); 2446 2447 /** Mask the chromosome's high bits to get the low bits */ 2448 static int CHROM_MASK_LOW=CHROMS_PER_BLOCK-1; 2449 2450 /** Mask the chromosome's lower bits to get the high bits */ 2451 static int CHROM_MASK_HIGH=~CHROM_MASK_LOW; 2452 setChromBits(int x)2453 static void setChromBits(int x){ 2454 2455 NUM_CHROM_BITS=x; 2456 CHROMS_PER_BLOCK=(1<<(NUM_CHROM_BITS)); 2457 SHIFT_LENGTH=(32-1-NUM_CHROM_BITS); 2458 MAX_ALLOWED_CHROM_INDEX=~((-1)<<SHIFT_LENGTH); 2459 SITE_MASK=((-1)>>>(NUM_CHROM_BITS+1)); 2460 CHROM_MASK_LOW=CHROMS_PER_BLOCK-1; 2461 CHROM_MASK_HIGH=~CHROM_MASK_LOW; 2462 2463 // assert(NUM_CHROM_BITS<30); 2464 assert(NUM_CHROM_BITS>=0); //max is 3 for human; perhaps more for other organisms 2465 // assert((1<<(NUM_CHROM_BITS))>=CHROMSPERBLOCK) : (1<<(NUM_CHROM_BITS))+" < "+CHROMSPERBLOCK; 2466 assert((1<<(NUM_CHROM_BITS))==CHROMS_PER_BLOCK) : (1<<(NUM_CHROM_BITS))+" < "+CHROMS_PER_BLOCK; 2467 assert(Integer.bitCount(CHROMS_PER_BLOCK)==1); 2468 assert(Integer.numberOfLeadingZeros(SITE_MASK)==(NUM_CHROM_BITS+1)) : Integer.toHexString(SITE_MASK); 2469 } 2470 2471 private final int cycles; 2472 2473 public static final int BASE_HIT_SCORE=100; 2474 public static final int ALIGN_COLUMNS=7600; 2475 public static int MAX_INDEL=100; //Max indel length, min 0, default 400; longer is more accurate 2476 public static int MAX_INDEL2=8*MAX_INDEL; 2477 2478 private final float INV_BASE_KEY_HIT_SCORE; 2479 private final int INDEL_PENALTY; //default (HIT_SCORE/2)-1 2480 private final int INDEL_PENALTY_MULT; //default 20; penalty for indel length 2481 private final int MAX_PENALTY_FOR_MISALIGNED_HIT; 2482 private final int SCOREZ_1KEY; 2483 2484 public static final boolean ADD_SCORE_Z=true; //Increases quality, decreases speed 2485 public static final int Z_SCORE_MULT=25; 2486 public static final int Y_SCORE_MULT=10; 2487 2488 2489 /** 2490 * Return only sites that match completely or with partial no-reference 2491 */ setSemiperfectMode()2492 public static void setSemiperfectMode() { 2493 assert(!PERFECTMODE); 2494 SEMIPERFECTMODE=true; 2495 PRESCAN_QSCORE=false; 2496 // MIN_APPROX_HITS_TO_KEEP++; 2497 2498 2499 2500 MAX_INDEL=0; 2501 MAX_INDEL2=0; 2502 } 2503 2504 /** 2505 * Return only sites that match completely 2506 */ setPerfectMode()2507 public static void setPerfectMode() { 2508 assert(!SEMIPERFECTMODE); 2509 PERFECTMODE=true; 2510 PRESCAN_QSCORE=false; 2511 // MIN_APPROX_HITS_TO_KEEP++; 2512 2513 2514 2515 MAX_INDEL=0; 2516 MAX_INDEL2=0; 2517 } 2518 2519 static float FRACTION_GENOME_TO_EXCLUDE=0.005f; //Default .04; lower is slower and more accurate 2520 setFractionToExclude(float f)2521 public static final void setFractionToExclude(float f){ 2522 assert(f>=0 && f<1); 2523 FRACTION_GENOME_TO_EXCLUDE=f; 2524 MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //default 810 2525 MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster, default 840 2526 MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //default 910 2527 MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //default 935 2528 MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //Default 860 2529 } 2530 2531 2532 /** Default .75. Range: 0 to 1 (but 0 will break everything). Lower is faster and less accurate. */ 2533 static final float HIT_FRACTION_TO_RETAIN=.97f; //default: .85 2534 /** Range: 0 to 1000. Lower should be faster and less accurate. */ 2535 static int MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //default 810 2536 /** Range: 2 to infinity. Lower should be faster and less accurate. */ 2537 static final int MIN_HIT_LISTS_TO_RETAIN=12; 2538 2539 static int MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster, default 840 2540 //lower is faster 2541 static int MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //default 910 2542 //lower is faster 2543 static int MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //default 935 2544 //lower is faster 2545 static int MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //Default 860 2546 2547 /** To increase accuracy on small genomes, override greedy list dismissal when the list is at most this long. */ 2548 public static final int SMALL_GENOME_LIST=80; 2549 2550 static{assert(!(TRIM_BY_GREEDY && TRIM_BY_TOTAL_SITE_COUNT)) : "Pick one.";} 2551 2552 static final int CLUMPY_MAX_DIST=5; //Keys repeating over intervals of this or less are clumpy. 2553 2554 /** Minimum length of list before clumpiness is considered. This is an index in the length histogram, from 0 to 1000. */ 2555 static final int CLUMPY_MIN_LENGTH_INDEX=2800; 2556 static final float CLUMPY_FRACTION=0.8f; //0 to 1; higher is slower but more stringent. 0.5 means the median distance is clumpy. 2557 2558 static final int MAX_SUBSUMPTION_LENGTH=MAX_INDEL2; 2559 2560 /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION1 when slowWalk3 is first entered */ 2561 public static final int MAX_HITS_REDUCTION1=2; 2562 2563 /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION2 dynamically when best score is exceeded */ 2564 public static int MAX_HITS_REDUCTION2=3; 2565 2566 /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION_PERFECT when perfect score is found */ 2567 public static final int MAX_HITS_REDUCTION_PERFECT=2; 2568 2569 public static int MAXIMUM_MAX_HITS_REDUCTION=6; 2570 public static int HIT_REDUCTION_DIV=4; 2571 calcApproxHitsCutoff(final int keys, final int hits, int currentCutoff, final boolean perfect)2572 private static final int calcApproxHitsCutoff(final int keys, final int hits, int currentCutoff, final boolean perfect){ //***$ 2573 assert(keys>=hits) : keys+", "+hits; 2574 assert(hits>=0); 2575 2576 int mahtk=MIN_APPROX_HITS_TO_KEEP; 2577 if(SEMIPERFECTMODE || PERFECTMODE){ 2578 if(keys==1){return 1;} 2579 else if(MIN_APPROX_HITS_TO_KEEP<keys){ 2580 mahtk++; 2581 if(currentCutoff==MIN_APPROX_HITS_TO_KEEP){currentCutoff++;} 2582 } 2583 } 2584 2585 int reduction=Tools.min(Tools.max((hits)/HIT_REDUCTION_DIV, MAX_HITS_REDUCTION2), Tools.max(MAXIMUM_MAX_HITS_REDUCTION, keys/8)); 2586 assert(reduction>=0); 2587 int r=hits-reduction; 2588 2589 r=Tools.max(mahtk, currentCutoff, r); 2590 2591 if(perfect){ 2592 r=Tools.max(r, keys-MAX_HITS_REDUCTION_PERFECT); 2593 } 2594 return r; 2595 } 2596 2597 public static final boolean USE_SLOWALK3=true && USE_EXTENDED_SCORE; 2598 public static boolean PRESCAN_QSCORE=true && USE_EXTENDED_SCORE; //Decrease quality and increase speed 2599 public static final boolean FILTER_BY_QSCORE=true; //Slightly lower quality, but very fast. 2600 public static final float MIN_SCORE_MULT=(USE_AFFINE_SCORE ? 0.02f : USE_EXTENDED_SCORE ? .3f : 0.10f); //Fraction of max score to use as cutoff. Default 0.15, max is 1; lower is more accurate 2601 public static final float MIN_QSCORE_MULT=0.005f; //Fraction of max score to use as cutoff. Default 0.025, max is 1; lower is more accurate 2602 public static final float MIN_QSCORE_MULT2=0.005f; 2603 static final float DYNAMIC_SCORE_THRESH=(USE_AFFINE_SCORE ? 0.64f : USE_EXTENDED_SCORE ? .74f : 0.6f); //Default .85f; lower is more accurate 2604 static final float DYNAMIC_QSCORE_THRESH=0.6f; //default .58f 2605 static final float DYNAMIC_QSCORE_THRESH_PERFECT=0.8f; //***$ 2606 static final float PRESCAN_QSCORE_THRESH=DYNAMIC_QSCORE_THRESH*.95f; //default 1.0f; lower is more accurate and 0 essentially sets PRESCAN_QSCORE=false 2607 static{ 2608 assert(MIN_SCORE_MULT>=0 && MIN_SCORE_MULT<1); 2609 assert(DYNAMIC_SCORE_THRESH>=0 && DYNAMIC_SCORE_THRESH<1); 2610 } 2611 2612 2613 } 2614