1 package align2; 2 3 import java.util.ArrayList; 4 import java.util.Arrays; 5 import java.util.HashMap; 6 7 import dna.AminoAcid; 8 import dna.Data; 9 import shared.KillSwitch; 10 import shared.Shared; 11 import shared.Tools; 12 import stream.SiteScore; 13 import structures.LongM; 14 15 16 /** 17 * Based on Index11f 18 * Index stored in single array per block. 19 * 20 * 21 * 22 * @author Brian Bushnell 23 * @date Dec 22, 2012 24 * 25 */ 26 public final class BBIndex extends AbstractIndex { 27 28 main(String[] args)29 public static void main(String[] args){ 30 31 int k=13; 32 33 for(int i=0; i<args.length; i++){ 34 String s=args[i].toLowerCase(); 35 if(s.contains("=")){ 36 String[] split=s.split("="); 37 String a=split[0]; 38 String b=split[1]; 39 if(a.equals("build") || a.equals("b")){ 40 Data.setGenome(Integer.parseInt(b)); 41 }else if(a.equals("minchrom")){ 42 MINCHROM=Integer.parseInt(b); 43 }else if(a.equals("maxchrom")){ 44 MAXCHROM=Integer.parseInt(b); 45 }else if(a.equals("keylen") || a.equals("k")){ 46 k=Integer.parseInt(b); 47 } 48 } 49 } 50 51 if(MINCHROM==-1){MINCHROM=1;} 52 if(MAXCHROM==-1){ 53 assert(Data.numChroms<=Byte.MAX_VALUE) : "TODO"; 54 MAXCHROM=Data.numChroms; 55 } 56 57 58 System.err.println("Writing build "+Data.GENOME_BUILD+" "+ 59 "BASESPACE index, keylen="+k+", chrom bits="+NUM_CHROM_BITS); 60 61 62 int first=(NUM_CHROM_BITS==0 ? 1 : 0); 63 64 65 Data.sysout.println("Loading index for chunk "+first+"-"+MAXCHROM+", build "+Data.GENOME_BUILD); 66 index=IndexMaker4.makeIndex(Data.GENOME_BUILD, first, MAXCHROM, 67 k, NUM_CHROM_BITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, true, false, index); 68 69 70 System.err.println("Finished all chroms, may still be writing."); 71 } 72 73 BBIndex(int k_, int minChrom_, int maxChrom_, int kfilter_, MSA msa_)74 public BBIndex(int k_, int minChrom_, int maxChrom_, int kfilter_, MSA msa_){ 75 super(k_, kfilter_, BASE_HIT_SCORE, minChrom_, maxChrom_, msa_); 76 INV_BASE_KEY_HIT_SCORE=1f/BASE_KEY_HIT_SCORE; 77 INDEL_PENALTY=(BASE_KEY_HIT_SCORE/2)-1; //default (HIT_SCORE/2)-1 78 INDEL_PENALTY_MULT=20; //default 20; penalty for indel length 79 MAX_PENALTY_FOR_MISALIGNED_HIT=BASE_KEY_HIT_SCORE-(1+BASE_KEY_HIT_SCORE/8); 80 SCOREZ_1KEY=Z_SCORE_MULT*KEYLEN; 81 { 82 int cyc=0; 83 for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){cyc+=2;} 84 cycles=cyc; 85 } 86 prescoreArray=new int[cycles]; 87 precountArray=new int[cycles]; 88 } 89 90 /** Load or generate index from minChrom to maxChrom, inclusive, with keylength k. 91 * This range can encompass multiple blocks. 92 * Should only be called once in a process. */ loadIndex(int minChrom, int maxChrom, int k, boolean writeToDisk, boolean diskInvalid)93 public static final synchronized void loadIndex(int minChrom, int maxChrom, int k, boolean writeToDisk, boolean diskInvalid){ 94 if(minChrom<1){minChrom=1;} 95 if(maxChrom>Data.numChroms){maxChrom=Data.numChroms;} 96 assert(minChrom<=maxChrom) : minChrom+", "+maxChrom+"\nThe reference file appears to be empty."; 97 Data.sysout.println("Loading index for chunk "+minChrom+"-"+maxChrom+", build "+Data.GENOME_BUILD); 98 index=IndexMaker4.makeIndex(Data.GENOME_BUILD, minChrom, maxChrom, 99 k, NUM_CHROM_BITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, writeToDisk, diskInvalid, index); 100 } 101 102 /** Calculate statistics of index, such as list lengths, and find clumpy keys */ analyzeIndex(int minChrom, int maxChrom, float fractionToExclude, int k)103 public static final synchronized void analyzeIndex(int minChrom, int maxChrom, float fractionToExclude, int k){ 104 assert(lengthHistogram==null); 105 assert(COUNTS==null); 106 107 int KEYSPACE=1<<(2*k); 108 COUNTS=new int[KEYSPACE]; 109 maxChrom=maxChrom(maxChrom); 110 111 HashMap<Integer, LongM> cmap=new HashMap<Integer, LongM>(); 112 113 for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ 114 Block b=index[chrom]; 115 final int[] sites=b.sites; 116 final int[] starts=b.starts; 117 118 for(int key=0; key<KEYSPACE; key++){ 119 120 long clumps=0; 121 122 final int start1=starts[key]; 123 final int stop1=starts[key+1]; 124 final int len1=stop1-start1; 125 COUNTS[key]=Tools.min(Integer.MAX_VALUE, COUNTS[key]+len1); 126 127 if(REMOVE_CLUMPY){ 128 for(int i=start1+1; i<stop1; i++){ 129 int dif=sites[i]-sites[i-1]; 130 assert(dif!=0); 131 if(dif>0 && dif<=CLUMPY_MAX_DIST){ 132 clumps++; 133 } 134 } 135 if(clumps>0){ 136 final int x=Tools.min(key, AminoAcid.reverseComplementBinaryFast(key, k)); 137 final Integer ko=x; 138 LongM lm=cmap.get(ko); 139 if(lm==null){ 140 lm=new LongM(0); 141 cmap.put(ko, lm); 142 } 143 lm.increment(clumps); 144 } 145 } 146 } 147 } 148 149 for(int key=0; key<COUNTS.length; key++){ 150 int rkey=AminoAcid.reverseComplementBinaryFast(key, k); 151 if(key<rkey){ 152 int x=(int)Tools.min(Integer.MAX_VALUE, COUNTS[key]+(long)COUNTS[rkey]); 153 COUNTS[key]=COUNTS[rkey]=x; 154 } 155 } 156 157 if(REMOVE_CLUMPY){ 158 Integer[] keys=cmap.keySet().toArray(new Integer[cmap.size()]); 159 Arrays.sort(keys); 160 161 for(Integer key : keys){ 162 long clumps=cmap.get(key).value(); 163 long len=COUNTS[key]; 164 if((len>CLUMPY_MIN_LENGTH_INDEX && clumps>CLUMPY_FRACTION*len)/* || (len>8*CLUMPY_MIN_LENGTH_INDEX && clumps>.75f*CLUMPY_FRACTION*len)*/){ 165 int rkey=AminoAcid.reverseComplementBinaryFast(key, k); 166 assert(key<=rkey); 167 assert(key==KeyRing.reverseComplementKey(rkey, k)); 168 COUNTS[key]=0; 169 COUNTS[rkey]=0; 170 } 171 } 172 } 173 174 lengthHistogram=Tools.makeLengthHistogram3(COUNTS, 1000, verbose2); 175 176 //if(verbose2){System.err.println("lengthHistogram: "+Arrays.toString(lengthHistogram));} 177 178 if(REMOVE_FREQUENT_GENOME_FRACTION){ 179 180 int lengthLimitIndex=(int)((1-fractionToExclude)*(lengthHistogram.length-1)); 181 int lengthLimitIndex2=(int)((1-fractionToExclude*DOUBLE_SEARCH_THRESH_MULT)*(lengthHistogram.length-1)); 182 183 MAX_USABLE_LENGTH=Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex]); 184 MAX_USABLE_LENGTH2=Tools.max(6*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex2]); 185 186 if(verbose2){System.err.println("MAX_USABLE_LENGTH: "+MAX_USABLE_LENGTH+"\nMAX_USABLE_LENGTH2: "+MAX_USABLE_LENGTH2);} 187 } 188 189 Solver.POINTS_PER_SITE=(int)Math.floor((Solver.BASE_POINTS_PER_SITE*4000f)/Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])); 190 if(Solver.POINTS_PER_SITE==0){Solver.POINTS_PER_SITE=-1;} 191 if(verbose2){System.err.println("POINTS_PER_SITE: "+Solver.POINTS_PER_SITE);} 192 assert(Solver.POINTS_PER_SITE<0) : Solver.POINTS_PER_SITE; 193 } 194 195 196 /** Returns the filename for the block holding this chrom */ 197 public static final String fname(int chrom, int k){ 198 return IndexMaker4.fname(minChrom(chrom), maxChrom(chrom), k, NUM_CHROM_BITS); 199 } 200 201 /** Ensure key offsets are strictly ascending. */ 202 private static boolean checkOffsets(int[] offsets){ 203 for(int i=1; i<offsets.length; i++){ 204 if(offsets[i]<=offsets[i-1]){return false;} 205 } 206 return true; 207 } 208 209 @Deprecated 210 private final int trimExcessHitLists(int[] keys, int[][] hits){ 211 212 assert(false) : "Needs to be redone because hits are no longer sorted by length."; 213 214 assert(hits.length==keys.length); 215 // assert(false) : "modify this function so that it gives more weight to trimming lists over highly covered baits"; 216 //And also, incorporate the "remove the longest list" function 217 218 final int limit=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])*keys.length; 219 final int limit2=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH2]); 220 final int limit3=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_SHORTEST_LIST_TO_SEARCH]); 221 222 int sum=0; 223 int initialHitCount=0; 224 225 int shortest=Integer.MAX_VALUE-1; 226 int shortest2=Integer.MAX_VALUE; 227 228 for(int i=0; i<keys.length; i++){ 229 int key=keys[i]; 230 int x=count(key); 231 sum+=x; 232 initialHitCount+=(x==0 ? 0 : 1); 233 if(x>0){ 234 if(x<shortest2){ 235 shortest2=x; 236 if(shortest2<shortest){ 237 shortest2=shortest; 238 shortest=x; 239 } 240 } 241 } 242 } 243 assert(shortest2>=shortest); 244 if(initialHitCount<MIN_APPROX_HITS_TO_KEEP){return initialHitCount;} 245 if(shortest>limit3 && !SLOW){ 246 for(int i=0; i<hits.length; i++){hits[i]=null;} 247 return 0; 248 } 249 if(sum<=limit && sum/initialHitCount<=limit2){return initialHitCount;} 250 251 Pointer[] ptrs=Pointer.loadMatrix(hits); 252 // ptrs[0].value/=2; 253 // ptrs[ptrs.length-1].value/=2; 254 Arrays.sort(ptrs); 255 256 int finalHitCount=initialHitCount; 257 for(int i=ptrs.length-1; sum>limit || sum/finalHitCount>limit2; i--){ 258 Pointer p=ptrs[i]; 259 sum-=hits[p.key].length; 260 hits[p.key]=null; 261 finalHitCount--; 262 } 263 264 return finalHitCount; 265 } 266 267 /** Remove least useful keys to accelerate search */ 268 private final int trimExcessHitListsByGreedy(int[] offsets, int[] keyScores, int maxHitLists, int[] keys){ 269 270 float[] keyWeights=getKeyWeightArray(keyScores.length); 271 for(int i=0; i<keyScores.length; i++){ 272 keyWeights[i]=keyScores[i]*INV_BASE_KEY_HIT_SCORE; 273 } 274 275 // assert(false) : "modify this function so that it gives more weight to trimming lists over highly covered baits"; 276 //And also, incorporate the "remove the longest list" function 277 278 //Remove lists until the length sum is below this value 279 final int limit=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])*keys.length; 280 final int limit2=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH2]); 281 282 //Do not search if the shortest list is longer than this 283 final int limit3=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_SHORTEST_LIST_TO_SEARCH]); 284 // final int limitS=lengthHistogram[chrom][MAX_SINGLE_LIST_TO_SEARCH]; 285 286 int sum=0; 287 int initialHitCount=0; 288 289 //Shortest list found 290 int shortest=Integer.MAX_VALUE-1; 291 //Second shortest list found 292 int shortest2=Integer.MAX_VALUE; 293 294 // for(int i=0; i<hits.length; i++){ 295 // if(hits[i]!=null && hits[i].length>limitS){hits[i]=null;} 296 // } 297 298 final int[] lengths=getGenericArray(keys.length); 299 300 //Find shortest lists 301 for(int i=0; i<keys.length; i++){ 302 int key=keys[i]; 303 int x=count(key); 304 lengths[i]=x; 305 sum+=x; 306 initialHitCount+=(x==0 ? 0 : 1); 307 if(x>0){ 308 if(x<shortest2){ 309 shortest2=x; 310 if(shortest2<shortest){ 311 shortest2=shortest; 312 shortest=x; 313 } 314 } 315 } 316 } 317 // assert(false) : limit+", "+limit2+", "+limit3+", "+shortest2+", "+shortest+", "+initialHitCount+", "+MIN_APPROX_HITS_TO_KEEP+"\n"+Arrays.toString(lengths); 318 assert(shortest2>=shortest); 319 if(initialHitCount<MIN_APPROX_HITS_TO_KEEP){return initialHitCount;} 320 321 //Don't search if the shortest list is too long 322 if(shortest>limit3 && !SLOW){ 323 for(int i=0; i<keys.length; i++){keys[i]=-1;} 324 return 0; 325 } 326 327 int hitsCount=initialHitCount; 328 int worstValue=Integer.MIN_VALUE; 329 330 while(hitsCount>=MIN_APPROX_HITS_TO_KEEP && (sum>limit || sum/initialHitCount>limit2 || hitsCount>maxHitLists/* || worstValue<0*/)){ 331 final int[] lists=getGreedyListArray(hitsCount); 332 for(int i=0, j=0; j<lists.length; i++){ 333 if(lengths[i]>0){ 334 lists[j]=i; 335 j++; 336 } 337 } 338 339 Solver.findWorstGreedy(offsets, lengths, keyWeights, KEYLEN, lists, greedyReturn); 340 int worstIndex=greedyReturn[0]; 341 int worst=lists[worstIndex]; 342 worstValue=greedyReturn[1]; 343 sum-=lengths[worst]; 344 345 // if(worstValue>0 && (hitsCount<=maxHitLists || lengths[worst]<excessListLimit)){return hitsCount;} 346 if(worstValue>0 || lengths[worst]<SMALL_GENOME_LIST){return hitsCount;} //This line increases accuracy at expense of speed. Lower constant = more accurate, default 0. 347 hitsCount--; 348 lengths[worst]=0; 349 keys[worst]=-1; 350 } 351 return hitsCount; 352 } 353 354 355 private final int getHits(final int[] keys, final int chrom, final int maxLen, final int[] starts, final int[] stops){ 356 int numHits=0; 357 final Block b=index[chrom]; 358 for(int i=0; i<keys.length; i++){ 359 final int key=keys[i]; 360 starts[i]=-1; 361 stops[i]=-1; 362 if(key>=0){ 363 final int len=count(key); 364 if(len>0 && len<maxLen){ 365 final int len2=b.length(key); 366 if(len2>0){ 367 starts[i]=b.starts[key]; 368 stops[i]=starts[i]+len2; 369 numHits++; 370 } 371 } 372 } 373 } 374 return numHits; 375 } 376 377 378 private final int countHits(final int[] keys, final int maxLen, boolean clearBadKeys){ 379 int numHits=0; 380 for(int i=0; i<keys.length; i++){ 381 final int key=keys[i]; 382 if(key>=0){ 383 final int len=count(key); 384 // System.err.println(len); 385 if(len>0 && len<maxLen){ 386 numHits++; 387 }else if(clearBadKeys){ 388 keys[i]=-1; 389 } 390 } 391 } 392 return numHits; 393 } 394 395 396 @Override 397 public final ArrayList<SiteScore> findAdvanced(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsets, long id){ 398 assert(minChrom<=maxChrom && minChrom>=0); 399 ArrayList<SiteScore> result=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, true, id); 400 if(DOUBLE_SEARCH_NO_HIT && (result==null || result.isEmpty())){result=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, false, id);} 401 402 return result; 403 } 404 405 406 public final ArrayList<SiteScore> find(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsetsP, boolean obeyLimits, long id){ 407 408 assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); 409 final int[] keysOriginal=KeyRing.makeKeys(basesP, offsetsP, KEYLEN); 410 int[] keysP=Arrays.copyOf(keysOriginal, keysOriginal.length); 411 412 initialKeys+=offsetsP.length; 413 initialKeyIterations++; 414 415 final int maxLen=(obeyLimits ? MAX_USABLE_LENGTH : MAX_USABLE_LENGTH2); 416 417 int numHits=0; 418 numHits=countHits(keysP, maxLen, true); 419 420 if(verbose){ 421 System.err.println("initial hits: "+numHits); 422 System.err.println("initial keys: "+keysP.length+"\n"+Arrays.toString(keysP)); 423 } 424 425 if(numHits>0){ //TODO: Change these to higher numbers 426 int trigger=(3*keysP.length)/4; 427 if(numHits<4 && numHits<trigger){ 428 for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];} 429 numHits=countHits(keysP, (maxLen*3)/2, true); 430 } 431 if(numHits<3 && numHits<trigger){ 432 for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];} 433 numHits=countHits(keysP, maxLen*2, true); 434 } 435 if(numHits<3 && numHits<trigger){ 436 for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];} 437 numHits=countHits(keysP, maxLen*3, true); 438 } 439 if(numHits<2 && numHits<trigger){ 440 for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];} 441 numHits=countHits(keysP, maxLen*5, true); 442 } 443 } 444 // assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); 445 if(numHits<keysP.length){ 446 int[][] r=shrink2(offsetsP, keysP, keyScoresP); 447 assert(r!=null); 448 if(r!=null){ 449 offsetsP=r[0]; 450 keysP=r[1]; 451 keyScoresP=r[2]; 452 } 453 }else{ 454 assert(shrink2(offsetsP, keysP, keyScoresP)==null); 455 } 456 initialKeys2+=numHits; 457 //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); 458 459 // assert(checkOffsets(offsets)) : Arrays.toString(offsets); 460 if(TRIM_BY_GREEDY && obeyLimits){ 461 int maxLists=Tools.max((int)(HIT_FRACTION_TO_RETAIN*keysP.length), MIN_HIT_LISTS_TO_RETAIN); 462 numHits=trimExcessHitListsByGreedy(offsetsP, keyScoresP, maxLists, keysP); 463 } 464 // System.out.println("After greedy: numHits = "+numHits); 465 466 if(verbose){ 467 System.err.println("final hits: "+numHits); 468 System.err.println("final keys: "+keysP.length+"\n"+Arrays.toString(keysP)); 469 } 470 471 if(TRIM_BY_TOTAL_SITE_COUNT && obeyLimits){ 472 throw new RuntimeException("Needs to be redone."); 473 // numHits=trimExcessHitLists(keys, hits); 474 } 475 476 if(TRIM_LONG_HIT_LISTS && obeyLimits && numHits>MIN_APPROX_HITS_TO_KEEP){ 477 int cutoffIndex=((int) (HIT_FRACTION_TO_RETAIN*(keysP.length)-0.01f))+(keysP.length-numHits); 478 479 int zeroes=keysP.length-numHits; 480 int altMinIndex=(zeroes+(MIN_HIT_LISTS_TO_RETAIN-1)); 481 cutoffIndex=Tools.max(cutoffIndex, altMinIndex); 482 483 assert(cutoffIndex>0) : cutoffIndex+"\n"+numHits; 484 485 if(cutoffIndex<(keysP.length-1)){ 486 int[] lens=getGenericArray(keysP.length); 487 for(int i=0; i<keysP.length; i++){lens[i]=count(keysP[i]);} 488 Arrays.sort(lens); 489 int cutoff=lens[cutoffIndex]; 490 491 cutoff=Tools.max(lengthHistogram[MIN_INDEX_TO_DROP_LONG_HIT_LIST], cutoff); 492 493 int removed=0; 494 495 for(int i=0; i<keysP.length; i++){ 496 int key=keysP[i]; 497 if(count(key)>cutoff){ 498 keysP[i]=-1; 499 removed++; 500 numHits--; 501 } 502 } 503 } 504 } 505 // assert(checkOffsets(offsets)) : Arrays.toString(offsets); 506 507 final ArrayList<SiteScore> result=new ArrayList<SiteScore>(8); 508 if(numHits<MIN_APPROX_HITS_TO_KEEP){return result;} 509 //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); 510 if(numHits<keysP.length){ 511 int[][] r=shrink2(offsetsP, keysP, keyScoresP); 512 assert(r!=null); 513 if(r!=null){ 514 offsetsP=r[0]; 515 keysP=r[1]; 516 keyScoresP=r[2]; 517 } 518 }else{ 519 assert(shrink2(offsetsP, keysP, keyScoresP)==null); 520 } 521 assert(keysP.length==numHits); 522 //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); 523 //Reverse the offsets for minus-strand mapping, since they are generated based on quality 524 int[] offsetsM=KeyRing.reverseOffsets(offsetsP, KEYLEN, basesP.length); 525 if(verbose){ 526 System.err.println("Reversed offsets: \n"+Arrays.toString(offsetsP)+" ->\n"+Arrays.toString(offsetsM)); 527 } 528 final int[] keysM=KeyRing.reverseComplementKeys(keysP, KEYLEN); 529 530 // assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); 531 // assert(checkOffsets(offsetsM)) : Arrays.toString(offsetsM); 532 533 assert(!USE_EXTENDED_SCORE || (baseScoresP!=null && (qual==null || baseScoresP.length==qual.length))); 534 assert(keyScoresP!=null); 535 assert(keyScoresP.length==offsetsP.length) : keyScoresP.length+", "+offsetsP.length+", "+Arrays.toString(keyScoresP); 536 final byte[] baseScoresM=Tools.reverseAndCopy(baseScoresP, getBaseScoreArray(baseScoresP.length, 1)); 537 final int[] keyScoresM=Tools.reverseAndCopy(keyScoresP, getKeyScoreArray(keyScoresP.length, 1)); 538 final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP); 539 540 assert(offsetsM.length==offsetsP.length); 541 assert(maxQuickScore==maxQuickScore(offsetsM, keyScoresM)); 542 543 /* 544 * bestScores: 545 * 546 * bestScores[0] currentTopScore 547 * bestScores[1] maxHits 548 * bestScores[2] qcutoff 549 * bestScores[3] bestqscore 550 * bestScores[4] maxQuickScore 551 * bestScores[5] perfectsFound 552 */ 553 final int[] bestScores=KillSwitch.allocInt1D(6); 554 555 //This prevents filtering by qscore when a low-quality read only uses a few keys. 556 //In that case, extending is more important. 557 final boolean prescan_qscore=(PRESCAN_QSCORE && numHits>=5); 558 559 int[][] prescanResults=null; 560 int[] precounts=null; 561 int[] prescores=null; 562 563 int hitsCutoff=0; 564 int qscoreCutoff=(int)(MIN_QSCORE_MULT*maxQuickScore); 565 566 boolean allBasesCovered=true; 567 { 568 if(offsetsP[0]!=0){allBasesCovered=false;} 569 else if(offsetsP[offsetsP.length-1]!=(basesP.length-KEYLEN)){allBasesCovered=false;} 570 else{ 571 for(int i=1; i<offsetsP.length; i++){ 572 if(offsetsP[i]>offsetsP[i-1]+KEYLEN){ 573 allBasesCovered=false; 574 break; 575 } 576 } 577 } 578 } 579 580 //TODO I don't understand this logic 581 final boolean pretendAllBasesAreCovered=//false; 582 (allBasesCovered || 583 keysP.length>=keysOriginal.length-4 || 584 (keysP.length>=9 && (offsetsP[offsetsP.length-1]-offsetsP[0]+KEYLEN)>Tools.max(40, (int)(basesP.length*.75f)))); 585 586 // System.err.println(allBasesCovered+"\t"+Arrays.toString(offsetsP)); 587 // assert(allBasesCovered); 588 589 if(prescan_qscore){ 590 prescanResults=prescanAllBlocks(bestScores, 591 keysP, keyScoresP, offsetsP, 592 keysM, keyScoresM, offsetsM, 593 pretendAllBasesAreCovered); 594 595 if(prescanResults!=null){ 596 precounts=prescanResults[0]; 597 prescores=prescanResults[1]; 598 } 599 600 if(bestScores[1]<MIN_APPROX_HITS_TO_KEEP){return result;} 601 if(bestScores[3]<maxQuickScore*MIN_QSCORE_MULT2){return result;} //if(bestScores[3]<maxQuickScore(offsetsP, keyScoresP)*.10f){return result;} 602 603 if(bestScores[3]>=maxQuickScore && pretendAllBasesAreCovered){ 604 assert(bestScores[3]==maxQuickScore); 605 assert(bestScores[1]==numHits); 606 607 hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, true); 608 qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*DYNAMIC_QSCORE_THRESH_PERFECT)); 609 }else{ 610 hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, false); 611 qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*PRESCAN_QSCORE_THRESH)); 612 } 613 } 614 615 final int maxScore=maxScore(offsetsP, baseScoresP, keyScoresP, basesP.length, true); 616 final boolean fullyDefined=AminoAcid.isFullyDefined(basesP); 617 assert(bestScores[2]<=0) : Arrays.toString(bestScores); 618 619 int cycle=0; 620 for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ 621 if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){ 622 find(keysP, basesP, baseScoresP, keyScoresP, chrom, Shared.PLUS, 623 offsetsP, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined); 624 } 625 if(QUIT_AFTER_TWO_PERFECTS){ 626 if(bestScores[5]>=2){break;} //if(bestScores[5]>=3 || (bestScores[5]>=2 && chrom<24)){break;} //for human 627 } 628 cycle++; 629 if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){ 630 find(keysM, basesM, baseScoresM, keyScoresM, chrom, Shared.MINUS, 631 offsetsM, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined); 632 } 633 if(QUIT_AFTER_TWO_PERFECTS){ 634 if(bestScores[5]>=2){break;} //if(bestScores[5]>=3 || (bestScores[5]>=2 && chrom<24)){break;} //for human 635 } 636 cycle++; 637 } 638 639 // assert(Read.CHECKSITES(result, basesP, basesM, id, false)); //TODO: Comment out once checked 640 641 return result; 642 } 643 644 /** Search blocks rapidly to find max hits, and perfect sites. May indicate some blocks can be skipped. */ 645 private final int[][] prescanAllBlocks(int[] bestScores, 646 int[] keysP, int[] keyScoresP, int[] offsetsP, 647 int[] keysM, int[] keyScoresM, int[] offsetsM, 648 final boolean allBasesCovered){ 649 650 int[][][] pm=new int[][][] {{keysP, keyScoresP, offsetsP}, {keysM, keyScoresM, offsetsM}}; 651 652 int bestqscore=0; 653 int maxHits=0; 654 int minHitsToScore=MIN_APPROX_HITS_TO_KEEP; 655 656 final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP); 657 658 final int[] counts=precountArray; 659 final int[] scores=prescoreArray; 660 final int[][] ret=prescanReturn; 661 Arrays.fill(counts, keysP.length); 662 Arrays.fill(scores, maxQuickScore); 663 ret[0]=counts; 664 ret[1]=scores; 665 666 int cycle=0; 667 for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ 668 final int baseChrom=baseChrom(chrom); 669 for(int pmi=0; pmi<2; pmi++, cycle++){ 670 671 int[] keys=pm[pmi][0]; 672 int[] keyScores=pm[pmi][1]; 673 int[] offsets=pm[pmi][2]; 674 // int[][] hits=getHitArray(offsets.length); 675 676 int[] starts=startArray; 677 int[] stops=stopArray; 678 int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops); 679 680 if(numHits<minHitsToScore){ 681 scores[cycle]=-9999; 682 counts[cycle]=0; 683 }else{ 684 685 // final int maxQuickScore=maxQuickScore(offsets, keyScores); 686 // System.err.println("maxScore = "+maxScore); 687 688 if(numHits<keys.length){ 689 int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length); 690 if(r!=null){ 691 starts=r[0]; 692 stops=r[1]; 693 offsets=r[2]; 694 keyScores=r[4]; 695 } 696 } 697 698 assert(numHits==offsets.length); 699 assert(numHits==keyScores.length); 700 heap.clear(); 701 final Quad[] triples=tripleStorage; 702 final int[] values=valueArray; 703 704 int[] temp=findMaxQscore2(starts, stops, offsets, keyScores, baseChrom, triples, values, minHitsToScore, true, 705 bestqscore>=maxQuickScore && allBasesCovered); 706 707 scores[cycle]=temp[0]; 708 counts[cycle]=temp[1]; 709 710 bestqscore=Tools.max(temp[0], bestqscore); 711 maxHits=Tools.max(maxHits, temp[1]); 712 if(bestqscore>=maxQuickScore && allBasesCovered){ 713 assert(bestqscore==maxQuickScore); 714 assert(maxHits==keysP.length) : 715 "\nTemp: \t"+Arrays.toString(temp)+", cycle="+cycle+"\n" + 716 "Scores: \t"+Arrays.toString(scores)+ 717 "Counts: \t"+Arrays.toString(counts)+ 718 "bestqscore: \t"+bestqscore+ 719 "maxHits: \t"+maxHits+ 720 "maxQuickScore: \t"+maxQuickScore+ 721 "numHits: \t"+numHits+ 722 "minHitsToScore: \t"+minHitsToScore+ 723 "keys.length: \t"+keys.length; 724 725 minHitsToScore=Tools.max(minHitsToScore, maxHits); 726 727 { 728 //This early exit is optional. Does not seem to impact speed much either way. 729 bestScores[1]=Tools.max(bestScores[1], maxHits); 730 bestScores[3]=Tools.max(bestScores[3], bestqscore); 731 return ret; 732 } 733 } 734 } 735 } 736 } 737 738 bestScores[1]=Tools.max(bestScores[1], maxHits); 739 bestScores[3]=Tools.max(bestScores[3], bestqscore); 740 741 if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;} 742 743 return ret; 744 } 745 746 747 /** Search a single block and strand */ 748 public final ArrayList<SiteScore> find(int[] keys, final byte[] bases, final byte[] baseScores, int[] keyScores, 749 final int chrom, final byte strand, 750 int[] offsets, final boolean obeyLimits, ArrayList<SiteScore> ssl, int[] bestScores, 751 final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){ 752 753 assert(checkOffsets(offsets)) : Arrays.toString(offsets); 754 755 //Index of first location of each key 756 int[] starts=startArray; 757 //Index of first location of next key (i.e., (last location of key)+1) 758 int[] stops=stopArray; 759 760 int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops); 761 if(numHits<MIN_APPROX_HITS_TO_KEEP){return ssl;} 762 // assert(false) : "\n"+Data.getChromosome(1).minIndex+"\n"+Data.getChromosome(1).maxIndex+"\n"+Data.getChromosome(1).array.length; 763 // assert(false) : "\n"+Data.getChromosome(1).minIndex+"\n"+Data.getChromosome(1).maxIndex+"\n"+Data.getChromosome(1).array.length+"\n"+(char)Data.getChromosome(1).array[0]; 764 // assert(false) : numHits+"\n"+Arrays.toString(starts)+"\n"+Arrays.toString(stops)+"\n"+Arrays.toString(index[0].getHitList(starts[0], stops[0]))+"\n"; 765 if(USE_CAMELWALK){ 766 if(USE_SLOWALK3){ 767 if(!RETAIN_BEST_SCORES){Arrays.fill(bestScores, 0);} 768 ssl=camelWalk3(starts, stops, bases, baseScores, keyScores, offsets, chrom, strand, obeyLimits, ssl, bestScores, allBasesCovered, maxScore, fullyDefined); 769 }else{ 770 assert(false) : "TODO"; 771 ssl=slowWalk2(starts, stops, bases, baseScores, keyScores, offsets, chrom, strand, obeyLimits, ssl, fullyDefined); 772 } 773 }else{ 774 if(USE_SLOWALK3){ 775 if(!RETAIN_BEST_SCORES){Arrays.fill(bestScores, 0);} 776 ssl=slowWalk3(starts, stops, bases, baseScores, keyScores, offsets, chrom, strand, obeyLimits, ssl, bestScores, allBasesCovered, maxScore, fullyDefined); 777 }else{ 778 ssl=slowWalk2(starts, stops, bases, baseScores, keyScores, offsets, chrom, strand, obeyLimits, ssl, fullyDefined); 779 } 780 } 781 782 return ssl; 783 } 784 785 /** Compress arrays by removing null/empty lists */ 786 private final int[][] shrink(int[] starts, int[] stops, int[] offsets, int[] keyScores, final int len){ 787 int numHits=0; 788 for(int i=0; i<len; i++){ 789 if(starts[i]>=0){numHits++;} 790 } 791 792 if(numHits==offsets.length){ 793 return null; 794 } 795 int[][] r=shrinkReturn3; 796 int[] starts2=startArray; 797 int[] stops2=stopArray; 798 int[] offsets2=getOffsetArray(numHits); 799 int[] keyScores2=new int[numHits]; 800 801 for(int i=0, j=0; i<len; i++){ 802 if(starts[i]>=0){ 803 starts2[j]=starts[i]; 804 stops2[j]=stops[i]; 805 offsets2[j]=offsets[i]; 806 keyScores2[j]=keyScores[i]; 807 j++; 808 } 809 } 810 r[0]=starts2; 811 r[1]=stops2; 812 r[2]=offsets2; 813 r[4]=keyScores2; 814 return r; 815 } 816 817 /** Removes "-1" keys. */ 818 private final int[][] shrink2(int[] offsets, int[] keys, int[] keyScores){ 819 820 821 int numHits=0; 822 for(int i=0; i<keys.length; i++){ 823 if(keys[i]>=0){numHits++;} 824 } 825 826 827 assert(checkOffsets(offsets)) : Arrays.toString(offsets); 828 if(numHits==keys.length){ 829 return null; 830 }else{ 831 int[][] r=shrinkReturn2; 832 int[] offsets2=getOffsetArray(numHits); 833 assert(offsets2!=offsets); 834 assert(offsets2.length<offsets.length); 835 int[] keys2=new int[numHits]; 836 int[] keyScores2=new int[numHits]; 837 838 for(int i=0, j=0; i<keys.length; i++){ 839 if(keys[i]>=0){ 840 offsets2[j]=offsets[i]; 841 keys2[j]=keys[i]; 842 keyScores2[j]=keyScores[i]; 843 j++; 844 } 845 } 846 assert(checkOffsets(offsets2)) : "\nnumHits="+numHits+"\n"+Arrays.toString(offsets)+" -> \n"+Arrays.toString(offsets2)+"\n"+ 847 "\n"+Arrays.toString(keys)+" -> \n"+Arrays.toString(keys2)+"\n"; 848 r[0]=offsets2; 849 r[1]=keys2; 850 r[2]=keyScores2; 851 return r; 852 } 853 } 854 855 856 /** This uses a heap to track next column to increment */ 857 private final ArrayList<SiteScore> slowWalk2(int[] starts, int[] stops, final byte[] bases, 858 final byte[] baseScores, int[] keyScores, int[] offsets, 859 final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList<SiteScore> ssl, final boolean fullyDefined){ 860 861 if(SHRINK_BEFORE_WALK){ 862 int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length); 863 if(r!=null){ 864 starts=r[0]; 865 stops=r[1]; 866 offsets=r[2]; 867 keyScores=r[4]; 868 } 869 } 870 871 final int numHits=offsets.length; //After shrink 872 873 assert(numHits==offsets.length); 874 assert(numHits==keyScores.length); 875 876 assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z)); 877 878 //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc. 879 final int maxScore=maxScore(offsets, baseScores, keyScores, bases.length, true); 880 // final int maxQuickScore=(!USE_EXTENDED_SCORE ? maxScore : maxQuickScore(offsets)); 881 // System.err.println("maxScore = "+maxScore); 882 883 // final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore)); 884 final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore)); 885 // final int minQuickScore=(!USE_EXTENDED_SCORE ? minScore : (int)(maxQuickScore*0.15f)); 886 // final int minScore=(int)(MIN_SCORE_MULT*maxScore); 887 // System.err.println("minScore = "+minScore); 888 889 final int baseChrom=baseChrom(baseChrom_); 890 891 892 heap.clear(); 893 final Quad[] triples=tripleStorage; 894 895 final Block b=index[baseChrom]; 896 final int[] values=valueArray; 897 final int[] sizes=sizeArray; 898 final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null); 899 900 for(int i=0; i<numHits; i++){ 901 final int[] sites=b.sites; 902 final int start=starts[i]; 903 sizes[i]=b.length(start, stops[i]); 904 assert(sizes[i]>0); 905 906 int a=sites[start]; 907 int a2; 908 if((a&SITE_MASK)>=offsets[i]){ 909 a2=a-offsets[i]; 910 }else{ 911 int ch=numberToChrom(a, baseChrom); 912 int st=numberToSite(a); 913 int st2=Tools.max(st-offsets[i], 0); 914 a2=toNumber(st2, ch); 915 } 916 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); 917 918 Quad t=triples[i]; 919 assert(t!=null) : "Should be using tripleStorage"; 920 assert(i==t.column); 921 t.row=start; 922 t.site=a2; 923 t.list=sites; 924 values[i]=a2; 925 926 heap.add(t); 927 } 928 929 if(ssl==null){ssl=new ArrayList<SiteScore>(8);} 930 931 int currentTopScore=-999999999; 932 933 int cutoff=minScore; 934 935 int maxHits=0; 936 int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP; 937 938 // System.out.println("\nEntering SS loop:"); 939 // System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" + 940 // "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff); 941 // System.out.println(); 942 943 SiteScore prevSS=null; 944 while(!heap.isEmpty()){ 945 Quad t=heap.peek(); 946 final int site=t.site; 947 final int centerIndex=t.column; 948 949 int maxNearbySite=site; 950 int approxHits=0; 951 952 {//Inner loop 953 final int minsite=site-MAX_INDEL, maxsite=site+MAX_INDEL2; 954 for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){ 955 final int x=values[column]; 956 assert(x==triples[column].site); 957 if(x>=minsite && x<=maxsite){ 958 maxNearbySite=(x>maxNearbySite ? x : maxNearbySite); 959 approxHits++; 960 }else{chances--;} 961 } 962 } 963 hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++; 964 965 assert(centerIndex>=0) : centerIndex; 966 967 //I don't remember what this assertion was for or why, but it's causing trouble. 968 //assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; 969 if(approxHits>=approxHitsCutoff){ 970 971 int score; 972 973 int mapStart=site, mapStop=maxNearbySite; 974 975 if(USE_EXTENDED_SCORE){ 976 final int chrom=numberToChrom(site, baseChrom); 977 score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits); 978 if(true/*USE_AFFINE_SCORE*/){ 979 //Correct begin and end positions if they changed. 980 int min=Integer.MAX_VALUE; 981 int max=Integer.MIN_VALUE; 982 for(int i=0; i<locArray.length; i++){ 983 int x=locArray[i]; 984 if(x>-1){ 985 if(x<min){min=x;} 986 if(x>max){max=x;} 987 } 988 } 989 990 // assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger? 991 if(min<0 || max<0){ 992 //Note: This error can trigger if minChrom and maxChrom do not align to block boundaries 993 if(!Shared.anomaly){ 994 Shared.anomaly=true; 995 System.err.println("Anomaly in "+getClass().getName()+".slowWalk: "+ 996 chrom+", "+mapStart+", "+mapStop+", "+centerIndex+", "+ 997 Arrays.toString(locArray)+"\n"+ 998 Arrays.toString(values)+"\n"+ 999 new String(bases)+"\nstrand="+strand+"\n"); 1000 System.err.println(); 1001 } 1002 score=-99999; 1003 } 1004 1005 1006 mapStart=toNumber(min, chrom); 1007 mapStop=toNumber(max, chrom); 1008 1009 1010 1011 // System.err.println("site="+site+", maxNearbySite="+maxNearbySite+", min="+min+", max="+max+ 1012 // ", mapStart="+mapStart+", mapStop="+mapStop); 1013 1014 // if(chrom==17 && absdif(min, 30354420)<2000){ 1015 // System.err.println("\n*****\n"); 1016 // System.err.println("site="+site+" ("+numberToSite(site)+"), maxNearbySite="+maxNearbySite+ 1017 // " ("+numberToSite(maxNearbySite)+"), min="+min+", max="+max+ 1018 // ", mapStart="+mapStart+", mapStop="+mapStop); 1019 // System.err.println(); 1020 // System.err.println(Arrays.toString(locArray)); 1021 // System.err.println(); 1022 // System.err.println("chrom="+chrom); 1023 // System.err.println("score="+score); 1024 // } 1025 } 1026 }else{ 1027 score=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits); 1028 if(ADD_SCORE_Z){ 1029 int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); 1030 score+=scoreZ; 1031 } 1032 } 1033 1034 if(score>=cutoff){ 1035 1036 if(score>currentTopScore){ 1037 // System.err.println("New top score!"); 1038 1039 if(DYNAMICALLY_TRIM_LOW_SCORES){ 1040 1041 maxHits=Tools.max(approxHits, maxHits); 1042 // approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits); 1043 approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits-1); //More sensitive, but slower 1044 1045 cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH)); 1046 1047 // cutoff=Tools.max(cutoff, minScore+(int)((score-minScore)*DYNAMIC_SCORE_THRESH)); 1048 if(USE_EXTENDED_SCORE && score>=maxScore){ 1049 cutoff=Tools.max(cutoff, (int)(score*0.95f)); 1050 } 1051 } 1052 1053 currentTopScore=score; 1054 1055 // System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")"); 1056 } 1057 1058 final int chrom=numberToChrom(mapStart, baseChrom); 1059 final int site2=numberToSite(mapStart); 1060 final int site3=numberToSite(mapStop)+bases.length-1; 1061 1062 assert(NUM_CHROM_BITS==0 || site2<SITE_MASK-1000) : "chrom="+chrom+", strand="+strand+", site="+site+ 1063 ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length+ 1064 "\n\n"+Arrays.toString(b.getHitList(centerIndex)); 1065 assert(site2<site3) : "chrom="+chrom+", strand="+strand+", site="+site+ 1066 ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length; 1067 1068 1069 //Note: I could also do this as soon as score is calculated. 1070 // if(ADD_SCORE_Z){ 1071 // int scoreZ=scoreZ2(values, centerIndex, offsets); 1072 // score+=scoreZ; 1073 // } 1074 1075 //This block is optional, but tries to eliminate multiple identical alignments 1076 // SiteScore prevSS=(ssl.size()<1 ? null : ssl.get(ssl.size()-1)); 1077 1078 SiteScore ss=null; 1079 final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined; 1080 1081 int[] gapArray=null; 1082 if(site3-site2>=MINGAP+bases.length){ 1083 gapArray=makeGapArray(locArray, mapStart, MINGAP); 1084 if(gapArray!=null){ 1085 int sub=site2-mapStart;//thus site2=mapStart+sub 1086 for(int i=0; i<gapArray.length; i++){ 1087 gapArray[i]+=sub; 1088 } 1089 assert(gapArray[0]==mapStart) : gapArray[0]+", "+mapStart; 1090 assert(gapArray[gapArray.length-1]==mapStop); 1091 } 1092 assert(false) : Arrays.toString(locArray); 1093 } 1094 1095 if(gapArray==null && prevSS!=null && prevSS.gaps==null && 1096 prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){ 1097 1098 int betterScore=Tools.max(score, prevSS.score); 1099 int minStart=Tools.min(prevSS.start, site2); 1100 int maxStop=Tools.max(prevSS.stop, site3); 1101 final boolean perfect2=USE_EXTENDED_SCORE && prevSS.score==maxScore && fullyDefined; 1102 assert(!USE_EXTENDED_SCORE || perfect2==prevSS.perfect); 1103 1104 boolean shortEnough=(!LIMIT_SUBSUMPTION_LENGTH_TO_2X || (maxStop-minStart<2*bases.length)); 1105 1106 if(prevSS.start==site2 && prevSS.stop==site3){ 1107 prevSS.score=prevSS.quickScore=betterScore; 1108 }else if(SUBSUME_SAME_START_SITES && shortEnough && prevSS.start==site2){ 1109 if(perfect2){ 1110 //do nothing 1111 }else if(perfect1){ 1112 prevSS.setStop(site3); 1113 prevSS.setPerfect(); 1114 }else{ 1115 prevSS.setStop(maxStop); 1116 } 1117 prevSS.score=prevSS.quickScore=betterScore; 1118 }else if(SUBSUME_SAME_STOP_SITES && shortEnough && prevSS.stop==site3){ 1119 if(perfect2){ 1120 //do nothing 1121 }else if(perfect1){ 1122 prevSS.setStart(site2); 1123 prevSS.setPerfect(); 1124 }else{ 1125 prevSS.setStart(minStart); 1126 } 1127 prevSS.score=prevSS.quickScore=betterScore; 1128 }else if(SUBSUME_OVERLAPPING_SITES && shortEnough && (maxStop-minStart<=bases.length+MAX_SUBSUMPTION_LENGTH) 1129 && !perfect1 && !perfect2){ 1130 prevSS.setLimits(minStart, maxStop); 1131 prevSS.score=prevSS.quickScore=betterScore; 1132 }else{ 1133 ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1); 1134 if(!perfect1){ss.setPerfect(bases);} 1135 assert(!perfect1 || ss.stop-ss.start==bases.length-1); 1136 } 1137 assert(!perfect2 || prevSS.stop-prevSS.start==bases.length-1); 1138 }else{ 1139 ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1); 1140 if(!perfect1){ss.setPerfect(bases);} 1141 ss.gaps=gapArray; 1142 if(verbose && gapArray!=null){ 1143 System.err.println(ss.toText()+"\t"+Arrays.toString(gapArray)+"\n"+Arrays.toString(locArray)+"\n"); 1144 } 1145 } 1146 1147 if(ss!=null){ 1148 // System.out.println("Added site "+ss.toText()); 1149 ssl.add(ss); 1150 prevSS=ss; 1151 }else{ 1152 // System.out.println("Subsumed site "+new SiteScore(chrom, strand, site2, site3, score).toText()); 1153 } 1154 1155 // if(prevSS!=null && prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){ 1156 // int betterScore=Tools.max(score, prevSS.score); 1157 // if(prevSS.start==site2 && prevSS.stop==site3){ 1158 // prevSS.score=prevSS.quickScore=betterScore; 1159 // }else if(prevSS.start==site2 1160 // /*isWithin(prevSS.start, prevSS.stop, site2, site3) || 1161 // isWithin(site2, site3, prevSS.start, prevSS.stop)*/){ 1162 // prevSS.score=prevSS.quickScore=betterScore; 1163 // assert(prevSS.start<prevSS.stop); 1164 //// prevSS.start=Tools.min(prevSS.start, site2); 1165 // prevSS.stop=Tools.max(prevSS.stop, site3); 1166 // assert(prevSS.start<prevSS.stop); 1167 // }else{ 1168 // SiteScore ss=new SiteScore(chrom, strand, site2, site3, score); 1169 // ssl.add(ss); 1170 // prevSS=ss; 1171 // } 1172 // }else{ 1173 // SiteScore ss=new SiteScore(chrom, strand, site2, site3, score); 1174 // ssl.add(ss); 1175 // prevSS=ss; 1176 // } 1177 1178 } 1179 } 1180 1181 while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements 1182 final Quad t2=heap.poll(); 1183 final int row=t2.row+1, col=t2.column; 1184 if(row<stops[col]){ 1185 t2.row=row; 1186 1187 int a=t2.list[row]; 1188 int a2; 1189 if((a&SITE_MASK)>=offsets[col]){ 1190 a2=a-offsets[col]; 1191 1192 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 1193 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ 1194 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 1195 }else{ 1196 int ch=numberToChrom(a, baseChrom); 1197 int st=numberToSite(a); 1198 int st2=Tools.max(st-offsets[col], 0); 1199 a2=toNumber(st2, ch); 1200 1201 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 1202 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ 1203 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 1204 } 1205 1206 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 1207 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ 1208 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 1209 1210 t2.site=a2; 1211 values[col]=a2; 1212 heap.add(t2); 1213 } 1214 if(heap.isEmpty()){break;} 1215 } 1216 1217 } 1218 1219 return ssl; 1220 } 1221 1222 /** This uses a heap to track next column to increment */ 1223 private final ArrayList<SiteScore> slowWalk3(int[] starts, int[] stops, final byte[] bases, 1224 final byte[] baseScores, int[] keyScores, int[] offsets, 1225 final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList<SiteScore> ssl, 1226 int[] bestScores, final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){ 1227 assert(USE_EXTENDED_SCORE); 1228 1229 final int numKeys=offsets.length; //Before shrink 1230 1231 //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc. 1232 final int maxQuickScore=maxQuickScore(offsets, keyScores); 1233 1234 if(SHRINK_BEFORE_WALK){ 1235 int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length); 1236 if(r!=null){ 1237 starts=r[0]; 1238 stops=r[1]; 1239 offsets=r[2]; 1240 keyScores=r[4]; 1241 } 1242 } 1243 1244 final int numHits=offsets.length; //After shrink 1245 1246 1247 assert(numHits==offsets.length); 1248 assert(numHits==keyScores.length); 1249 1250 usedKeys+=numHits; 1251 usedKeyIterations++; 1252 1253 final boolean filter_by_qscore=(FILTER_BY_QSCORE && numKeys>=5); 1254 1255 assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z)); 1256 1257 1258 // final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore)); 1259 final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore)); 1260 final int minQuickScore=(int)(MIN_QSCORE_MULT*maxQuickScore); 1261 1262 final int baseChrom=baseChrom(baseChrom_); 1263 1264 heap.clear(); 1265 1266 final Quad[] triples=tripleStorage; 1267 1268 final int[] values=valueArray; 1269 final int[] sizes=sizeArray; 1270 final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null); 1271 final Block b=index[baseChrom]; 1272 1273 if(ssl==null){ssl=new ArrayList<SiteScore>(8);} 1274 1275 int currentTopScore=bestScores[0]; 1276 int cutoff=Tools.max(minScore, (int)(currentTopScore*DYNAMIC_SCORE_THRESH)); 1277 1278 int qcutoff=Tools.max(bestScores[2], minQuickScore); 1279 int bestqscore=bestScores[3]; 1280 int maxHits=bestScores[1]; 1281 int perfectsFound=bestScores[5]; 1282 assert((currentTopScore>=maxScore) == (perfectsFound>0)) : currentTopScore+", "+maxScore+", "+perfectsFound+", "+maxHits+", "+numHits+", "+new String(bases); 1283 int approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, currentTopScore>=maxScore); 1284 if(approxHitsCutoff>numHits){return ssl;} 1285 1286 final boolean shortCircuit=(allBasesCovered && numKeys==numHits && filter_by_qscore); 1287 1288 1289 assert(USE_EXTENDED_SCORE); 1290 1291 if(currentTopScore>=maxScore){ 1292 assert(currentTopScore==maxScore); 1293 qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT)); 1294 } 1295 1296 1297 for(int i=0; i<numHits; i++){ 1298 final int[] sites=b.sites; 1299 final int start=starts[i]; 1300 sizes[i]=b.length(start, stops[i]); 1301 assert(sizes[i]>0); 1302 1303 int a=sites[start]; 1304 int a2; 1305 if((a&SITE_MASK)>=offsets[i]){ 1306 a2=a-offsets[i]; 1307 }else{ 1308 int ch=numberToChrom(a, baseChrom); 1309 int st=numberToSite(a); 1310 int st2=Tools.max(st-offsets[i], 0); 1311 a2=toNumber(st2, ch); 1312 } 1313 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); 1314 1315 Quad t=triples[i]; 1316 assert(t!=null) : "Should be using tripleStorage"; 1317 assert(i==t.column); 1318 t.row=start; 1319 t.site=a2; 1320 t.list=sites; 1321 values[i]=a2; 1322 1323 heap.add(t); 1324 } 1325 1326 // System.out.println("\nEntering SS loop:"); 1327 // System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" + 1328 // "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff); 1329 // System.out.println("maxQuickScore="+maxQuickScore+"\tminQuickScore="+minQuickScore+"\tqcutoff="+qcutoff); 1330 1331 1332 SiteScore prevSS=null; 1333 while(!heap.isEmpty()){ 1334 Quad t=heap.peek(); 1335 final int site=t.site; 1336 final int centerIndex=t.column; 1337 1338 int maxNearbySite=site; 1339 1340 1341 int approxHits=0; 1342 1343 {//Inner loop 1344 final int minsite=site-MAX_INDEL, maxsite=site+MAX_INDEL2; 1345 for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){ 1346 final int x=values[column]; 1347 assert(x==triples[column].site); 1348 if(x>=minsite && x<=maxsite){ 1349 maxNearbySite=(x>maxNearbySite ? x : maxNearbySite); 1350 approxHits++; 1351 }else{chances--;} 1352 } 1353 } 1354 hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++; 1355 1356 assert(centerIndex>=0) : centerIndex; 1357 1358 //I don't remember what this assertion was for or why, but it's causing trouble. 1359 //assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; 1360 if(approxHits>=approxHitsCutoff){ 1361 1362 int score; 1363 int qscore=(filter_by_qscore ? quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits) : qcutoff); 1364 if(ADD_SCORE_Z){ 1365 int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); 1366 qscore+=scoreZ; 1367 } 1368 1369 int mapStart=site, mapStop=maxNearbySite; 1370 1371 assert(USE_EXTENDED_SCORE); 1372 1373 boolean locArrayValid=false; 1374 if(qscore<qcutoff){ 1375 score=-1; 1376 }else{ 1377 1378 final int chrom=numberToChrom(site, baseChrom); 1379 1380 //TODO Note that disabling the shortCircuit code seems to make things run 2% faster (with identical results). 1381 //However, theoretically, shortCircuit should be far more efficient. Test both ways on cluster and on a larger run. 1382 //May have something to do with compiler loop optimizations. 1383 if(shortCircuit && qscore==maxQuickScore){ 1384 assert(approxHits==numKeys); 1385 score=maxScore; 1386 }else{ 1387 if(verbose){ 1388 System.err.println("numHits="+numHits+", approxHits="+approxHits+", keys="+numKeys+", centerIndex="+centerIndex+ 1389 ", qscore="+qscore+", qcutoff="+qcutoff+", filter_by_qscore="+filter_by_qscore); 1390 System.err.println("Extending "+Arrays.toString(values)); 1391 } 1392 score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits); 1393 locArrayValid=true; 1394 1395 if(verbose){ 1396 System.err.println("score: "+score); 1397 System.err.println("locArray: "+Arrays.toString(locArray)); 1398 } 1399 1400 //Correct begin and end positions if they changed. 1401 int min=Integer.MAX_VALUE; 1402 int max=Integer.MIN_VALUE; 1403 for(int i=0; i<locArray.length; i++){ 1404 int x=locArray[i]; 1405 if(x>-1){ 1406 if(x<min){min=x;} 1407 if(x>max){max=x;} 1408 } 1409 } 1410 1411 if(score>=maxScore){ 1412 assert(min==max && min>-1) : "\n"+score+", "+maxScore+", "+min+", "+max+ 1413 ", "+(max-min)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n"; 1414 } 1415 1416 // assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger? 1417 if(min<0 || max<0){ 1418 if(!Shared.anomaly){ 1419 Shared.anomaly=true; 1420 System.err.println("\nAnomaly in "+getClass().getName()+".slowWalk:\n"+ 1421 "chrom="+chrom+", mapStart="+mapStart+", mapStop="+mapStop+", centerIndex="+centerIndex+", strand="+strand+"\n"+ 1422 "score="+score+", maxScore="+maxScore+", qscore="+qscore+", filter_by_qscore="+filter_by_qscore+"\n"+ 1423 "numHits="+approxHits+", approxHits="+approxHits+"\n"+ 1424 "min="+min+", max="+max+", (max-min)="+(max-min)+"\n"+ 1425 "bases.length="+bases.length+"\n"+Arrays.toString(locArray)+"\n"+ 1426 "locArray:\t"+Arrays.toString(locArray)+"\n"+ 1427 "values:\t"+Arrays.toString(values)+"\n"+ 1428 "bases:\t"+new String(bases)); 1429 System.err.println(); 1430 assert(false); 1431 } 1432 score=-99999; 1433 } 1434 1435 //mapStart and mapStop are indices 1436 mapStart=toNumber(min, chrom); 1437 mapStop=toNumber(max, chrom); 1438 1439 if(score>=maxScore){ 1440 assert(mapStop-mapStart==0) : "\n"+score+", "+maxScore+", "+min+", "+max+ 1441 ", "+(max-min)+", "+(mapStop-mapStart)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n"; 1442 } 1443 } 1444 1445 if(score==maxScore){ 1446 qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT)); 1447 approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, true); 1448 } 1449 1450 if(score>=cutoff){ 1451 qcutoff=Tools.max(qcutoff, (int)(qscore*DYNAMIC_QSCORE_THRESH)); 1452 bestqscore=Tools.max(qscore, bestqscore); 1453 } 1454 } 1455 1456 if(score>=cutoff){ 1457 1458 if(score>currentTopScore){ 1459 // System.err.println("New top score!"); 1460 1461 if(DYNAMICALLY_TRIM_LOW_SCORES){ 1462 1463 maxHits=Tools.max(approxHits, maxHits); 1464 approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, approxHitsCutoff, currentTopScore>=maxScore); 1465 1466 cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH)); 1467 if(score>=maxScore){ 1468 assert(USE_EXTENDED_SCORE); 1469 cutoff=Tools.max(cutoff, (int)(score*0.95f)); 1470 } 1471 } 1472 1473 currentTopScore=score; 1474 1475 // System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")"); 1476 } 1477 1478 final int chrom=numberToChrom(mapStart, baseChrom); 1479 final int site2=numberToSite(mapStart); 1480 final int site3=numberToSite(mapStop)+bases.length-1; 1481 1482 assert(site2!=site3) : site2+", "+site3+", "+mapStart+", "+mapStop; 1483 1484 assert(NUM_CHROM_BITS==0 || site2<SITE_MASK-1000) : "chrom="+chrom+", strand="+strand+", site="+site+ 1485 ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length+ 1486 "\n\n"+Arrays.toString(b.getHitList(centerIndex)); assert(site2<site3) : R+chrom+R+strand+R+site+ R+maxNearbySite+R+site2+R+site3+R+bases.length; int[] gapArray=null; if(site3-site2>=MINGAP+bases.length)1487 assert(site2<site3) : "chrom="+chrom+", strand="+strand+", site="+site+ 1488 ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length; 1489 1490 1491 int[] gapArray=null; 1492 if(site3-site2>=MINGAP+bases.length){ 1493 assert(locArrayValid) : "Loc array was not filled."; 1494 // System.err.println("****\n"+Arrays.toString(locArray)+"\n"); 1495 // int[] clone=locArray.clone(); 1496 gapArray=makeGapArray(locArray, site2, MINGAP); 1497 if(gapArray!=null){ 1498 // System.err.println(Arrays.toString(locArray)+"\n"); 1499 // System.err.println(Arrays.toString(gapArray)); 1500 // 1501 //// int sub=site2-mapStart;//thus site2=mapStart+sub 1502 //// for(int i=0; i<gapArray.length; i++){ 1503 //// gapArray[i]+=sub; 1504 //// } 1505 //// System.err.println(Arrays.toString(gapArray)); 1506 // 1507 // System.err.println(mapStart+" -> "+site2); 1508 // System.err.println(mapStop+" -> "+site3); 1509 1510 assert(gapArray[0]>=site2 && gapArray[0]-site2<bases.length); 1511 assert(gapArray[gapArray.length-1]<=site3 && site3-gapArray[gapArray.length-1]<bases.length) : "\n"+ 1512 mapStart+" -> "+site2+"\n"+ 1513 mapStop+" -> "+site3+"\n\n"+ 1514 Arrays.toString(gapArray)+"\n\n"+ 1515 // Arrays.toString(clone)+"\n\n"+ 1516 Arrays.toString(locArray)+"\n"+ 1517 "numHits="+numHits+", "+ 1518 "heap.size="+heap.size()+", "+ 1519 "numHits="+numHits+", "+ 1520 "approxHits="+approxHits+"\n"; 1521 gapArray[0]=Tools.min(gapArray[0], site2); 1522 gapArray[gapArray.length-1]=Tools.max(gapArray[gapArray.length-1], site3); 1523 } 1524 if(verbose){System.err.println("@ site "+site2+", made gap array: "+Arrays.toString(gapArray));} 1525 // assert(false) : Arrays.toString(locArray); 1526 } 1527 1528 1529 //This block is optional, but tries to eliminate multiple identical alignments 1530 1531 SiteScore ss=null; 1532 final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined; 1533 final boolean inbounds=(site2>=0 && site3<Data.chromLengths[chrom]); 1534 // if(!inbounds){System.err.println("Index tossed out-of-bounds site chr"+chrom+", "+site2+"-"+site3);} 1535 1536 if(verbose){ 1537 System.err.println("Considering site chr"+chrom+", site2="+site2+", site3="+site3+", mapStart="+mapStart+", mapStop="+mapStop); 1538 if(!inbounds){System.err.println("Index tossed out-of-bounds site.");} 1539 assert(site2!=site3) : site2+", "+site3+", "+mapStart+", "+mapStop; 1540 } 1541 //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //*** 1542 1543 if(inbounds && !SEMIPERFECTMODE && !PERFECTMODE && gapArray==null && prevSS!=null && 1544 prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){ 1545 1546 if(verbose){System.err.println("Considering overlapping site chr"+chrom+", "+site2+"-"+site3);} 1547 1548 final int betterScore=Tools.max(score, prevSS.score); 1549 final int minStart=Tools.min(prevSS.start, site2); 1550 final int maxStop=Tools.max(prevSS.stop, site3); 1551 final boolean perfect2=USE_EXTENDED_SCORE && prevSS.score==maxScore && fullyDefined; 1552 assert(!USE_EXTENDED_SCORE || perfect2==prevSS.perfect); 1553 1554 final boolean shortEnough=(!LIMIT_SUBSUMPTION_LENGTH_TO_2X || (maxStop-minStart<2*bases.length)); 1555 1556 if(prevSS.start==site2 && prevSS.stop==site3){ 1557 if(verbose){System.err.println("Class 1: Same bounds as last site.");} 1558 prevSS.score=prevSS.quickScore=betterScore; 1559 prevSS.perfect=(prevSS.perfect || perfect1 || perfect2); 1560 if(prevSS.perfect){prevSS.semiperfect=true;} 1561 //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //*** 1562 }else if(SUBSUME_SAME_START_SITES && shortEnough && prevSS.start==site2 && !prevSS.semiperfect){ 1563 if(verbose){System.err.println("Class 2: Same start as last site.");} 1564 if(perfect2){ 1565 //do nothing 1566 }else if(perfect1){ 1567 prevSS.setStop(site3); 1568 if(!prevSS.perfect){perfectsFound++;}//***$ 1569 prevSS.perfect=prevSS.semiperfect=true; 1570 }else{ 1571 prevSS.setStop(maxStop); 1572 prevSS.setPerfect(bases); 1573 } 1574 //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //*** 1575 prevSS.score=prevSS.quickScore=betterScore; 1576 }else if(SUBSUME_SAME_STOP_SITES && shortEnough && prevSS.stop==site3 && !prevSS.semiperfect){ 1577 if(verbose){System.err.println("Class 3: Same stop as last site.");} 1578 if(perfect2){ 1579 //do nothing 1580 }else if(perfect1){ 1581 prevSS.setStart(site2); 1582 if(!prevSS.perfect){perfectsFound++;}//***$ 1583 prevSS.perfect=prevSS.semiperfect=true; 1584 }else{ 1585 prevSS.setStart(minStart); 1586 prevSS.setPerfect(bases); 1587 } 1588 //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //*** 1589 prevSS.score=prevSS.quickScore=betterScore; 1590 }else if(SUBSUME_OVERLAPPING_SITES && shortEnough && (maxStop-minStart<=bases.length+MAX_SUBSUMPTION_LENGTH) 1591 && !perfect1 && !perfect2 && !prevSS.semiperfect){ 1592 if(verbose){System.err.println("Class 4.");} 1593 prevSS.setLimits(minStart, maxStop); 1594 prevSS.score=prevSS.quickScore=betterScore; 1595 prevSS.setPerfect(bases); 1596 //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //*** 1597 }else{ 1598 if(verbose){System.err.println("Class 5: Making new site");} 1599 ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1); 1600 if(!perfect1){ss.setPerfect(bases);} 1601 //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //*** 1602 // assert(Read.CHECKSITE(ss, bases)); 1603 if(verbose){System.err.println("A) Index made SiteScore "+ss.toText()+", "+Arrays.toString(ss.gaps));} 1604 assert(!perfect1 || ss.stop-ss.start==bases.length-1); 1605 } 1606 assert(!perfect2 || prevSS.stop-prevSS.start==bases.length-1); 1607 }else if(inbounds){ 1608 if(verbose){System.err.println("Considering new site chr"+chrom+", "+site2+"-"+site3);} 1609 ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1); 1610 if(!perfect1){ss.setPerfect(bases);} 1611 //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //*** 1612 // assert(Read.CHECKSITE(ss, bases)); 1613 ss.gaps=gapArray; 1614 if(verbose){System.err.println("B) Index made SiteScore "+ss.toText()+", "+Arrays.toString(ss.gaps));} 1615 } 1616 //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //*** 1617 assert(ss==null || !ss.perfect || ss.semiperfect) : ss; 1618 assert(prevSS==null || !prevSS.perfect || prevSS.semiperfect) : "\n"+SiteScore.header()+"\n"+ss+"\n"+prevSS; 1619 if(ss!=null && ((SEMIPERFECTMODE && !ss.semiperfect) || (PERFECTMODE && !ss.perfect))){ss=null;} 1620 1621 1622 if(ss!=null){ 1623 // System.out.println("Added site "+ss.toText()+", qscore="+qscore); 1624 ssl.add(ss); 1625 if(ss.perfect){ 1626 1627 if(prevSS==null || !prevSS.perfect || !ss.overlaps(prevSS)){ 1628 if(prevSS==null){assert ssl.size()<2 || !ss.overlaps(ssl.get(ssl.size()-2));} 1629 perfectsFound++; 1630 1631 //Human-specific code 1632 // if(QUIT_AFTER_TWO_PERFECTS){ 1633 // if(perfectsFound>=3 || (perfectsFound>=2 && chrom<24)){break;} 1634 // } 1635 1636 if(QUIT_AFTER_TWO_PERFECTS && perfectsFound>=2){break;} 1637 } 1638 } 1639 1640 prevSS=ss; 1641 }else{ 1642 // System.out.println("Subsumed site "+new SiteScore(chrom, strand, site2, site3, score).toText()); 1643 } 1644 //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //*** 1645 } 1646 } 1647 1648 while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements 1649 final Quad t2=heap.poll(); 1650 final int row=t2.row+1, col=t2.column; 1651 if(row<stops[col]){ 1652 t2.row=row; 1653 1654 int a=t2.list[row]; 1655 int a2; 1656 if((a&SITE_MASK)>=offsets[col]){ 1657 a2=a-offsets[col]; 1658 1659 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 1660 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ 1661 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 1662 }else{ 1663 int ch=numberToChrom(a, baseChrom); 1664 int st=numberToSite(a); 1665 int st2=Tools.max(st-offsets[col], 0); 1666 a2=toNumber(st2, ch); 1667 1668 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 1669 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ 1670 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 1671 } 1672 1673 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 1674 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ 1675 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 1676 1677 t2.site=a2; 1678 values[col]=a2; 1679 heap.add(t2); 1680 }else if(heap.size()<approxHitsCutoff || PERFECTMODE){ 1681 assert(USE_EXTENDED_SCORE); 1682 bestScores[0]=Tools.max(bestScores[0], currentTopScore); 1683 bestScores[1]=Tools.max(bestScores[1], maxHits); 1684 bestScores[2]=Tools.max(bestScores[2], qcutoff); 1685 bestScores[3]=Tools.max(bestScores[3], bestqscore); 1686 1687 bestScores[4]=maxQuickScore; 1688 bestScores[5]=perfectsFound; //***$ fixed by adding this line 1689 if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;} 1690 1691 return ssl; 1692 } 1693 if(heap.isEmpty()){ 1694 assert(false) : heap.size()+", "+approxHitsCutoff; 1695 break; 1696 } 1697 } 1698 1699 } 1700 1701 assert(USE_EXTENDED_SCORE); 1702 bestScores[0]=Tools.max(bestScores[0], currentTopScore); 1703 bestScores[1]=Tools.max(bestScores[1], maxHits); 1704 bestScores[2]=Tools.max(bestScores[2], qcutoff); 1705 bestScores[3]=Tools.max(bestScores[3], bestqscore); 1706 1707 bestScores[4]=maxQuickScore; 1708 bestScores[5]=perfectsFound; 1709 if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;} 1710 1711 return ssl; 1712 } 1713 1714 1715 /** Uses dual heaps - reserve (big) and active (small). */ camelWalk3(int[] starts, int[] stops, final byte[] bases, final byte[] baseScores, int[] keyScores, int[] offsets, final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList<SiteScore> ssl, int[] bestScores, final boolean allBasesCovered, final int maxScore, final boolean fullyDefined)1716 private final ArrayList<SiteScore> camelWalk3(int[] starts, int[] stops, final byte[] bases, 1717 final byte[] baseScores, int[] keyScores, int[] offsets, 1718 final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList<SiteScore> ssl, 1719 int[] bestScores, final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){ 1720 assert(USE_EXTENDED_SCORE); 1721 1722 final int numKeys=offsets.length; //Before shrink 1723 1724 //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc. 1725 final int maxQuickScore=maxQuickScore(offsets, keyScores); 1726 1727 if(SHRINK_BEFORE_WALK){ 1728 int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length); 1729 if(r!=null){ 1730 starts=r[0]; 1731 stops=r[1]; 1732 offsets=r[2]; 1733 keyScores=r[4]; 1734 } 1735 } 1736 1737 final int numHits=offsets.length; //After shrink 1738 1739 1740 assert(numHits==offsets.length); 1741 assert(numHits==keyScores.length); 1742 1743 usedKeys+=numHits; 1744 usedKeyIterations++; 1745 1746 final boolean filter_by_qscore=(FILTER_BY_QSCORE && numKeys>=5); 1747 1748 assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z)); 1749 1750 1751 // final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore)); 1752 final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore)); 1753 final int minQuickScore=(int)(MIN_QSCORE_MULT*maxQuickScore); 1754 1755 final int baseChrom=baseChrom(baseChrom_); 1756 1757 heap.clear(); 1758 active.clear(); 1759 1760 final Quad[] triples=tripleStorage; 1761 1762 final int[] values=valueArray; 1763 final int[] sizes=sizeArray; 1764 final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null); 1765 final Block b=index[baseChrom]; 1766 1767 if(ssl==null){ssl=new ArrayList<SiteScore>(8);} 1768 1769 int currentTopScore=bestScores[0]; 1770 int cutoff=Tools.max(minScore, (int)(currentTopScore*DYNAMIC_SCORE_THRESH)); 1771 1772 int qcutoff=Tools.max(bestScores[2], minQuickScore); 1773 int bestqscore=bestScores[3]; 1774 int maxHits=bestScores[1]; 1775 int perfectsFound=bestScores[5]; 1776 assert((currentTopScore>=maxScore) == (perfectsFound>0)) : currentTopScore+", "+maxScore+", "+perfectsFound+", "+maxHits+", "+numHits; 1777 int approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, currentTopScore>=maxScore); 1778 if(approxHitsCutoff>numHits){return ssl;} 1779 1780 final boolean shortCircuit=(allBasesCovered && numKeys==numHits && filter_by_qscore); 1781 1782 if(currentTopScore>=maxScore){ 1783 assert(currentTopScore==maxScore); 1784 qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT)); 1785 } 1786 1787 1788 for(int i=0; i<numHits; i++){ 1789 final int[] sites=b.sites; 1790 final int start=starts[i]; 1791 sizes[i]=b.length(start, stops[i]); 1792 assert(sizes[i]>0); 1793 1794 int a=sites[start]; 1795 int a2; 1796 if((a&SITE_MASK)>=offsets[i]){ 1797 a2=a-offsets[i]; 1798 }else{ 1799 int ch=numberToChrom(a, baseChrom); 1800 int st=numberToSite(a); 1801 int st2=Tools.max(st-offsets[i], 0); 1802 a2=toNumber(st2, ch); 1803 } 1804 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); 1805 1806 Quad t=triples[i]; 1807 assert(t!=null) : "Should be using tripleStorage"; 1808 assert(i==t.column); 1809 t.row=start; 1810 t.site=a2; 1811 t.list=sites; 1812 values[i]=a2; 1813 1814 heap.add(t); 1815 } 1816 1817 assert(numHits>0); 1818 assert(heap.size()==numHits); 1819 1820 /* Tracks largest element allowed in 'active' */ 1821 1822 // System.err.println("\nEntering SS loop:"); 1823 // System.err.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" + 1824 // "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff); 1825 // System.err.println("maxQuickScore="+maxQuickScore+"\tminQuickScore="+minQuickScore+"\tqcutoff="+qcutoff); 1826 1827 // int iter=0; 1828 SiteScore prevSS=null; 1829 int maxNearbySite=0; 1830 int site=0; 1831 int horizon=0; 1832 assert(active.isEmpty()); 1833 while(!heap.isEmpty() || !active.isEmpty()){ 1834 // iter++; 1835 1836 do{ 1837 while(!active.isEmpty() && active.peek().site==site){ //Remove all identical elements, and add subsequent elements 1838 final Quad t2=active.poll(); 1839 final int row=t2.row+1, col=t2.column; 1840 1841 //This is called the "increment" operation. Very messy and slow due to rare cases at beginning of a chrom. 1842 if(row<stops[col]){//then increment and return to the heap 1843 t2.row=row; 1844 1845 int a=t2.list[row]; 1846 int a2; 1847 if((a&SITE_MASK)>=offsets[col]){ 1848 a2=a-offsets[col]; 1849 1850 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 1851 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ 1852 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 1853 }else{ 1854 int ch=numberToChrom(a, baseChrom); 1855 int st=numberToSite(a); 1856 int st2=Tools.max(st-offsets[col], 0); 1857 a2=toNumber(st2, ch); 1858 1859 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 1860 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ 1861 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 1862 } 1863 1864 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 1865 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ 1866 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 1867 1868 t2.site=a2; 1869 values[col]=a2; 1870 if(a2<=horizon){ 1871 active.add(t2); 1872 maxNearbySite=Tools.max(t2.site, maxNearbySite); 1873 }else{heap.add(t2);} 1874 }else if((heap.size()+active.size())<approxHitsCutoff || PERFECTMODE){ //Then there are not enough keys remaining for a site 1875 assert(USE_EXTENDED_SCORE); 1876 bestScores[0]=Tools.max(bestScores[0], currentTopScore); 1877 bestScores[1]=Tools.max(bestScores[1], maxHits); 1878 bestScores[2]=Tools.max(bestScores[2], qcutoff); 1879 bestScores[3]=Tools.max(bestScores[3], bestqscore); 1880 1881 bestScores[4]=maxQuickScore; 1882 bestScores[5]=perfectsFound; //***$ fixed by adding this line 1883 if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;} 1884 1885 return ssl; 1886 } 1887 // if(heap.isEmpty() && active.isEmpty()){ 1888 // assert(false) : heap.size()+", "+active.size()+", "+approxHitsCutoff; 1889 // break; 1890 // } 1891 } 1892 1893 final Quad t; 1894 if(active.isEmpty()){ 1895 t=heap.poll(); 1896 active.add(t); 1897 maxNearbySite=t.site; 1898 }else{ 1899 t=active.peek(); 1900 } 1901 site=t.site; 1902 horizon=(int)Tools.min(Integer.MAX_VALUE, site+(long)MAX_INDEL2); 1903 while(!heap.isEmpty() && heap.peek().site<=horizon){ 1904 Quad t2=heap.poll(); 1905 active.add(t2); 1906 maxNearbySite=t2.site; 1907 } 1908 1909 if(verbose){System.err.println("\nFinished loop iteration. active="+active+"\nheap="+heap+"\nfloor="+site+", horizon="+horizon+", maxNearbySite="+maxNearbySite);} 1910 1911 }while(active.size()<approxHitsCutoff); 1912 1913 final int approxHits=active.size(); 1914 final Quad t=active.peek(); 1915 final int centerIndex=t.column; 1916 1917 if(verbose){ 1918 System.err.println("\nLeft loop. active="+active+"\nheap="+heap+"\n" + 1919 "floor="+site+", maxNearbySite="+maxNearbySite+", approxHits="+approxHits+", approxHitsCutoff="+approxHitsCutoff+"\n"); 1920 } 1921 1922 1923 1924 // approxHits=0; 1925 // {//Inner loop 1926 // final int minsite=site, maxsite=site+MAX_INDEL2; 1927 // for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){ 1928 // final int x=values[column]; 1929 // assert(x==triples[column].site); 1930 // if(x>=minsite && x<=maxsite){ 1931 // maxNearbySite=(x>maxNearbySite ? x : maxNearbySite); 1932 // approxHits++; 1933 // }else{chances--;} 1934 //// if(verbose){ 1935 //// System.err.println("column="+column+", numHits="+numHits+", approxHits="+approxHits+ 1936 //// ", approxHitsCutoff="+approxHitsCutoff+", chances="+chances); 1937 //// } 1938 // } 1939 // //Invalid assertion due to loop early exit 1940 //// assert(approxHits>0) : "\niter="+iter+", maxHits="+maxHits+", numHits="+numHits+", approxHitsCutoff="+approxHitsCutoff+ 1941 //// "\nheap.size()="+heap.size()+", minsite="+minsite+", maxsite="+maxsite+", values[center]="+values[centerIndex]+", t="+t; 1942 // } 1943 // assert(approxHits<=active.size()) : "approxHits="+approxHits+", active.size()="+active.size()+", maxNearbySite="+maxNearbySite+"\nvalues="+Arrays.toString(values); 1944 1945 hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++; 1946 1947 assert(centerIndex>=0) : centerIndex; 1948 1949 //I don't remember what this assertion was for or why, but it's causing trouble. 1950 //assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; 1951 if(approxHits>=approxHitsCutoff){ 1952 1953 // if(verbose){System.err.println("A");} 1954 1955 int score; 1956 int qscore=(filter_by_qscore ? quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits) : qcutoff); 1957 if(ADD_SCORE_Z){ 1958 int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); 1959 qscore+=scoreZ; 1960 } 1961 1962 int mapStart=site, mapStop=maxNearbySite; 1963 assert(mapStart<=mapStop); 1964 1965 assert(USE_EXTENDED_SCORE); 1966 1967 boolean locArrayValid=false; 1968 if(qscore<qcutoff){ 1969 score=-1; 1970 // if(verbose){System.err.println("B");} 1971 }else{ 1972 // if(verbose){System.err.println("C");} 1973 final int chrom=numberToChrom(site, baseChrom); 1974 1975 //TODO Note that disabling the shortCircuit code seems to make things run 2% faster (with identical results). 1976 //However, theoretically, shortCircuit should be far more efficient. Test both ways on cluster and on a larger run. 1977 //May have something to do with compiler loop optimizations. 1978 if(shortCircuit && qscore==maxQuickScore){ 1979 // if(verbose){System.err.println("D");} 1980 assert(approxHits==numKeys); 1981 score=maxScore; 1982 }else{ 1983 // if(verbose){System.err.println("E");} 1984 if(verbose){ 1985 System.err.println("numHits="+numHits+", approxHits="+approxHits+", qscore="+qscore+", qcutoff="+qcutoff+", filter_by_qscore="+filter_by_qscore); 1986 System.err.println("Extending "+Arrays.toString(values)); 1987 } 1988 score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits); 1989 locArrayValid=true; 1990 1991 if(verbose){ 1992 System.err.println("score: "+score); 1993 System.err.println("locArray: "+Arrays.toString(locArray)); 1994 } 1995 1996 //Correct begin and end positions if they changed. 1997 int min=Integer.MAX_VALUE; 1998 int max=Integer.MIN_VALUE; 1999 for(int i=0; i<locArray.length; i++){ 2000 int x=locArray[i]; 2001 if(x>-1){ 2002 if(x<min){min=x;} 2003 if(x>max){max=x;} 2004 } 2005 } 2006 2007 if(score>=maxScore){ 2008 assert(min==max && min>-1) : "\n"+score+", "+maxScore+", "+min+", "+max+ 2009 ", "+(max-min)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n"; 2010 } 2011 2012 // assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger? 2013 if(min<0 || max<0){ 2014 if(!Shared.anomaly){ 2015 Shared.anomaly=true; 2016 System.err.println("\nAnomaly in "+getClass().getName()+".slowWalk:\n"+ 2017 "chrom="+chrom+", mapStart="+mapStart+", mapStop="+mapStop+", centerIndex="+centerIndex+", strand="+strand+"\n"+ 2018 "score="+score+", maxScore="+maxScore+", qscore="+qscore+", filter_by_qscore="+filter_by_qscore+"\n"+ 2019 "numHits="+approxHits+", approxHits="+approxHits+"\n"+ 2020 "min="+min+", max="+max+", (max-min)="+(max-min)+"\n"+ 2021 "bases.length="+bases.length+"\n"+Arrays.toString(locArray)+"\n"+ 2022 "locArray:\t"+Arrays.toString(locArray)+"\n"+ 2023 "values:\t"+Arrays.toString(values)+"\n"+ 2024 "bases:\t"+new String(bases)); 2025 System.err.println(); 2026 assert(false); 2027 } 2028 score=-99999; 2029 } 2030 2031 //mapStart and mapStop are indices 2032 mapStart=toNumber(min, chrom); 2033 mapStop=toNumber(max, chrom); 2034 assert(mapStart<=mapStop); 2035 2036 if(score>=maxScore){ 2037 assert(mapStop-mapStart==0) : "\n"+score+", "+maxScore+", "+min+", "+max+ 2038 ", "+(max-min)+", "+(mapStop-mapStart)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n"; 2039 } 2040 } 2041 // if(verbose){System.err.println("F");} 2042 2043 if(score==maxScore){ 2044 qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT)); 2045 approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, true); 2046 } 2047 2048 if(score>=cutoff){ 2049 qcutoff=Tools.max(qcutoff, (int)(qscore*DYNAMIC_QSCORE_THRESH)); 2050 bestqscore=Tools.max(qscore, bestqscore); 2051 } 2052 } 2053 // if(verbose){System.err.println("G");} 2054 2055 if(score>=cutoff){ 2056 // if(verbose){System.err.println("H");} 2057 2058 if(score>currentTopScore){ 2059 // if(verbose){System.err.println("I");} 2060 // System.err.println("New top score!"); 2061 2062 if(DYNAMICALLY_TRIM_LOW_SCORES){ 2063 2064 maxHits=Tools.max(approxHits, maxHits); 2065 approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, approxHitsCutoff, currentTopScore>=maxScore); 2066 2067 cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH)); 2068 if(score>=maxScore){ 2069 assert(USE_EXTENDED_SCORE); 2070 cutoff=Tools.max(cutoff, (int)(score*0.95f)); 2071 } 2072 } 2073 2074 currentTopScore=score; 2075 2076 // System.err.println("New top score: "+currentTopScore+" \t("+cutoff+")"); 2077 } 2078 // if(verbose){System.err.println("J");} 2079 2080 final int chrom=numberToChrom(mapStart, baseChrom); 2081 final int site2=numberToSite(mapStart); 2082 final int site3=numberToSite(mapStop)+bases.length-1; 2083 2084 assert(NUM_CHROM_BITS==0 || site2<SITE_MASK-1000) : "chrom="+chrom+", strand="+strand+", site="+site+ 2085 ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length+ 2086 "\n\n"+Arrays.toString(b.getHitList(centerIndex)); 2087 assert(site2<site3) : "\nchrom="+chrom+", strand="+strand+", site="+site+", maxNearbySite="+maxNearbySite+"\n"+ 2088 "mapStart="+mapStart+", mapStop="+mapStop+", site2="+site2+", site3="+site3+", read.length="+bases.length+"\n"+ 2089 "numberToSite("+mapStart+")="+numberToSite(mapStart)+", numberToSite("+mapStop+")="+numberToSite(mapStop)+"\n"+ 2090 "\n"+new String(bases)+"\n"; 2091 2092 if(verbose){ 2093 System.err.println("chrom="+chrom+", strand="+strand+", site="+site+", maxNearbySite="+maxNearbySite+"\n"+ 2094 "mapStart="+mapStart+", mapStop="+mapStop+", site2="+site2+", site3="+site3+", read.length="+bases.length); 2095 } 2096 2097 int[] gapArray=null; 2098 if(site3-site2>=MINGAP+bases.length){ 2099 assert(locArrayValid) : "Loc array was not filled."; 2100 // System.err.println("****\n"+Arrays.toString(locArray)+"\n"); 2101 // int[] clone=locArray.clone(); 2102 gapArray=makeGapArray(locArray, site2, MINGAP); 2103 if(gapArray!=null){ 2104 // System.err.println(Arrays.toString(locArray)+"\n"); 2105 // System.err.println(Arrays.toString(gapArray)); 2106 // 2107 //// int sub=site2-mapStart;//thus site2=mapStart+sub 2108 //// for(int i=0; i<gapArray.length; i++){ 2109 //// gapArray[i]+=sub; 2110 //// } 2111 //// System.err.println(Arrays.toString(gapArray)); 2112 // 2113 // System.err.println(mapStart+" -> "+site2); 2114 // System.err.println(mapStop+" -> "+site3); 2115 2116 assert(gapArray[0]>=site2 && gapArray[0]-site2<bases.length); 2117 assert(gapArray[gapArray.length-1]<=site3 && site3-gapArray[gapArray.length-1]<bases.length) : "\n"+ 2118 mapStart+" -> "+site2+"\n"+ 2119 mapStop+" -> "+site3+"\n\n"+ 2120 Arrays.toString(gapArray)+"\n\n"+ 2121 // Arrays.toString(clone)+"\n\n"+ 2122 Arrays.toString(locArray)+"\n"+ 2123 "numHits="+numHits+", "+ 2124 "heap.size="+heap.size()+", "+ 2125 "numHits="+numHits+", "+ 2126 "approxHits="+approxHits+"\n"; 2127 gapArray[0]=Tools.min(gapArray[0], site2); 2128 gapArray[gapArray.length-1]=Tools.max(gapArray[gapArray.length-1], site3); 2129 } 2130 if(verbose){System.err.println("@ site "+site2+", made gap array: "+Arrays.toString(gapArray));} 2131 // assert(false) : Arrays.toString(locArray); 2132 } 2133 2134 2135 //This block is optional, but tries to eliminate multiple identical alignments 2136 2137 SiteScore ss=null; 2138 final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined; 2139 final boolean inbounds=(site2>=0 && site3<Data.chromLengths[chrom]); 2140 // if(!inbounds){System.err.println("Index tossed out-of-bounds site chr"+chrom+", "+site2+"-"+site3);} 2141 2142 if(inbounds && !SEMIPERFECTMODE && !PERFECTMODE && gapArray==null && prevSS!=null && 2143 prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){ 2144 2145 final int betterScore=Tools.max(score, prevSS.score); 2146 final int minStart=Tools.min(prevSS.start, site2); 2147 final int maxStop=Tools.max(prevSS.stop, site3); 2148 final boolean perfect2=USE_EXTENDED_SCORE && prevSS.score==maxScore && fullyDefined; 2149 assert(!USE_EXTENDED_SCORE || perfect2==prevSS.perfect); 2150 2151 final boolean shortEnough=(!LIMIT_SUBSUMPTION_LENGTH_TO_2X || (maxStop-minStart<2*bases.length)); 2152 2153 if(prevSS.start==site2 && prevSS.stop==site3){ 2154 prevSS.score=prevSS.quickScore=betterScore; 2155 prevSS.perfect=(prevSS.perfect || perfect1 || perfect2); 2156 if(prevSS.perfect){prevSS.semiperfect=true;} 2157 }else if(SUBSUME_SAME_START_SITES && shortEnough && prevSS.start==site2 && !prevSS.semiperfect){ 2158 if(perfect2){ 2159 //do nothing 2160 }else if(perfect1){ 2161 prevSS.setStop(site3); 2162 if(!prevSS.perfect){perfectsFound++;}//***$ 2163 prevSS.perfect=prevSS.semiperfect=true; 2164 }else{ 2165 prevSS.setStop(maxStop); 2166 prevSS.setPerfect(bases); 2167 } 2168 prevSS.score=prevSS.quickScore=betterScore; 2169 }else if(SUBSUME_SAME_STOP_SITES && shortEnough && prevSS.stop==site3 && !prevSS.semiperfect){ 2170 if(perfect2){ 2171 //do nothing 2172 }else if(perfect1){ 2173 prevSS.setStart(site2); 2174 if(!prevSS.perfect){perfectsFound++;}//***$ 2175 prevSS.perfect=prevSS.semiperfect=true; 2176 }else{ 2177 prevSS.setStart(minStart); 2178 prevSS.setPerfect(bases); 2179 } 2180 prevSS.score=prevSS.quickScore=betterScore; 2181 }else if(SUBSUME_OVERLAPPING_SITES && shortEnough && (maxStop-minStart<=bases.length+MAX_SUBSUMPTION_LENGTH) 2182 && !perfect1 && !perfect2 && !prevSS.semiperfect){ 2183 prevSS.setLimits(minStart, maxStop); 2184 prevSS.score=prevSS.quickScore=betterScore; 2185 prevSS.setPerfect(bases); 2186 }else{ 2187 ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1); 2188 if(!perfect1){ss.setPerfect(bases);} 2189 // assert(Read.CHECKSITE(ss, bases)); 2190 if(verbose){System.err.println("A) Index made SiteScore "+ss.toText()+", "+Arrays.toString(ss.gaps));} 2191 assert(!perfect1 || ss.stop-ss.start==bases.length-1); 2192 } 2193 assert(!perfect2 || prevSS.stop-prevSS.start==bases.length-1); 2194 }else if(inbounds){ 2195 ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1); 2196 if(!perfect1){ss.setPerfect(bases);} 2197 // assert(Read.CHECKSITE(ss, bases)); 2198 ss.gaps=gapArray; 2199 if(verbose){System.err.println("B) Index made SiteScore "+ss.toText()+", "+Arrays.toString(ss.gaps));} 2200 } 2201 2202 assert(ss==null || !ss.perfect || ss.semiperfect) : ss; 2203 assert(prevSS==null || !prevSS.perfect || prevSS.semiperfect) : "\n"+SiteScore.header()+"\n"+ss+"\n"+prevSS; 2204 if(ss!=null && ((SEMIPERFECTMODE && !ss.semiperfect) || (PERFECTMODE && !ss.perfect))){ss=null;} 2205 2206 2207 if(ss!=null){ 2208 // System.err.println("Added site "+ss.toText()+", qscore="+qscore); 2209 ssl.add(ss); 2210 if(ss.perfect){ 2211 2212 if(prevSS==null || !prevSS.perfect || !ss.overlaps(prevSS)){ 2213 if(prevSS==null){assert ssl.size()<2 || !ss.overlaps(ssl.get(ssl.size()-2));} 2214 perfectsFound++; 2215 2216 //Human-specific code 2217 // if(QUIT_AFTER_TWO_PERFECTS){ 2218 // if(perfectsFound>=3 || (perfectsFound>=2 && chrom<24)){break;} 2219 // } 2220 2221 if(QUIT_AFTER_TWO_PERFECTS && perfectsFound>=2){break;} 2222 } 2223 } 2224 2225 prevSS=ss; 2226 }else{ 2227 // System.err.println("Subsumed site "+new SiteScore(chrom, strand, site2, site3, score).toText()); 2228 } 2229 } 2230 } 2231 2232 // while(!active.isEmpty() && active.peek().site==site){ //Remove all identical elements, and add subsequent elements 2233 // final Quad t2=active.poll(); 2234 // final int row=t2.row+1, col=t2.column; 2235 // 2236 // //This is called the "increment" operation. Very messy and slow due to rare cases at beginning of a chrom. 2237 // if(row<stops[col]){//then increment and return to the heap 2238 // t2.row=row; 2239 // 2240 // int a=t2.list[row]; 2241 // int a2; 2242 // if((a&SITE_MASK)>=offsets[col]){ 2243 // a2=a-offsets[col]; 2244 // 2245 // assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 2246 // "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ 2247 // ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 2248 // }else{ 2249 // int ch=numberToChrom(a, baseChrom); 2250 // int st=numberToSite(a); 2251 // int st2=Tools.max(st-offsets[col], 0); 2252 // a2=toNumber(st2, ch); 2253 // 2254 // assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 2255 // "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ 2256 // ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 2257 // } 2258 // 2259 // assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 2260 // "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ 2261 // ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 2262 // 2263 // t2.site=a2; 2264 // values[col]=a2; 2265 // if() 2266 // heap.add(t2); 2267 // }else if((heap.size()+active.size())<approxHitsCutoff || PERFECTMODE){ //Then there are not enough keys remaining for a site 2268 // assert(USE_EXTENDED_SCORE); 2269 // bestScores[0]=Tools.max(bestScores[0], currentTopScore); 2270 // bestScores[1]=Tools.max(bestScores[1], maxHits); 2271 // bestScores[2]=Tools.max(bestScores[2], qcutoff); 2272 // bestScores[3]=Tools.max(bestScores[3], bestqscore); 2273 // 2274 // bestScores[4]=maxQuickScore; 2275 // bestScores[5]=perfectsFound; //***$ fixed by adding this line 2276 // if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;} 2277 // 2278 // return ssl; 2279 // } 2280 // if(heap.isEmpty() && active.isEmpty()){ 2281 // assert(false) : heap.size()+", "+active.size()+", "+approxHitsCutoff; 2282 // break; 2283 // } 2284 // } 2285 2286 } 2287 2288 assert(USE_EXTENDED_SCORE); 2289 bestScores[0]=Tools.max(bestScores[0], currentTopScore); 2290 bestScores[1]=Tools.max(bestScores[1], maxHits); 2291 bestScores[2]=Tools.max(bestScores[2], qcutoff); 2292 bestScores[3]=Tools.max(bestScores[3], bestqscore); 2293 2294 bestScores[4]=maxQuickScore; 2295 bestScores[5]=perfectsFound; 2296 if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;} 2297 2298 return ssl; 2299 } 2300 2301 findMaxQscore2(final int[] starts, final int[] stops, final int[] offsets, final int[] keyScores, final int baseChrom_, final Quad[] triples, final int[] values, final int prevMaxHits, boolean earlyExit, boolean perfectOnly)2302 private final int[] findMaxQscore2(final int[] starts, final int[] stops, final int[] offsets, final int[] keyScores, 2303 final int baseChrom_, final Quad[] triples, final int[] values, final int prevMaxHits, 2304 boolean earlyExit, boolean perfectOnly){ 2305 2306 final int numHits=offsets.length; 2307 assert(numHits>=prevMaxHits); 2308 2309 final int baseChrom=baseChrom(baseChrom_); 2310 final Block b=index[baseChrom]; 2311 final int[] sizes=sizeArray; 2312 2313 heap.clear(); 2314 for(int i=0; i<numHits; i++){ 2315 final int[] sites=b.sites; 2316 final int start=starts[i]; 2317 sizes[i]=b.length(start, stops[i]); 2318 assert(sizes[i]>0); 2319 2320 int a=sites[start]; 2321 int a2; 2322 if((a&SITE_MASK)>=offsets[i]){ 2323 a2=a-offsets[i]; 2324 }else{ 2325 int ch=numberToChrom(a, baseChrom); 2326 int st=numberToSite(a); 2327 int st2=Tools.max(st-offsets[i], 0); 2328 a2=toNumber(st2, ch); 2329 } 2330 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); 2331 2332 Quad t=triples[i]; 2333 assert(t!=null) : "Should be using tripleStorage"; 2334 assert(i==t.column); 2335 t.row=start; 2336 t.site=a2; 2337 t.list=sites; 2338 values[i]=a2; 2339 2340 heap.add(t); 2341 } 2342 2343 final int maxQuickScore=maxQuickScore(offsets, keyScores); 2344 2345 int topQscore=-999999999; 2346 2347 int maxHits=0; 2348 // int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP; 2349 2350 2351 int approxHitsCutoff; 2352 final int indelCutoff; 2353 if(perfectOnly){ 2354 approxHitsCutoff=numHits; 2355 indelCutoff=0; 2356 }else{ 2357 approxHitsCutoff=Tools.max(prevMaxHits, Tools.min(MIN_APPROX_HITS_TO_KEEP, numHits-1)); //Faster, same accuracy 2358 indelCutoff=MAX_INDEL2; 2359 } 2360 2361 2362 while(!heap.isEmpty()){ 2363 Quad t=heap.peek(); 2364 final int site=t.site; 2365 final int centerIndex=t.column; 2366 2367 int maxNearbySite=site; 2368 2369 2370 int approxHits=0; 2371 {//Inner loop 2372 final int minsite=site-Tools.min(MAX_INDEL, indelCutoff), maxsite=site+MAX_INDEL2; 2373 for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){ 2374 final int x=values[column]; 2375 assert(x==triples[column].site); 2376 if(x>=minsite && x<=maxsite){ 2377 maxNearbySite=(x>maxNearbySite ? x : maxNearbySite); 2378 approxHits++; 2379 }else{chances--;} 2380 } 2381 } 2382 hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++; 2383 2384 assert(centerIndex>=0) : centerIndex; 2385 2386 //I don't remember what this assertion was for or why, but it's causing trouble. 2387 //assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; 2388 if(approxHits>=approxHitsCutoff){ 2389 2390 int qscore=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits); 2391 2392 if(ADD_SCORE_Z){ 2393 int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); 2394 qscore+=scoreZ; 2395 } 2396 2397 if(qscore>topQscore){ 2398 2399 // maxHits=Tools.max(approxHits, maxHits); 2400 // approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits); //Best setting for pre-scan 2401 2402 maxHits=Tools.max(approxHits, maxHits); 2403 approxHitsCutoff=Tools.max(approxHitsCutoff, approxHits-1); //Best setting for pre-scan 2404 2405 topQscore=qscore; 2406 2407 if(qscore>=maxQuickScore){ 2408 assert(qscore==maxQuickScore); 2409 assert(approxHits==numHits); 2410 if(earlyExit){ 2411 return new int[] {topQscore, maxHits}; 2412 } 2413 } 2414 } 2415 } 2416 2417 while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements 2418 final Quad t2=heap.poll(); 2419 final int row=t2.row+1, col=t2.column; 2420 if(row<stops[col]){ 2421 t2.row=row; 2422 2423 int a=t2.list[row]; 2424 int a2; 2425 if((a&SITE_MASK)>=offsets[col]){ 2426 a2=a-offsets[col]; 2427 2428 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 2429 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+ 2430 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 2431 }else{ 2432 int ch=numberToChrom(a, baseChrom); 2433 int st=numberToSite(a); 2434 int st2=Tools.max(st-offsets[col], 0); 2435 a2=toNumber(st2, ch); 2436 2437 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 2438 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+ 2439 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 2440 } 2441 2442 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : 2443 "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+ 2444 ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; 2445 2446 t2.site=a2; 2447 values[col]=a2; 2448 heap.add(t2); 2449 }else if(earlyExit && (perfectOnly || heap.size()<approxHitsCutoff)){ 2450 return new int[] {topQscore, maxHits}; 2451 } 2452 if(heap.isEmpty()){break;} 2453 } 2454 2455 } 2456 2457 2458 2459 return new int[] {topQscore, maxHits}; 2460 } 2461 2462 absdif(int a, int b)2463 private static final int absdif(int a, int b){ 2464 return a>b ? a-b : b-a; 2465 } 2466 2467 2468 @Override maxScore(int[] offsets, byte[] baseScores, int[] keyScores, int readlen, boolean useQuality)2469 final int maxScore(int[] offsets, byte[] baseScores, int[] keyScores, int readlen, boolean useQuality){ 2470 2471 if(useQuality){ 2472 //These lines apparently MUST be used if quality is used later on for slow align. 2473 if(USE_AFFINE_SCORE){return msa.maxQuality(baseScores);} 2474 if(USE_EXTENDED_SCORE){return readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5)+Tools.sumInt(baseScores);} 2475 }else{ 2476 if(USE_AFFINE_SCORE){return msa.maxQuality(readlen);} 2477 if(USE_EXTENDED_SCORE){return readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5);} 2478 } 2479 2480 return maxQuickScore(offsets, keyScores); 2481 } 2482 2483 maxQuickScore(int[] offsets, int[] keyScores)2484 public final int maxQuickScore(int[] offsets, int[] keyScores){ 2485 2486 // int x=offsets.length*BASE_KEY_HIT_SCORE; 2487 int x=Tools.intSum(keyScores); 2488 int y=Y_SCORE_MULT*(offsets[offsets.length-1]-offsets[0]); 2489 // if(ADD_LIST_SIZE_BONUS){x+=(LIST_SIZE_BONUS[1]*offsets.length);} 2490 // assert(!ADD_SCORE_Z) : "Need to make sure this is correct..."; 2491 2492 // if(ADD_SCORE_Z){x+=((offsets[offsets.length-1]+CHUNKSIZE)*Z_SCORE_MULT);} 2493 if(ADD_SCORE_Z){x+=maxScoreZ(offsets);} 2494 2495 return x+y; 2496 // int bonus=(2*(HIT_SCORE/2)); //For matching both ends 2497 // return x+y+bonus; 2498 } 2499 2500 quickScore(final int[] locs, final int[] keyScores, final int centerIndex, final int offsets[], int[] sizes, final boolean penalizeIndels, final int numApproxHits, final int numHits)2501 private final int quickScore(final int[] locs, final int[] keyScores, final int centerIndex, final int offsets[], 2502 int[] sizes, final boolean penalizeIndels, final int numApproxHits, final int numHits){ 2503 2504 hist_hits_score[Tools.min(HIT_HIST_LEN, numApproxHits)]++; 2505 if(numApproxHits==1){return keyScores[centerIndex];} 2506 2507 //Done! 2508 //Correct way to calculate score: 2509 //Find the first chunk that exactly hits the center. 2510 //Then, align leftward of it, and align rightward of it, and sum the scores. 2511 2512 //"-centerIndex" is a disambiguating term that, given otherwise identical match patterns 2513 //(for example, a small indel will generate two valid site candidates), choose the lower site. 2514 2515 int x=keyScores[centerIndex]+scoreLeft(locs, keyScores, centerIndex, sizes, penalizeIndels)+ 2516 scoreRight(locs, keyScores, centerIndex, sizes, penalizeIndels, numHits)-centerIndex; 2517 2518 int y=Y_SCORE_MULT*scoreY(locs, centerIndex, offsets); 2519 if(ADD_LIST_SIZE_BONUS){x+=calcListSizeBonus(sizes[centerIndex]);} 2520 // int z=scoreZ(locs, hits); 2521 return x+y; 2522 } 2523 2524 2525 // /** Generates a term that increases score with how many bases in the read match the ref. */ 2526 // private static final int scoreZ(int[] locs, int centerIndex, int offsets[]){ 2527 // final int center=locs[centerIndex]; 2528 // 2529 // final int[] refLoc=new int[offsets[offsets.length-1]+CHUNKSIZE]; 2530 // 2531 // final int maxLoc=center+MAX_INDEL2; 2532 // final int minLoc=Tools.max(0, center-MAX_INDEL); 2533 // 2534 // int score=0; 2535 // 2536 // for(int i=0; i<locs.length; i++){ 2537 // int loc=locs[i]; 2538 //// int dif=absdif(loc, center); 2539 // if(loc>=minLoc && loc<=maxLoc){ 2540 //// assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+ 2541 //// "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets); 2542 // 2543 // int offset=offsets[i]; 2544 // int max=CHUNKSIZE+offset; 2545 // 2546 // for(int j=offset; j<max; j++){ 2547 // int old=refLoc[j]; 2548 // if(old==0){ 2549 // refLoc[j]=loc; 2550 // score+=4; 2551 // }else if(old>loc){ 2552 // refLoc[j]=loc; 2553 // score-=2; 2554 // }else if(old==loc){ 2555 // score-=1; 2556 // //do nothing, perhaps, or add 1? 2557 // }else{ 2558 // score-=2; 2559 // assert(old<loc); 2560 // } 2561 // } 2562 // } 2563 // } 2564 // return score; 2565 // } 2566 2567 2568 extendScore(final byte[] bases, final byte[] baseScores, final int[] offsets, final int[] values, final int chrom, final int centerIndex, final int[] locArray, final int numHits, final int numApproxHits)2569 private final int extendScore(final byte[] bases, final byte[] baseScores, final int[] offsets, final int[] values, 2570 final int chrom, final int centerIndex, final int[] locArray, final int numHits, final int numApproxHits){ 2571 callsToExtendScore++; 2572 hist_hits_extend[Tools.min(HIT_HIST_LEN, numApproxHits)]++; 2573 2574 final int centerVal=values[centerIndex]; 2575 final int centerLoc=numberToSite(centerVal); 2576 2577 final int minLoc=Tools.max(0, centerLoc-MAX_INDEL); //Legacy, for assertions 2578 final int maxLoc=centerLoc+MAX_INDEL2; //Legacy, for assertions 2579 2580 final int minVal=centerVal-MAX_INDEL; 2581 final int maxVal=centerVal+MAX_INDEL2; 2582 2583 final byte[] ref=Data.getChromosome(chrom).array; 2584 2585 if(verbose){ 2586 System.err.println("\n"); 2587 System.err.println("minLoc="+minLoc+", maxLoc="+ maxLoc+", centerIndex="+centerIndex+", centerVal="+centerVal+", centerLoc="+centerLoc); 2588 System.err.println("minVal="+minVal+", maxVal="+ maxVal+", numHits="+numHits+", numApproxHits="+numApproxHits); 2589 System.err.println("offsets:\t"+Arrays.toString(offsets)); 2590 System.err.println("values:\t"+Arrays.toString(values)); 2591 System.err.println(); 2592 int centerOffset=offsets[centerIndex]; 2593 2594 for(int i=0; i<centerOffset; i++){System.err.print(" ");} 2595 for(int i=centerOffset; i<centerOffset+KEYLEN; i++){System.err.print((char)bases[i]);} 2596 System.err.println(); 2597 2598 System.err.println(new String(bases)); 2599 System.err.println(new String(KillSwitch.copyOfRange(ref, centerLoc, centerLoc+bases.length))); 2600 System.err.println(); 2601 } 2602 2603 // int[] locArray=new int[bases.length]; 2604 Arrays.fill(locArray, -1); 2605 2606 2607 // if(verbose){ 2608 // System.err.println("Reverse fill:"); 2609 // } 2610 2611 //First fill in reverse 2612 for(int i=0, keynum=0; i<numHits; i++){ 2613 final int value=values[i]; 2614 // if(verbose){System.err.println("values["+i+"]="+value);} 2615 2616 if(value>=minVal && value<=maxVal){ 2617 final int refbase=numberToSite(value); 2618 // if(verbose){System.err.println("refbase="+refbase);} 2619 2620 // Exception in thread "Thread-23" java.lang.AssertionError: 71543, 536356470, 536956470 2621 // at align2.BBIndex.extendScore(BBIndex.java:2620) 2622 // at align2.BBIndex.slowWalk3(BBIndex.java:1393) 2623 // at align2.BBIndex.find(BBIndex.java:777) 2624 // at align2.BBIndex.find(BBIndex.java:623) 2625 // at align2.BBIndex.findAdvanced(BBIndex.java:400) 2626 // at align2.AbstractMapThread.quickMap(AbstractMapThread.java:761) 2627 // at align2.BBMapThread.processRead(BBMapThread.java:408) 2628 // at align2.AbstractMapThread.run(AbstractMapThread.java:519) 2629 2630 //TODO - figure out why this assertion fires 2631 // assert(refbase>=minLoc && refbase<=maxLoc) : refbase+", "+minLoc+", "+maxLoc; //Apparently not a correct assumption 2632 2633 // System.err.println("Reverse: Trying key "+refbase+" @ "+offsets[i]); 2634 // System.err.println("Passed!"); 2635 keynum++; 2636 final int callbase=offsets[i]; 2637 2638 int misses=0; 2639 for(int cloc=callbase+KEYLEN-1, rloc=refbase+cloc; cloc>=0 && rloc>=0 && rloc<ref.length; cloc--, rloc--){ 2640 int old=locArray[cloc]; 2641 if(old==refbase){ 2642 // if(verbose){System.err.println("Broke because old="+old+", refbase="+refbase);} 2643 break; 2644 } //Already filled with present value 2645 if(misses>0 && old>=0){ 2646 // if(verbose){System.err.println("Broke because old="+old+", misses="+misses);} 2647 break; 2648 } //Already filled with something that has no errors 2649 byte c=bases[cloc]; 2650 byte r=ref[rloc]; 2651 2652 if(c==r){ 2653 if(old<0 || refbase==centerLoc){ //If the cell is empty or this key corresponds to center 2654 locArray[cloc]=refbase; 2655 } 2656 }else{ 2657 misses++; 2658 //Only extends first key all the way back. Others stop at the first error. 2659 if(old>=0 || keynum>1){ 2660 // if(verbose){System.err.println("Broke because old="+old+", keynum="+keynum);} 2661 break; 2662 } 2663 } 2664 } 2665 } 2666 } 2667 2668 // if(verbose){ 2669 // System.err.println("locArray:\t"+Arrays.toString(locArray)); 2670 // System.err.println("Forward fill:"); 2671 // } 2672 2673 //Then fill forward 2674 for(int i=0; i<numHits; i++){ 2675 final int value=values[i]; 2676 // if(verbose){System.err.println("values["+i+"]="+value);} 2677 2678 if(value>=minVal && value<=maxVal){ 2679 final int refbase=numberToSite(value); 2680 // if(verbose){System.err.println("refbase="+refbase);} 2681 //TODO - figure out why this assertion fires 2682 // assert(refbase>=minLoc && refbase<=maxLoc) : refbase+", "+minLoc+", "+maxLoc; //Apparently not a correct assumption 2683 final int callbase=offsets[i]; 2684 2685 int misses=0; 2686 for(int cloc=callbase+KEYLEN, rloc=refbase+cloc; cloc<bases.length && rloc<ref.length; cloc++, rloc++){ 2687 int old=locArray[cloc]; 2688 if(old==refbase){break;} //Already filled with present value 2689 if(misses>0 && old>=0){break;} //Already filled with something that has no errors 2690 byte c=bases[cloc]; 2691 byte r=ref[rloc]; 2692 2693 if(c==r){ 2694 if(old<0 || refbase==centerLoc){ //If the cell is empty or this key corresponds to center 2695 locArray[cloc]=refbase; 2696 } 2697 }else{ 2698 misses++; 2699 if(old>=0){break;} //Already filled with something that has no errors 2700 } 2701 } 2702 } 2703 } 2704 2705 //Try to subsume out-of-order locs where higher numbers come before lower numbers. Made things worse. 2706 // for(int i=1; i<locArray.length; i++){ 2707 // final int loc=locArray[i]; 2708 // final int last=locArray[i-1]; 2709 // if(loc<last && last>-1){ 2710 // final byte c=bases[i]; 2711 //// final int rloc1=loc+i; 2712 // final int rloc2=last+i; 2713 // byte r2=ref[rloc2]; 2714 // if(r2==c){ 2715 // locArray[i]=last; 2716 // } 2717 // } 2718 // } 2719 2720 // //Change 'N' to -2. A bit slow. 2721 // { 2722 // int firstMatch=0; 2723 // while(firstMatch<locArray.length && locArray[firstMatch]<0){firstMatch++;} 2724 // assert(firstMatch<locArray.length) : new String(bases); 2725 // int last=locArray[firstMatch]; 2726 // for(int i=firstMatch-1; i>=0; i--){ 2727 // final byte c=bases[i]; 2728 // if(c=='N'){locArray[i]=-2;} 2729 // else{ 2730 // assert(locArray[i]==-1); 2731 // final int rloc=last+i; 2732 // byte r=ref[rloc]; 2733 // if(r=='N'){locArray[i]=-2;} 2734 // } 2735 // } 2736 // for(int i=firstMatch; i<locArray.length; i++){ 2737 // final int loc=locArray[i]; 2738 // if(last<1){last=loc;} 2739 // final byte c=bases[i]; 2740 // if(c=='N'){locArray[i]=-2;} 2741 // else if(loc==-1 && last>0){ 2742 // final int rloc=last+i; 2743 // byte r=ref[rloc]; 2744 // if(r=='N'){locArray[i]=-2;} 2745 // } 2746 // } 2747 // } 2748 2749 //Change 'N' to -2, but only for nocalls, not norefs. Much faster. 2750 { 2751 final byte nb=(byte)'N'; 2752 for(int i=0; i<bases.length; i++){ 2753 if(bases[i]==nb){locArray[i]=-2;} 2754 } 2755 } 2756 2757 // 2758 // { 2759 // int last=locArray[0]; 2760 // for(int i=1; i<locArray.length; i++){ 2761 // final int loc=locArray[i]; 2762 // if(loc>0){ 2763 // if(last<1){last=loc;} 2764 // }else{ 2765 // if(last>0){ 2766 // final byte c=bases[i]; 2767 // final int rloc2=last+i; 2768 // byte r2=ref[rloc2]; 2769 // if(c=='N'){ 2770 // 2771 // }else if(c==r2){ 2772 // locArray[i]=last; 2773 // } 2774 // } 2775 // last=-1; 2776 // } 2777 // } 2778 // } 2779 // { 2780 // int last=locArray[locArray.length-1]; 2781 // for(int i=locArray.length-2; i>=0; i--){ 2782 // final int loc=locArray[i]; 2783 // if(loc>0){ 2784 // if(last<1){last=loc;} 2785 // }else{ 2786 // if(last>0){ 2787 // final byte c=bases[i]; 2788 // final int rloc2=last+i; 2789 // byte r2=ref[rloc2]; 2790 // if(c=='N'){ 2791 // 2792 // }else if(c==r2){ 2793 // locArray[i]=last; 2794 // } 2795 // } 2796 // last=-1; 2797 // } 2798 // } 2799 // } 2800 2801 // for(int i=locArray.length-2; i>=0; i--){ 2802 // final int loc=locArray[i]; 2803 // final int last=locArray[i+1]; 2804 // if(loc>last && last>-1){ 2805 // final byte c=bases[i]; 2806 //// final int rloc1=loc+i; 2807 // final int rloc2=last+i; 2808 // byte r2=ref[rloc2]; 2809 // if(r2==c){ 2810 // locArray[i]=last; 2811 // } 2812 // } 2813 // } 2814 2815 if(verbose){ 2816 // System.err.println("locArray:\t"+Arrays.toString(locArray)); 2817 2818 int centerOffset=offsets[centerIndex]; 2819 int lim=centerOffset+KEYLEN; 2820 for(int i=centerOffset; i<lim; i++){ 2821 assert(locArray[i]!=-1) : " ( "+centerOffset+" < "+i+" < "+lim+" ) "+ 2822 "\nlocArray: "+Arrays.toString(locArray)+ 2823 "\nvalues: "+Arrays.toString(values)+ 2824 "\noffsets: "+Arrays.toString(offsets); 2825 } 2826 } 2827 2828 if(USE_AFFINE_SCORE){ 2829 /* TODO - sometimes returns a higher score than actual alignment. This should never happen. */ 2830 int score=(KFILTER<2 ? msa.calcAffineScore(locArray, baseScores, bases) : 2831 msa.calcAffineScore(locArray, baseScores, bases, KFILTER)); 2832 return score; 2833 } 2834 2835 int score=0; 2836 int lastLoc=-1; 2837 int centerBonus=BASE_HIT_SCORE/5; 2838 for(int i=0; i<locArray.length; i++){ 2839 int loc=locArray[i]; 2840 if(loc>=0){ 2841 score+=BASE_HIT_SCORE+baseScores[i]; 2842 if(loc==centerLoc){score+=centerBonus;} 2843 if(loc!=lastLoc && lastLoc>=0){ 2844 int dif=absdif(loc, lastLoc); 2845 int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*dif, MAX_PENALTY_FOR_MISALIGNED_HIT); 2846 score-=penalty; 2847 } 2848 lastLoc=loc; 2849 } 2850 } 2851 2852 // System.err.println("Extended score: "+score); 2853 // System.err.println(Arrays.toString(locArray)); 2854 2855 2856 return score; 2857 } 2858 2859 2860 /** NOTE! This destroys the locArray, so use a copy if needed. */ makeGapArray(int[] locArray, int minLoc, int minGap)2861 private static final int[] makeGapArray(int[] locArray, int minLoc, int minGap){ 2862 int gaps=0; 2863 boolean doSort=false; 2864 2865 if(locArray[0]<0){locArray[0]=minLoc;} 2866 for(int i=1; i<locArray.length; i++){ 2867 if(locArray[i]<0){locArray[i]=locArray[i-1]+1;} 2868 else{locArray[i]+=i;} 2869 if(locArray[i]<locArray[i-1]){doSort=true;} 2870 } 2871 2872 // System.err.println(Arrays.toString(locArray)+"\n"); 2873 2874 if(doSort){ 2875 // System.err.println("*"); 2876 Arrays.sort(locArray); 2877 } 2878 // System.err.println(Arrays.toString(locArray)+"\n"); 2879 2880 for(int i=1; i<locArray.length; i++){ 2881 int dif=locArray[i]-locArray[i-1]; 2882 assert(dif>=0); 2883 if(dif>minGap){ 2884 gaps++; 2885 } 2886 } 2887 if(gaps<1){return null;} 2888 int[] out=new int[2+gaps*2]; 2889 out[0]=locArray[0]; 2890 out[out.length-1]=locArray[locArray.length-1]; 2891 2892 for(int i=1, j=1; i<locArray.length; i++){ 2893 int dif=locArray[i]-locArray[i-1]; 2894 assert(dif>=0); 2895 if(dif>minGap){ 2896 out[j]=locArray[i-1]; 2897 out[j+1]=locArray[i]; 2898 j+=2; 2899 } 2900 } 2901 return out; 2902 } 2903 2904 2905 /** Generates a term that increases score with how many bases in the read match the ref. */ scoreZ2(int[] locs, int centerIndex, int offsets[], int numApproxHits, int numHits)2906 private final int scoreZ2(int[] locs, int centerIndex, int offsets[], int numApproxHits, int numHits){ 2907 2908 if(numApproxHits==1){return SCOREZ_1KEY;} 2909 2910 final int center=locs[centerIndex]; 2911 2912 final int maxLoc=center+MAX_INDEL2; 2913 final int minLoc=Tools.max(0, center-MAX_INDEL); 2914 2915 int score=0; 2916 2917 int a0=-1, b0=-1; 2918 2919 for(int i=0; i<numHits; i++){ 2920 int loc=locs[i]; 2921 // int dif=absdif(loc, center); 2922 if(loc>=minLoc && loc<=maxLoc){ 2923 // assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+ 2924 // "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets); 2925 int a=offsets[i]; 2926 2927 if(b0<a){ 2928 score+=b0-a0; 2929 a0=a; 2930 } 2931 b0=a+KEYLEN; 2932 } 2933 } 2934 score+=b0-a0; 2935 score=score*Z_SCORE_MULT; 2936 // assert(score==scoreZslow(locs, centerIndex, offsets, false)) : scoreZslow(locs, centerIndex, offsets, true)+" != "+score; 2937 return score; 2938 } 2939 2940 @Deprecated 2941 /** This was just to verify scoreZ2. */ scoreZslow(int[] locs, int centerIndex, int offsets[], boolean display, int numHits)2942 private final int scoreZslow(int[] locs, int centerIndex, int offsets[], boolean display, int numHits){ 2943 final int center=locs[centerIndex]; 2944 2945 final int maxLoc=center+MAX_INDEL2; 2946 final int minLoc=Tools.max(0, center-MAX_INDEL); 2947 2948 byte[] array=new byte[offsets[offsets.length-1]+KEYLEN]; 2949 int score=0; 2950 2951 for(int i=0; i<numHits; i++){ 2952 int loc=locs[i]; 2953 // int dif=absdif(loc, center); 2954 if(loc>=minLoc && loc<=maxLoc){ 2955 int pos=offsets[i]; 2956 // if(true){ 2957 // System.err.println("\ni="+i+", pos="+pos+", array=["+array.length+"], limit="+(pos+CHUNKSIZE-1)); 2958 // } 2959 for(int j=pos; j<pos+KEYLEN; j++){ 2960 if(array[j]==0){score++;} 2961 array[j]=1; 2962 } 2963 } 2964 } 2965 2966 if(display){System.err.println("\n"+Arrays.toString(array)+"\n");} 2967 2968 return score*Z_SCORE_MULT; 2969 } 2970 2971 /** Generates a term that increases score with how many bases in the read match the ref. */ maxScoreZ(int offsets[])2972 private final int maxScoreZ(int offsets[]){ 2973 int score=0; 2974 int a0=-1, b0=-1; 2975 2976 for(int i=0; i<offsets.length; i++){ 2977 int a=offsets[i]; 2978 2979 if(b0<a){ 2980 score+=b0-a0; 2981 a0=a; 2982 } 2983 b0=a+KEYLEN; 2984 2985 } 2986 score+=b0-a0; 2987 return score*Z_SCORE_MULT; 2988 } 2989 2990 scoreRight(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels, int numHits)2991 private final int scoreRight(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels, int numHits){ 2992 2993 int score=0; 2994 2995 int prev, loc=locs[centerIndex]; 2996 2997 for(int i=centerIndex+1; i<numHits; i++){ 2998 2999 if(locs[i]>=0){ 3000 prev=loc; 3001 loc=locs[i]; 3002 3003 int offset=absdif(loc, prev); 3004 3005 if(offset<=MAX_INDEL){ 3006 score+=keyScores[i]; 3007 if(ADD_LIST_SIZE_BONUS){score+=calcListSizeBonus(sizes[i]);} 3008 3009 // if(i==locs.length-1){score+=HIT_SCORE/2;} //Adds a bonus for matching the first or last key 3010 3011 if(penalizeIndels && offset!=0){ 3012 int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*offset, MAX_PENALTY_FOR_MISALIGNED_HIT); 3013 score-=penalty; 3014 // score-=(INDEL_PENALTY+Tools.min(INDEL_PENALTY_MULT*offset, 1+HIT_SCORE/4)); 3015 } 3016 }else{ 3017 loc=prev; 3018 } 3019 } 3020 3021 } 3022 return score; 3023 3024 } 3025 scoreLeft(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels)3026 private final int scoreLeft(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels){ 3027 3028 callsToScore++; 3029 3030 int score=0; 3031 3032 int prev, loc=locs[centerIndex]; 3033 3034 for(int i=centerIndex-1; i>=0; i--){ 3035 3036 if(locs[i]>=0){ 3037 prev=loc; 3038 loc=locs[i]; 3039 3040 int offset=absdif(loc, prev); 3041 3042 if(offset<=MAX_INDEL){ 3043 score+=keyScores[i]; 3044 if(ADD_LIST_SIZE_BONUS){score+=calcListSizeBonus(sizes[i]);} 3045 3046 // if(i==0){score+=HIT_SCORE/2;} //Adds a bonus for matching the first or last key 3047 if(penalizeIndels && offset!=0){ 3048 int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*offset, MAX_PENALTY_FOR_MISALIGNED_HIT); 3049 score-=penalty; 3050 } 3051 }else{ 3052 loc=prev; 3053 } 3054 } 3055 3056 } 3057 return score; 3058 3059 } 3060 3061 /** Encode a (location, chrom) pair to an index */ toNumber(int site, int chrom)3062 private static final int toNumber(int site, int chrom){ 3063 int out=(chrom&CHROM_MASK_LOW); 3064 out=out<<SHIFT_LENGTH; 3065 out=(out|site); 3066 return out; 3067 } 3068 3069 /** Decode an (index, baseChrom) pair to a chromosome */ numberToChrom(int number, int baseChrom)3070 private static final int numberToChrom(int number, int baseChrom){ 3071 assert((baseChrom&CHROM_MASK_LOW)==0) : Integer.toHexString(number)+", baseChrom="+baseChrom; 3072 assert(baseChrom>=0) : Integer.toHexString(number)+", baseChrom="+baseChrom; 3073 int out=(number>>>SHIFT_LENGTH); 3074 out=out+(baseChrom&CHROM_MASK_HIGH); 3075 return out; 3076 } 3077 3078 /** Decode an index to a location */ numberToSite(int number)3079 private static final int numberToSite(int number){ 3080 return (number&SITE_MASK); 3081 } 3082 minChrom(int chrom)3083 private static final int minChrom(int chrom){return Tools.max(MINCHROM, chrom&CHROM_MASK_HIGH);} baseChrom(int chrom)3084 private static final int baseChrom(int chrom){return Tools.max(0, chrom&CHROM_MASK_HIGH);} maxChrom(int chrom)3085 private static final int maxChrom(int chrom){return Tools.max(MINCHROM, Tools.min(MAXCHROM, chrom|CHROM_MASK_LOW));} 3086 3087 getOffsetArray(int len)3088 private final int[] getOffsetArray(int len){ 3089 if(offsetArrays[len]==null){offsetArrays[len]=new int[len];} 3090 return offsetArrays[len]; 3091 } getLocArray(int len)3092 private final int[] getLocArray(int len){ 3093 if(len>=locArrays.length){return new int[len];} 3094 if(locArrays[len]==null){locArrays[len]=new int[len];} 3095 return locArrays[len]; 3096 } getGreedyListArray(int len)3097 private final int[] getGreedyListArray(int len){ 3098 if(greedyListArrays[len]==null){greedyListArrays[len]=new int[len];} 3099 return greedyListArrays[len]; 3100 } getGenericArray(int len)3101 private final int[] getGenericArray(int len){ 3102 if(genericArrays[len]==null){genericArrays[len]=new int[len];} 3103 return genericArrays[len]; 3104 } 3105 3106 @Override getBaseScoreArray(int len, int strand)3107 final byte[] getBaseScoreArray(int len, int strand){ 3108 if(len>=baseScoreArrays[0].length){return new byte[len];} 3109 if(baseScoreArrays[strand][len]==null){baseScoreArrays[strand][len]=new byte[len];} 3110 return baseScoreArrays[strand][len]; 3111 } 3112 @Override getKeyScoreArray(int len, int strand)3113 final int[] getKeyScoreArray(int len, int strand){ 3114 if(len>=keyScoreArrays.length){return new int[len];} 3115 if(keyScoreArrays[strand][len]==null){keyScoreArrays[strand][len]=new int[len];} 3116 return keyScoreArrays[strand][len]; 3117 } getKeyWeightArray(int len)3118 private final float[] getKeyWeightArray(int len){ 3119 if(len>=keyWeightArrays.length){return new float[len];} 3120 if(keyWeightArrays[len]==null){keyWeightArrays[len]=new float[len];} 3121 return keyWeightArrays[len]; 3122 } 3123 @Override keyProbArray()3124 float[] keyProbArray() { 3125 return keyProbArray; 3126 } 3127 3128 private static final int KEY_BUFFER_LENGTH=256; //Boosted from 128 to 256 to deal with crashes from short kmers, long reads, and vslow. 3129 3130 private final int[][] locArrays=new int[601][]; 3131 private final int[] valueArray=new int[KEY_BUFFER_LENGTH]; 3132 private final int[] sizeArray=new int[KEY_BUFFER_LENGTH]; 3133 private final int[][] offsetArrays=new int[KEY_BUFFER_LENGTH][]; 3134 private final int[][] greedyListArrays=new int[KEY_BUFFER_LENGTH][]; 3135 private final int[][] genericArrays=new int[KEY_BUFFER_LENGTH][]; 3136 private final int[] startArray=new int[KEY_BUFFER_LENGTH]; 3137 private final int[] stopArray=new int[KEY_BUFFER_LENGTH]; 3138 private final Quad[] tripleStorage=makeQuadStorage(KEY_BUFFER_LENGTH); 3139 private final int[] greedyReturn=KillSwitch.allocInt1D(2); 3140 private final int[][] shrinkReturn2=new int[3][]; 3141 private final int[][] shrinkReturn3=new int[5][]; 3142 private final int[][] prescanReturn=new int[2][]; 3143 private final int[] prescoreArray; 3144 private final int[] precountArray; 3145 3146 private final byte[][][] baseScoreArrays=new byte[2][601][]; 3147 private final int[][][] keyScoreArrays=new int[2][KEY_BUFFER_LENGTH][]; 3148 final float[] keyProbArray=new float[601]; 3149 private final float[][] keyWeightArrays=new float[KEY_BUFFER_LENGTH][]; 3150 3151 makeQuadStorage(int number)3152 private final static Quad[] makeQuadStorage(int number){ 3153 Quad[] r=new Quad[number]; 3154 for(int i=0; i<number; i++){r[i]=new Quad(i, 0, 0);} 3155 return r; 3156 } 3157 3158 3159 private final QuadHeap heap=new QuadHeap(KEY_BUFFER_LENGTH-1); 3160 private final QuadHeap active=new QuadHeap(KEY_BUFFER_LENGTH-1); 3161 3162 static int SHIFT_LENGTH=(32-1-NUM_CHROM_BITS); 3163 static int MAX_ALLOWED_CHROM_INDEX=~((-1)<<SHIFT_LENGTH); 3164 3165 /** Mask the number to get the site, which is in the lower bits */ 3166 static int SITE_MASK=((-1)>>>(NUM_CHROM_BITS+1)); 3167 3168 /** Mask the chromosome's high bits to get the low bits */ 3169 static int CHROM_MASK_LOW=CHROMS_PER_BLOCK-1; 3170 3171 /** Mask the chromosome's lower bits to get the high bits */ 3172 static int CHROM_MASK_HIGH=~CHROM_MASK_LOW; 3173 setChromBits(int x)3174 static void setChromBits(int x){ 3175 3176 NUM_CHROM_BITS=x; 3177 CHROMS_PER_BLOCK=(1<<(NUM_CHROM_BITS)); 3178 SHIFT_LENGTH=(32-1-NUM_CHROM_BITS); 3179 MAX_ALLOWED_CHROM_INDEX=~((-1)<<SHIFT_LENGTH); 3180 SITE_MASK=((-1)>>>(NUM_CHROM_BITS+1)); 3181 CHROM_MASK_LOW=CHROMS_PER_BLOCK-1; 3182 CHROM_MASK_HIGH=~CHROM_MASK_LOW; 3183 3184 // assert(NUM_CHROM_BITS<30); 3185 assert(NUM_CHROM_BITS>=0); //max is 3 for human; perhaps more for other organisms 3186 // assert((1<<(NUM_CHROM_BITS))>=CHROMSPERBLOCK) : (1<<(NUM_CHROM_BITS))+" < "+CHROMSPERBLOCK; 3187 assert((1<<(NUM_CHROM_BITS))==CHROMS_PER_BLOCK) : (1<<(NUM_CHROM_BITS))+" < "+CHROMS_PER_BLOCK; 3188 assert(Integer.bitCount(CHROMS_PER_BLOCK)==1); 3189 assert(Integer.numberOfLeadingZeros(SITE_MASK)==(NUM_CHROM_BITS+1)) : Integer.toHexString(SITE_MASK); 3190 } 3191 3192 private final int cycles; 3193 3194 public static final int BASE_HIT_SCORE=100; 3195 public static final int ALIGN_COLUMNS=3000; 3196 public static int MAX_INDEL=16000; //Max indel length, min 0, default 400; longer is more accurate 3197 public static int MAX_INDEL2=2*MAX_INDEL; 3198 3199 private final float INV_BASE_KEY_HIT_SCORE; 3200 private final int INDEL_PENALTY; //default (HIT_SCORE/2)-1 3201 private final int INDEL_PENALTY_MULT; //default 20; penalty for indel length 3202 private final int MAX_PENALTY_FOR_MISALIGNED_HIT; 3203 private final int SCOREZ_1KEY; 3204 3205 public static final boolean ADD_SCORE_Z=true; //Increases quality, decreases speed 3206 public static final int Z_SCORE_MULT=20; 3207 public static final int Y_SCORE_MULT=10; 3208 3209 3210 /** 3211 * Return only sites that match completely or with partial no-reference 3212 */ setSemiperfectMode()3213 public static void setSemiperfectMode() { 3214 assert(!PERFECTMODE); 3215 SEMIPERFECTMODE=true; 3216 PRESCAN_QSCORE=false; 3217 // MIN_APPROX_HITS_TO_KEEP++; 3218 3219 3220 3221 MAX_INDEL=0; 3222 MAX_INDEL2=0; 3223 } 3224 3225 /** 3226 * Return only sites that match completely 3227 */ setPerfectMode()3228 public static void setPerfectMode() { 3229 assert(!SEMIPERFECTMODE); 3230 PERFECTMODE=true; 3231 PRESCAN_QSCORE=false; 3232 // MIN_APPROX_HITS_TO_KEEP++; 3233 3234 3235 3236 MAX_INDEL=0; 3237 MAX_INDEL2=0; 3238 } 3239 3240 static float FRACTION_GENOME_TO_EXCLUDE=0.03f; //Default .03; lower is slower and more accurate. For perfect reads and small genomes, lower is FASTER. 3241 setFractionToExclude(float f)3242 public static final void setFractionToExclude(float f){ 3243 assert(f>=0 && f<1); 3244 FRACTION_GENOME_TO_EXCLUDE=f; 3245 MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //default 810 3246 MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster, default 840 3247 MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //default 910 3248 MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //default 935 3249 MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //Default 860 3250 } 3251 3252 3253 /** Default .75. Range: 0 to 1 (but 0 will break everything). Lower is faster and less accurate. */ 3254 static final float HIT_FRACTION_TO_RETAIN=0.85f; //default: .8 3255 /** Range: 0 to 1000. Lower should be faster and less accurate. */ 3256 static int MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //default 810 3257 /** Range: 2 to infinity. Lower should be faster and less accurate. */ 3258 static final int MIN_HIT_LISTS_TO_RETAIN=6; 3259 3260 static int MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster, default 840 3261 //lower is faster 3262 static int MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //default 910 3263 //lower is faster 3264 static int MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //default 935 3265 //lower is faster 3266 static int MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //Default 860 3267 3268 /** To increase accuracy on small genomes, override greedy list dismissal when the list is at most this long. */ 3269 public static final int SMALL_GENOME_LIST=20; 3270 3271 static{assert(!(TRIM_BY_GREEDY && TRIM_BY_TOTAL_SITE_COUNT)) : "Pick one.";} 3272 3273 static final int CLUMPY_MAX_DIST=5; //Keys repeating over intervals of this or less are clumpy. 3274 3275 /** Minimum length of list before clumpiness is considered. This is an index in the length histogram, from 0 to 1000. */ 3276 static final int CLUMPY_MIN_LENGTH_INDEX=2000; 3277 static final float CLUMPY_FRACTION=0.75f; //0 to 1; higher is slower but more stringent. 0.5 means the median distance is clumpy. 3278 3279 static final int MAX_SUBSUMPTION_LENGTH=MAX_INDEL2; 3280 3281 /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION1 when slowWalk3 is first entered */ 3282 public static final int MAX_HITS_REDUCTION1=0; 3283 3284 /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION2 dynamically when best score is exceeded */ 3285 public static int MAX_HITS_REDUCTION2=2; //default 1; higher is more accurate (more mapping and less FP) but slower 3286 3287 /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION_PERFECT when perfect score is found */ 3288 public static final int MAX_HITS_REDUCTION_PERFECT=0; 3289 3290 public static int MAXIMUM_MAX_HITS_REDUCTION=3; 3291 public static int HIT_REDUCTION_DIV=5; 3292 calcApproxHitsCutoff(final int keys, final int hits, int currentCutoff, final boolean perfect)3293 private static final int calcApproxHitsCutoff(final int keys, final int hits, int currentCutoff, final boolean perfect){ //***$ 3294 assert(keys>=hits) : keys+", "+hits; 3295 assert(hits>=0); 3296 3297 int mahtk=MIN_APPROX_HITS_TO_KEEP; 3298 if(SEMIPERFECTMODE || PERFECTMODE){ 3299 if(keys==1){return 1;} 3300 else if(MIN_APPROX_HITS_TO_KEEP<keys){ 3301 mahtk++; 3302 if(currentCutoff==MIN_APPROX_HITS_TO_KEEP){currentCutoff++;} 3303 } 3304 } 3305 3306 int reduction=Tools.min(Tools.max((hits)/HIT_REDUCTION_DIV, MAX_HITS_REDUCTION2), Tools.max(MAXIMUM_MAX_HITS_REDUCTION, keys/8)); 3307 // if(hits<3){reduction=0;} 3308 // else if(hits<4){reduction=Tools.min(1, reduction);} 3309 // else if(hits<5){reduction=Tools.min(2, reduction);} 3310 // else if(hits<6){reduction=Tools.min(3, reduction);} 3311 assert(reduction>=0); 3312 int r=hits-reduction; 3313 3314 r=Tools.max(mahtk, currentCutoff, r); 3315 3316 if(perfect){ 3317 r=Tools.max(r, keys-MAX_HITS_REDUCTION_PERFECT); 3318 } 3319 return r; 3320 } 3321 3322 public static final boolean USE_SLOWALK3=true && USE_EXTENDED_SCORE; 3323 public static boolean PRESCAN_QSCORE=true && USE_EXTENDED_SCORE; //Decrease quality and increase speed 3324 public static final boolean FILTER_BY_QSCORE=true; //Slightly lower quality, but very fast. 3325 public static final float MIN_SCORE_MULT=(USE_AFFINE_SCORE ? 0.15f : USE_EXTENDED_SCORE ? .3f : 0.10f); //Fraction of max score to use as cutoff. Default 0.15, max is 1; lower is more accurate 3326 public static final float MIN_QSCORE_MULT=0.025f; //Fraction of max score to use as cutoff. Default 0.15, max is 1; lower is more accurate 3327 public static final float MIN_QSCORE_MULT2=0.1f; 3328 static final float DYNAMIC_SCORE_THRESH=(USE_AFFINE_SCORE ? 0.84f : USE_EXTENDED_SCORE ? .84f : 0.6f); //Default .85f; lower is more accurate 3329 static final float DYNAMIC_QSCORE_THRESH=0.6f; //default .58f 3330 static final float DYNAMIC_QSCORE_THRESH_PERFECT=0.8f; //***$ 3331 static final float PRESCAN_QSCORE_THRESH=DYNAMIC_QSCORE_THRESH*.95f; //default 1.0f; lower is more accurate and 0 essentially sets PRESCAN_QSCORE=false 3332 static{ 3333 assert(MIN_SCORE_MULT>=0 && MIN_SCORE_MULT<1); 3334 assert(DYNAMIC_SCORE_THRESH>=0 && DYNAMIC_SCORE_THRESH<1); 3335 } 3336 3337 3338 } 3339