1 package aligner; 2 3 import java.util.Arrays; 4 5 import dna.AminoAcid; 6 import dna.ChromosomeArray; 7 import dna.Data; 8 import shared.Tools; 9 import stream.Read; 10 import stream.SiteScore; 11 12 /** 13 * Based on MSA9ts, with transform scores tweaked for PacBio. */ 14 public final class MultiStateAligner9PacBioAdapter3 { 15 16 MultiStateAligner9PacBioAdapter3(int maxRows_, int maxColumns_)17 public MultiStateAligner9PacBioAdapter3(int maxRows_, int maxColumns_){ 18 // assert(maxColumns_>=200); 19 // assert(maxRows_>=200); 20 maxRows=maxRows_; 21 maxColumns=maxColumns_; 22 packed=new int[3][maxRows+1][]; 23 24 for(int i=0; i<3; i++){ 25 packed[i][0]=new int[maxColumns+1]; 26 packed[i][1]=new int[maxColumns+1]; 27 packed[i][2]=new int[maxColumns+1]; 28 for(int j=3; j<maxRows+1; j++){ 29 packed[i][j]=packed[i][j-2]; 30 } 31 } 32 33 insScoreArray=new int[maxRows+1]; 34 delScoreArray=new int[maxColumns+1]; 35 matchScoreArray=new int[maxRows+1]; 36 subScoreArray=new int[maxRows+1]; 37 for(int i=0; i<insScoreArray.length; i++){ 38 insScoreArray[i]=(i==0 ? POINTSoff_INS : 39 i<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : 40 i<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4); 41 } 42 for(int i=0; i<delScoreArray.length; i++){ 43 delScoreArray[i]=(i==0 ? POINTSoff_DEL : 44 i<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 : 45 i<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 : POINTSoff_DEL4); 46 } 47 for(int i=0; i<matchScoreArray.length; i++){ 48 matchScoreArray[i]=(i==0 ? POINTSoff_MATCH2 : POINTSoff_MATCH); 49 } 50 for(int i=0; i<subScoreArray.length; i++){ 51 subScoreArray[i]=(i==0 ? POINTSoff_SUB : i<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3); 52 } 53 54 vertLimit=new int[maxRows+1]; 55 horizLimit=new int[maxColumns+1]; 56 Arrays.fill(vertLimit, BADoff); 57 Arrays.fill(horizLimit, BADoff); 58 59 for(int matrix=0; matrix<packed.length; matrix++){ 60 for(int i=1; i<=3; i++){ 61 for(int j=0; j<packed[matrix][i].length; j++){ 62 packed[matrix][i][j]|=BADoff; 63 } 64 } 65 66 //Initializes the rows; not needed. 67 for(int i=1; i<=2; i++){ 68 69 int score=calcInsScoreOffset(i); 70 packed[matrix][i][0]=score; 71 } 72 } 73 } 74 75 76 /** return new int[] {rows, maxC, maxS, max}; 77 * Will not fill areas that cannot match minScore */ fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore)78 public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){ 79 return fillLimitedX(read, ref, refStartLoc, refEndLoc, minScore); 80 } 81 82 83 /** return new int[] {rows, maxC, maxS, max}; 84 * Will not fill areas that cannot match minScore */ fillLimitedX(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore)85 private final int[] fillLimitedX(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){ 86 // minScore=0; 87 // assert(minScore>0); 88 rows=read.length; 89 columns=refEndLoc-refStartLoc+1; 90 91 if(/*read.length<40 || */false || minScore<=0 || columns>read.length+Tools.min(100, read.length)){ 92 // assert(false) : minScore; 93 return fillUnlimited(read, ref, refStartLoc, refEndLoc); 94 } 95 96 minScore-=100; //Increases quality trivially 97 98 assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+ 99 refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; 100 assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+ 101 refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; 102 103 assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+ 104 refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; 105 assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length+"\n"+ 106 refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; 107 108 // for(int x=0; x<packed.length; x++){ 109 // for(int y=1; y<rows+1; y++){ 110 // Arrays.fill(packed[x][y], 1, columns+1, BADoff); 111 // } 112 // } 113 for(int x=0; x<packed.length; x++){ 114 115 // Arrays.fill(packed[x][1], 1, columns+1, BADoff); 116 Arrays.fill(packed[x][rows], 1, columns+1, BADoff); 117 // for(int y=1; y<rows+1; y++){ 118 // Arrays.fill(packed[x][y], 1, columns+1, BADoff); 119 // } 120 } 121 122 int minGoodCol=1; 123 int maxGoodCol=columns; 124 125 final int minScore_off=(minScore<<SCOREOFFSET); 126 final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; 127 final int floor=minScore_off-maxGain; 128 // final int subfloor=Tools.max(BADoff, floor-200*POINTSoff_MATCH2); 129 final int subfloor=floor-5*POINTSoff_MATCH2; 130 assert(subfloor>BADoff); //TODO: Actually, it needs to be substantially more. 131 assert(subfloor<minScore_off) : minScore_off+", "+floor+", "+BADoff+", "+subfloor; 132 133 if(verbose2){ 134 System.out.println(); 135 System.out.println("minScore="+minScore+"\t"+minScore_off); 136 System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain)); 137 System.out.println("floor="+(floor>>SCOREOFFSET)+"\t"+(floor)); 138 System.out.println("subfloor="+(subfloor>>SCOREOFFSET)+"\t"+(subfloor)); 139 System.out.println("BADoff="+(BADoff>>SCOREOFFSET)+"\t"+(BADoff)); 140 System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain)); 141 System.out.println(); 142 } 143 144 vertLimit[rows]=minScore_off; 145 for(int i=rows-1; i>=0; i--){ 146 vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor); 147 } 148 149 horizLimit[columns]=minScore_off; 150 for(int i=columns-1; i>=0; i--){ 151 horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor); 152 } 153 154 for(int row=1; row<=rows; row++){ 155 156 { 157 int score=calcInsScoreOffset(row); 158 packed[0][row][0]=score; 159 packed[1][row][0]=score; 160 packed[2][row][0]=score; 161 } 162 163 int colStart=minGoodCol; 164 int colStop=maxGoodCol; 165 166 minGoodCol=-1; 167 maxGoodCol=-2; 168 169 final int vlimit=vertLimit[row]; 170 171 if(verbose2){ 172 System.out.println(); 173 System.out.println("row="+row); 174 System.out.println("colStart="+colStart); 175 System.out.println("colStop="+colStop); 176 System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit)); 177 } 178 179 if(colStart<0 || colStop<colStart){break;} 180 181 182 if(colStart>1){ 183 assert(row>0); 184 packed[MODE_MS][row][colStart-1]=subfloor; 185 packed[MODE_INS][row][colStart-1]=subfloor; 186 packed[MODE_DEL][row][colStart-1]=subfloor; 187 } 188 189 190 for(int col=colStart; col<=columns; col++){ 191 192 193 if(verbose2){ 194 System.out.println("\ncol "+col); 195 } 196 197 final byte call0=(row<2 ? (byte)'?' : read[row-2]); 198 final byte call1=read[row-1]; 199 final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]); 200 final byte ref1=ref[refStartLoc+col-1]; 201 202 // final boolean match=(read[row-1]==ref[refStartLoc+col-1]); 203 // final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]); 204 final boolean match=(call1==ref1); 205 final boolean prevMatch=(call0==ref0); 206 207 // System.err.println("") 208 209 iterationsLimited++; 210 final int limit=Tools.max(vlimit, horizLimit[col]); 211 final int limit3=Tools.max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3)); 212 213 final int delNeeded=Tools.max(0, row-col-1); 214 final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1); 215 216 final int delPenalty=calcDelScoreOffset(delNeeded); 217 final int insPenalty=calcInsScoreOffset(insNeeded); 218 219 220 final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK; 221 final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK; 222 final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK; 223 224 final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK; 225 final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK; 226 227 final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK; 228 final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK; 229 230 // if(scoreFromDiag_MS<limit3 && scoreFromDel_MS<limit3 && scoreFromIns_MS<limit3 231 // && scoreFromDiag_DEL<limit && scoreFromDel_DEL<limit && scoreFromDiag_INS<limit && scoreFromIns_INS<limit){ 232 // iterationsLimited--; //A "fast" iteration 233 // } 234 235 if((scoreFromDiag_MS<=limit3 && scoreFromDel_MS<=limit3 && scoreFromIns_MS<=limit3)){ 236 packed[MODE_MS][row][col]=subfloor; 237 }else{//Calculate match and sub scores 238 final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK); 239 240 {//Calculate match/sub score 241 242 int score; 243 int time; 244 byte prevState; 245 246 if(match){ 247 248 int scoreMS=scoreFromDiag_MS+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH); 249 int scoreD=scoreFromDel_MS+POINTSoff_MATCH; 250 int scoreI=scoreFromIns_MS+POINTSoff_MATCH; 251 252 // byte prevState; 253 if(scoreMS>=scoreD && scoreMS>=scoreI){ 254 score=scoreMS; 255 time=(prevMatch ? streak+1 : 1); 256 prevState=MODE_MS; 257 }else if(scoreD>=scoreI){ 258 score=scoreD; 259 time=1; 260 prevState=MODE_DEL; 261 }else{ 262 score=scoreI; 263 time=1; 264 prevState=MODE_INS; 265 } 266 267 }else{ 268 269 int scoreMS; 270 if(ref1!='N' && call1!='N'){ 271 scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : subScoreArray[streak]); 272 }else{ 273 scoreMS=scoreFromDiag_MS+POINTSoff_NOCALL; 274 } 275 276 int scoreD=scoreFromDel_MS+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion 277 int scoreI=scoreFromIns_MS+POINTSoff_SUB; 278 279 if(scoreMS>=scoreD && scoreMS>=scoreI){ 280 score=scoreMS; 281 time=(prevMatch ? 1 : streak+1); 282 // time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1); 283 prevState=MODE_MS; 284 }else if(scoreD>=scoreI){ 285 score=scoreD; 286 time=1; 287 prevState=MODE_DEL; 288 }else{ 289 score=scoreI; 290 time=1; 291 prevState=MODE_INS; 292 } 293 } 294 295 final int limit2; 296 if(delNeeded>0){ 297 limit2=limit-delPenalty; 298 }else if(insNeeded>0){ 299 limit2=limit-insPenalty; 300 }else{ 301 limit2=limit; 302 } 303 assert(limit2>=limit); 304 305 if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} 306 307 if(score>=limit2){ 308 maxGoodCol=col; 309 if(minGoodCol<0){minGoodCol=col;} 310 }else{ 311 score=subfloor; 312 } 313 314 assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} 315 assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; 316 assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; 317 // packed[MODE_MS][row][col]=(score|prevState|time); 318 packed[MODE_MS][row][col]=(score|time); 319 assert((score&SCOREMASK)==score); 320 // assert((prevState&MODEMASK)==prevState); 321 assert((time&TIMEMASK)==time); 322 } 323 } 324 325 if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)){ 326 // assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row; 327 packed[MODE_DEL][row][col]=subfloor; 328 }else{//Calculate DEL score 329 330 final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; 331 332 int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL; 333 int scoreD=scoreFromDel_DEL+delScoreArray[streak]; 334 // int scoreI=scoreFromIns+POINTSoff_DEL; 335 336 337 if(ref1=='N'){ 338 scoreMS+=POINTSoff_DEL_REF_N; 339 scoreD+=POINTSoff_DEL_REF_N; 340 } 341 342 //if(match){scoreMS=subfloor;} 343 344 int score; 345 int time; 346 byte prevState; 347 if(scoreMS>=scoreD){ 348 score=scoreMS; 349 time=1; 350 prevState=MODE_MS; 351 }else{ 352 score=scoreD; 353 time=streak+1; 354 prevState=MODE_DEL; 355 } 356 357 final int limit2; 358 if(insNeeded>0){ 359 limit2=limit-insPenalty; 360 }else if(delNeeded>0){ 361 limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time); 362 }else{ 363 limit2=limit; 364 } 365 assert(limit2>=limit); 366 if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} 367 368 if(score>=limit2){ 369 maxGoodCol=col; 370 if(minGoodCol<0){minGoodCol=col;} 371 }else{ 372 score=subfloor; 373 } 374 375 assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} 376 assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; 377 assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; 378 // packed[MODE_DEL][row][col]=(score|prevState|time); 379 packed[MODE_DEL][row][col]=(score|time); 380 assert((score&SCOREMASK)==score); 381 // assert((prevState&MODEMASK)==prevState); 382 assert((time&TIMEMASK)==time); 383 } 384 385 if(scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit){ 386 packed[MODE_INS][row][col]=subfloor; 387 }else{//Calculate INS score 388 389 final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; 390 391 int scoreMS=scoreFromDiag_INS+POINTSoff_INS; 392 // int scoreD=scoreFromDel+POINTSoff_INS; 393 int scoreI=scoreFromIns_INS+insScoreArray[streak]; 394 395 396 // System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+ 397 // scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", " 398 // +scoreD+", "+scoreFromIns+"+"+ 399 // (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI); 400 401 //if(match){scoreMS=subfloor;} 402 403 int score; 404 int time; 405 byte prevState; 406 if(scoreMS>=scoreI){ 407 score=scoreMS; 408 time=1; 409 prevState=MODE_MS; 410 }else{ 411 score=scoreI; 412 time=streak+1; 413 prevState=MODE_INS; 414 } 415 416 final int limit2; 417 if(delNeeded>0){ 418 limit2=limit-delPenalty; 419 }else if(insNeeded>0){ 420 limit2=limit-calcInsScoreOffset(time+insNeeded)+calcInsScoreOffset(time); 421 }else{ 422 limit2=limit; 423 } 424 assert(limit2>=limit); 425 426 if(verbose2){System.err.println("INS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} 427 if(score>=limit2){ 428 maxGoodCol=col; 429 if(minGoodCol<0){minGoodCol=col;} 430 }else{ 431 score=subfloor; 432 } 433 434 assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} 435 assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; 436 assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; 437 // packed[MODE_INS][row][col]=(score|prevState|time); 438 packed[MODE_INS][row][col]=(score|time); 439 assert((score&SCOREMASK)==score); 440 // assert((prevState&MODEMASK)==prevState); 441 assert((time&TIMEMASK)==time); 442 } 443 444 445 if(col>=colStop){ 446 if(col>colStop && maxGoodCol<col){break;} 447 if(row>1){ 448 packed[MODE_MS][row-1][col+1]=subfloor; 449 packed[MODE_INS][row-1][col+1]=subfloor; 450 packed[MODE_DEL][row-1][col+1]=subfloor; 451 } 452 } 453 } 454 } 455 456 457 int maxCol=-1; 458 int maxState=-1; 459 int maxScore=Integer.MIN_VALUE; 460 461 for(int state=0; state<packed.length; state++){ 462 for(int col=1; col<=columns; col++){ 463 int x=packed[state][rows][col]&SCOREMASK; 464 if(x>maxScore){ 465 maxScore=x; 466 maxCol=col; 467 maxState=state; 468 } 469 } 470 } 471 472 assert(maxScore>=BADoff); 473 // if(maxScore==BADoff){ 474 // return null; 475 // } 476 // if(maxScore<floor){ 477 // return null; 478 // } 479 if(maxScore<minScore_off){ 480 return null; 481 } 482 483 maxScore>>=SCOREOFFSET; 484 485 // System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); 486 return new int[] {rows, maxCol, maxState, maxScore}; 487 } 488 489 490 /** return new int[] {rows, maxC, maxS, max}; 491 * Does not require a min score (ie, same as old method) */ fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc)492 private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){ 493 rows=read.length; 494 columns=refEndLoc-refStartLoc+1; 495 496 final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; 497 final int subfloor=0-2*maxGain; 498 assert(subfloor>BADoff && subfloor*2>BADoff); //TODO: Actually, it needs to be substantially more. 499 500 //temporary, for finding a bug 501 if(rows>maxRows || columns>maxColumns){ 502 throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n"); 503 } 504 505 assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; 506 assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; 507 508 assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; 509 assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length; 510 511 for(int row=1; row<=rows; row++){ 512 513 { 514 int score=calcInsScoreOffset(row); 515 packed[0][row][0]=score; 516 packed[1][row][0]=score; 517 packed[2][row][0]=score; 518 } 519 520 // int minc=max(1, row-20); 521 // int maxc=min(columns, row+20); 522 523 for(int col=1; col<=columns; col++){ 524 iterationsUnlimited++; 525 526 // final boolean match=(read[row-1]==ref[refStartLoc+col-1]); 527 // final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]); 528 529 final byte call0=(row<2 ? (byte)'?' : read[row-2]); 530 final byte call1=read[row-1]; 531 final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]); 532 final byte ref1=ref[refStartLoc+col-1]; 533 534 final boolean match=(call1==ref1); 535 final boolean prevMatch=(call0==ref0); 536 537 {//Calculate match and sub scores 538 539 final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; 540 final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; 541 final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; 542 final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK); 543 544 {//Calculate match/sub score 545 546 if(match){ 547 548 int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH); 549 int scoreD=scoreFromDel+POINTSoff_MATCH; 550 int scoreI=scoreFromIns+POINTSoff_MATCH; 551 552 int score; 553 int time; 554 // byte prevState; 555 if(scoreMS>=scoreD && scoreMS>=scoreI){ 556 score=scoreMS; 557 time=(prevMatch ? streak+1 : 1); 558 // prevState=MODE_MS; 559 }else if(scoreD>=scoreI){ 560 score=scoreD; 561 time=1; 562 // prevState=MODE_DEL; 563 }else{ 564 score=scoreI; 565 time=1; 566 // prevState=MODE_INS; 567 } 568 569 assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} 570 assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; 571 assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; 572 // packed[MODE_MS][row][col]=(score|prevState|time); 573 packed[MODE_MS][row][col]=(score|time); 574 assert((score&SCOREMASK)==score); 575 // assert((prevState&MODEMASK)==prevState); 576 assert((time&TIMEMASK)==time); 577 578 }else{ 579 580 int scoreMS; 581 if(ref1!='N' && call1!='N'){ 582 scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : subScoreArray[streak]); 583 }else{ 584 scoreMS=scoreFromDiag+POINTSoff_NOCALL; 585 } 586 587 int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion 588 int scoreI=scoreFromIns+POINTSoff_SUB; 589 590 int score; 591 int time; 592 byte prevState; 593 if(scoreMS>=scoreD && scoreMS>=scoreI){ 594 score=scoreMS; 595 time=(prevMatch ? 1 : streak+1); 596 // time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1); 597 prevState=MODE_MS; 598 }else if(scoreD>=scoreI){ 599 score=scoreD; 600 time=1; 601 prevState=MODE_DEL; 602 }else{ 603 score=scoreI; 604 time=1; 605 prevState=MODE_INS; 606 } 607 608 assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} 609 assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; 610 assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; 611 // packed[MODE_MS][row][col]=(score|prevState|time); 612 packed[MODE_MS][row][col]=(score|time); 613 assert((score&SCOREMASK)==score); 614 // assert((prevState&MODEMASK)==prevState); 615 assert((time&TIMEMASK)==time); 616 } 617 } 618 } 619 620 {//Calculate DEL score 621 622 final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; 623 624 final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; 625 final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; 626 627 int scoreMS=scoreFromDiag+POINTSoff_DEL; 628 int scoreD=scoreFromDel+delScoreArray[streak]; 629 // int scoreI=scoreFromIns+POINTSoff_DEL; 630 631 if(ref1=='N'){ 632 scoreMS+=POINTSoff_DEL_REF_N; 633 scoreD+=POINTSoff_DEL_REF_N; 634 } 635 636 //if(match){scoreMS=subfloor;} 637 638 int score; 639 int time; 640 byte prevState; 641 if(scoreMS>=scoreD){ 642 score=scoreMS; 643 time=1; 644 prevState=MODE_MS; 645 }else{ 646 score=scoreD; 647 time=streak+1; 648 prevState=MODE_DEL; 649 } 650 651 assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} 652 assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; 653 assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; 654 // packed[MODE_DEL][row][col]=(score|prevState|time); 655 packed[MODE_DEL][row][col]=(score|time); 656 assert((score&SCOREMASK)==score); 657 // assert((prevState&MODEMASK)==prevState); 658 assert((time&TIMEMASK)==time); 659 } 660 661 {//Calculate INS score 662 663 final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; 664 665 final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; 666 final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; 667 668 int scoreMS=scoreFromDiag+POINTSoff_INS; 669 // int scoreD=scoreFromDel+POINTSoff_INS; 670 int scoreI=scoreFromIns+insScoreArray[streak]; 671 672 // System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+ 673 // scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", " 674 // +scoreD+", "+scoreFromIns+"+"+ 675 // (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI); 676 677 //if(match){scoreMS=subfloor;} 678 679 int score; 680 int time; 681 byte prevState; 682 if(scoreMS>=scoreI){ 683 score=scoreMS; 684 time=1; 685 prevState=MODE_MS; 686 }else{ 687 score=scoreI; 688 time=streak+1; 689 prevState=MODE_INS; 690 } 691 692 assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} 693 assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; 694 assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; 695 // packed[MODE_INS][row][col]=(score|prevState|time); 696 packed[MODE_INS][row][col]=(score|time); 697 assert((score&SCOREMASK)==score); 698 // assert((prevState&MODEMASK)==prevState); 699 assert((time&TIMEMASK)==time); 700 } 701 } 702 } 703 704 705 int maxCol=-1; 706 int maxState=-1; 707 int maxScore=Integer.MIN_VALUE; 708 709 for(int state=0; state<packed.length; state++){ 710 for(int col=1; col<=columns; col++){ 711 int x=packed[state][rows][col]&SCOREMASK; 712 if(x>maxScore){ 713 maxScore=x; 714 maxCol=col; 715 maxState=state; 716 } 717 } 718 } 719 maxScore>>=SCOREOFFSET; 720 721 // System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); 722 return new int[] {rows, maxCol, maxState, maxScore}; 723 } 724 725 726 /** Generates the match string */ 727 public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){ 728 // assert(false); 729 assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc; 730 assert(row==rows); 731 732 byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1". 733 int outPos=0; 734 735 if(state==MODE_INS){ 736 //TODO ? Maybe not needed. 737 } 738 739 while(row>0 && col>0){ 740 741 // byte prev0=(byte)(packed[state][row][col]&MODEMASK); 742 743 final int time=packed[state][row][col]&TIMEMASK; 744 final byte prev; 745 746 // System.err.println("state="+state+", prev="+prev+", row="+row+", col="+col+", score="+scores[state][row][col]); 747 748 if(state==MODE_MS){ 749 if(time>1){prev=(byte)state;} 750 else{ 751 final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; 752 final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; 753 final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; 754 if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} 755 else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} 756 else{prev=MODE_INS;} 757 } 758 759 byte c=read[row-1]; 760 byte r=ref[refStartLoc+col-1]; 761 if(c==r){ 762 out[outPos]='m'; 763 }else{ 764 if(!AminoAcid.isFullyDefined(c)){ 765 out[outPos]='N'; 766 }else if(!AminoAcid.isFullyDefined(r)){ 767 // out[outPos]='X'; 768 out[outPos]='N'; 769 }else{ 770 out[outPos]='S'; 771 } 772 } 773 774 row--; 775 col--; 776 }else if(state==MODE_DEL){ 777 if(time>1){prev=(byte)state;} 778 else{ 779 final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; 780 final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; 781 if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;} 782 else{prev=MODE_DEL;} 783 } 784 785 byte r=ref[refStartLoc+col-1]; 786 out[outPos]='D'; 787 788 col--; 789 }else{ 790 if(time>1){prev=(byte)state;} 791 else{ 792 final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; 793 final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; 794 if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;} 795 else{prev=MODE_INS;} 796 } 797 798 assert(state==MODE_INS) : state; 799 if(col==0){ 800 out[outPos]='X'; 801 }else if(col>=columns){ 802 out[outPos]='Y'; 803 }else{ 804 out[outPos]='I'; 805 } 806 row--; 807 } 808 809 // assert(prev==prev0); 810 state=prev; 811 outPos++; 812 } 813 814 assert(row==0 || col==0); 815 if(col!=row){ 816 while(row>0){ 817 out[outPos]='X'; 818 outPos++; 819 row--; 820 col--; 821 } 822 if(col>0){ 823 //do nothing 824 } 825 } 826 827 828 //Shrink and reverse the string 829 byte[] out2=new byte[outPos]; 830 for(int i=0; i<outPos; i++){ 831 out2[i]=out[outPos-i-1]; 832 } 833 out=null; 834 835 return out2; 836 } 837 838 /** @return {score, bestRefStart, bestRefStop} */ score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc, final int maxRow, final int maxCol, final int maxState)839 public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc, 840 final int maxRow, final int maxCol, final int maxState){ 841 842 int row=maxRow; 843 int col=maxCol; 844 int state=maxState; 845 846 assert(maxState>=0 && maxState<packed.length) : 847 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc); 848 assert(maxRow>=0 && maxRow<packed[0].length) : 849 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc); 850 assert(maxCol>=0 && maxCol<packed[0][0].length) : 851 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc); 852 853 int score=packed[maxState][maxRow][maxCol]&SCOREMASK; //Or zero, if it is to be recalculated 854 855 if(row<rows){ 856 int difR=rows-row; 857 int difC=columns-col; 858 859 while(difR>difC){ 860 score+=POINTSoff_NOREF; 861 difR--; 862 } 863 864 row+=difR; 865 col+=difR; 866 867 } 868 869 assert(refStartLoc<=refEndLoc); 870 assert(row==rows); 871 872 873 final int bestRefStop=refStartLoc+col-1; 874 875 while(row>0 && col>0){ 876 // System.err.println("state="+state+", row="+row+", col="+col); 877 878 879 880 // byte prev0=(byte)(packed[state][row][col]&MODEMASK); 881 882 final int time=packed[state][row][col]&TIMEMASK; 883 final byte prev; 884 885 if(state==MODE_MS){ 886 if(time>1){prev=(byte)state;} 887 else{ 888 final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; 889 final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; 890 final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; 891 if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} 892 else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} 893 else{prev=MODE_INS;} 894 } 895 row--; 896 col--; 897 }else if(state==MODE_DEL){ 898 if(time>1){prev=(byte)state;} 899 else{ 900 final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; 901 final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; 902 if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;} 903 else{prev=MODE_DEL;} 904 } 905 col--; 906 }else{ 907 assert(state==MODE_INS); 908 if(time>1){prev=(byte)state;} 909 else{ 910 final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; 911 final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; 912 if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;} 913 else{prev=MODE_INS;} 914 } 915 row--; 916 } 917 918 if(col<0){ 919 System.err.println(row); 920 break; //prevents an out of bounds access 921 922 } 923 924 // assert(prev==prev0); 925 state=prev; 926 927 // System.err.println("state2="+state+", row2="+row+", col2="+col+"\n"); 928 } 929 // assert(false) : row+", "+col; 930 if(row>col){ 931 col-=row; 932 } 933 934 final int bestRefStart=refStartLoc+col; 935 936 score>>=SCOREOFFSET; 937 int[] rvec; 938 if(bestRefStart<refStartLoc || bestRefStop>refEndLoc){ //Suggest extra padding in cases of overflow 939 int padLeft=Tools.max(0, refStartLoc-bestRefStart); 940 int padRight=Tools.max(0, bestRefStop-refEndLoc); 941 rvec=new int[] {score, bestRefStart, bestRefStop, padLeft, padRight}; 942 }else{ 943 rvec=new int[] {score, bestRefStart, bestRefStop}; 944 } 945 return rvec; 946 } 947 948 949 /** Will not fill areas that cannot match minScore. 950 * @return {score, bestRefStart, bestRefStop} */ fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore)951 public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){ 952 int a=Tools.max(0, refStartLoc); 953 int b=Tools.min(ref.length-1, refEndLoc); 954 assert(b>=a); 955 956 int[] score; 957 958 if(b-a>=maxColumns){ 959 System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns); 960 assert(false) : refStartLoc+", "+refEndLoc; 961 b=Tools.min(ref.length-1, a+maxColumns-1); 962 } 963 int[] max=fillLimited(read, ref, a, b, minScore); 964 score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2])); 965 966 return score; 967 } 968 969 970 scoreNoIndels(byte[] read, SiteScore ss)971 public final static int scoreNoIndels(byte[] read, SiteScore ss){ 972 ChromosomeArray cha=Data.getChromosome(ss.chrom); 973 return scoreNoIndels(read, cha.array, ss.start, ss); 974 } 975 scoreNoIndels(byte[] read, final int chrom, final int refStart)976 public final static int scoreNoIndels(byte[] read, final int chrom, final int refStart){ 977 ChromosomeArray cha=Data.getChromosome(chrom); 978 return scoreNoIndels(read, cha.array, refStart, null); 979 } 980 scoreNoIndels(byte[] read, SiteScore ss, byte[] baseScores)981 public final static int scoreNoIndels(byte[] read, SiteScore ss, byte[] baseScores){ 982 ChromosomeArray cha=Data.getChromosome(ss.chrom); 983 return scoreNoIndels(read, cha.array, baseScores, ss.start, ss); 984 } 985 scoreNoIndels(byte[] read, final int chrom, final int refStart, byte[] baseScores)986 public final static int scoreNoIndels(byte[] read, final int chrom, final int refStart, byte[] baseScores){ 987 ChromosomeArray cha=Data.getChromosome(chrom); 988 return scoreNoIndels(read, cha.array, baseScores, refStart, null); 989 } 990 scoreNoIndels(byte[] read, byte[] ref, final int refStart, final SiteScore ss)991 public final static int scoreNoIndels(byte[] read, byte[] ref, final int refStart, final SiteScore ss){ 992 993 int score=0; 994 int mode=-1; 995 int timeInMode=0; 996 997 //This block handles cases where the read runs outside the reference 998 //Of course, padding the reference with 'N' would be better, but... 999 int readStart=0; 1000 int readStop=read.length; 1001 final int refStop=refStart+read.length; 1002 boolean semiperfect=true; 1003 int norefs=0; 1004 1005 if(refStart<0){ 1006 readStart=0-refStart; 1007 score+=POINTS_NOREF*readStart; 1008 norefs+=readStart; 1009 } 1010 if(refStop>ref.length){ 1011 int dif=(refStop-ref.length); 1012 readStop-=dif; 1013 score+=POINTS_NOREF*dif; 1014 norefs+=dif; 1015 } 1016 1017 // if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. 1018 1019 for(int i=readStart; i<readStop; i++){ 1020 byte c=read[i]; 1021 byte r=ref[refStart+i]; 1022 1023 if(c==r && c!='N'){ 1024 if(mode==MODE_MS){ 1025 timeInMode++; 1026 score+=POINTS_MATCH2; 1027 }else{ 1028 timeInMode=0; 1029 score+=POINTS_MATCH; 1030 } 1031 mode=MODE_MS; 1032 }else if(c<0 || c=='N'){ 1033 score+=POINTS_NOCALL; 1034 semiperfect=false; 1035 }else if(r<0 || r=='N'){ 1036 score+=POINTS_NOREF; 1037 norefs++; 1038 }else{ 1039 if(mode==MODE_SUB){timeInMode++;} 1040 else{timeInMode=0;} 1041 1042 if(timeInMode==0){score+=POINTS_SUB;} 1043 else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;} 1044 else{score+=POINTS_SUB3;} 1045 mode=MODE_SUB; 1046 semiperfect=false; 1047 } 1048 } 1049 1050 if(semiperfect && ss!=null){ss.semiperfect=((ss.stop==ss.start+read.length-1) && (norefs<=read.length/2));} 1051 1052 return score; 1053 } 1054 1055 scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart, SiteScore ss)1056 public final static int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart, SiteScore ss){ 1057 1058 int score=0; 1059 int mode=-1; 1060 int timeInMode=0; 1061 int norefs=0; 1062 1063 //This block handles cases where the read runs outside the reference 1064 //Of course, padding the reference with 'N' would be better, but... 1065 int readStart=0; 1066 int readStop=read.length; 1067 final int refStop=refStart+read.length; 1068 boolean semiperfect=true; 1069 1070 if(refStart<0){ 1071 readStart=0-refStart; 1072 score+=POINTS_NOREF*readStart; 1073 norefs+=readStart; 1074 } 1075 if(refStop>ref.length){ 1076 int dif=(refStop-ref.length); 1077 readStop-=dif; 1078 score+=POINTS_NOREF*dif; 1079 norefs+=dif; 1080 } 1081 1082 // if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. 1083 1084 for(int i=readStart; i<readStop; i++){ 1085 byte c=read[i]; 1086 byte r=ref[refStart+i]; 1087 1088 if(c==r && c!='N'){ 1089 if(mode==MODE_MS){ 1090 timeInMode++; 1091 score+=POINTS_MATCH2; 1092 }else{ 1093 timeInMode=0; 1094 score+=POINTS_MATCH; 1095 } 1096 score+=baseScores[i]; 1097 mode=MODE_MS; 1098 }else if(c<0 || c=='N'){ 1099 score+=POINTS_NOCALL; 1100 semiperfect=false; 1101 }else if(r<0 || r=='N'){ 1102 score+=POINTS_NOREF; 1103 norefs++; 1104 }else{ 1105 if(mode==MODE_SUB){timeInMode++;} 1106 else{timeInMode=0;} 1107 1108 if(timeInMode==0){score+=POINTS_SUB;} 1109 else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;} 1110 else{score+=POINTS_SUB3;} 1111 mode=MODE_SUB; 1112 semiperfect=false; 1113 } 1114 } 1115 1116 if(semiperfect && ss!=null){ss.semiperfect=((ss.stop==ss.start+read.length-1) && (norefs<=read.length/2));} 1117 assert(Read.CHECKSITE(ss, read, -1)); 1118 1119 return score; 1120 } 1121 1122 scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, byte[] baseScores, final int refStart, byte[][] matchReturn)1123 public final static int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, byte[] baseScores, final int refStart, byte[][] matchReturn){ 1124 int score=0; 1125 int mode=-1; 1126 int timeInMode=0; 1127 1128 assert(refStart<=ref.length) : refStart+", "+ref.length; 1129 1130 //This block handles cases where the read runs outside the reference 1131 //Of course, padding the reference with 'N' would be better, but... 1132 int readStart=0; 1133 int readStop=read.length; 1134 final int refStop=refStart+read.length; 1135 if(refStart<0){ 1136 readStart=0-refStart; 1137 score+=POINTS_NOREF*readStart; 1138 } 1139 if(refStop>ref.length){ 1140 int dif=(refStop-ref.length); 1141 System.err.println("dif="+dif+", ref.length="+ref.length+", refStop="+refStop); 1142 readStop-=dif; 1143 score+=POINTS_NOREF*dif; 1144 } 1145 assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ 1146 ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; 1147 1148 assert(matchReturn!=null); 1149 assert(matchReturn.length==1); 1150 if(matchReturn[0]==null || matchReturn[0].length!=read.length){ 1151 assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length; 1152 matchReturn[0]=new byte[read.length]; 1153 } 1154 final byte[] match=matchReturn[0]; 1155 1156 // if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. 1157 1158 for(int i=readStart; i<readStop; i++){ 1159 byte c=read[i]; 1160 byte r=ref[refStart+i]; 1161 1162 assert(r!='.' && c!='.'); 1163 1164 if(c==r && c!='N'){ 1165 if(mode==MODE_MS){ 1166 timeInMode++; 1167 score+=POINTS_MATCH2; 1168 }else{ 1169 timeInMode=0; 1170 score+=POINTS_MATCH; 1171 } 1172 score+=baseScores[i]; 1173 match[i]='m'; 1174 mode=MODE_MS; 1175 }else if(c<0 || c=='N'){ 1176 score+=POINTS_NOCALL; 1177 match[i]='N'; 1178 }else if(r<0 || r=='N'){ 1179 score+=POINTS_NOREF; 1180 // match[i]='m'; 1181 match[i]='N'; 1182 }else{ 1183 match[i]='S'; 1184 if(mode==MODE_SUB){timeInMode++;} 1185 else{timeInMode=0;} 1186 1187 if(timeInMode==0){score+=POINTS_SUB;} 1188 else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;} 1189 else{score+=POINTS_SUB3;} 1190 mode=MODE_SUB; 1191 } 1192 } 1193 1194 return score; 1195 } 1196 1197 scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, final int refStart, byte[][] matchReturn)1198 public final static int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, final int refStart, byte[][] matchReturn){ 1199 int score=0; 1200 int mode=-1; 1201 int timeInMode=0; 1202 1203 assert(refStart<=ref.length) : refStart+", "+ref.length; 1204 1205 //This block handles cases where the read runs outside the reference 1206 //Of course, padding the reference with 'N' would be better, but... 1207 int readStart=0; 1208 int readStop=read.length; 1209 final int refStop=refStart+read.length; 1210 if(refStart<0){ 1211 readStart=0-refStart; 1212 score+=POINTS_NOREF*readStart; 1213 } 1214 if(refStop>ref.length){ 1215 int dif=(refStop-ref.length); 1216 System.err.println("dif="+dif+", ref.length="+ref.length+", refStop="+refStop); 1217 readStop-=dif; 1218 score+=POINTS_NOREF*dif; 1219 } 1220 assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ 1221 ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; 1222 1223 assert(matchReturn!=null); 1224 assert(matchReturn.length==1); 1225 if(matchReturn[0]==null || matchReturn[0].length!=read.length){ 1226 assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length; 1227 matchReturn[0]=new byte[read.length]; 1228 } 1229 final byte[] match=matchReturn[0]; 1230 1231 // if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. 1232 1233 for(int i=readStart; i<readStop; i++){ 1234 byte c=read[i]; 1235 byte r=ref[refStart+i]; 1236 1237 assert(r!='.' && c!='.'); 1238 1239 if(c==r && c!='N'){ 1240 if(mode==MODE_MS){ 1241 timeInMode++; 1242 score+=POINTS_MATCH2; 1243 }else{ 1244 timeInMode=0; 1245 score+=POINTS_MATCH; 1246 } 1247 match[i]='m'; 1248 mode=MODE_MS; 1249 }else if(c<0 || c=='N'){ 1250 score+=POINTS_NOCALL; 1251 match[i]='N'; 1252 }else if(r<0 || r=='N'){ 1253 score+=POINTS_NOREF; 1254 // match[i]='m'; 1255 match[i]='N'; 1256 }else{ 1257 match[i]='S'; 1258 if(mode==MODE_SUB){timeInMode++;} 1259 else{timeInMode=0;} 1260 1261 if(timeInMode==0){score+=POINTS_SUB;} 1262 else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;} 1263 else{score+=POINTS_SUB3;} 1264 mode=MODE_SUB; 1265 } 1266 } 1267 1268 return score; 1269 } 1270 maxQuality(int numBases)1271 public static final int maxQuality(int numBases){ 1272 return POINTS_MATCH+(numBases-1)*(POINTS_MATCH2); 1273 } 1274 maxQuality(byte[] baseScores)1275 public static final int maxQuality(byte[] baseScores){ 1276 return POINTS_MATCH+(baseScores.length-1)*(POINTS_MATCH2)+Tools.sumInt(baseScores); 1277 } 1278 maxImperfectScore(int numBases)1279 public static final int maxImperfectScore(int numBases){ 1280 // int maxQ=maxQuality(numBases); 1281 //// maxImperfectSwScore=maxQ-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB); 1282 // int maxI=maxQ+POINTS_DEL; 1283 // maxI=Tools.max(maxI, maxQ+POINTS_INS-POINTS_MATCH2); 1284 // maxI=Tools.min(maxI, maxQ-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB)); 1285 1286 int maxQ=maxQuality(numBases); 1287 int maxI=maxQ+Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2); 1288 assert(maxI<(maxQ-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB))); 1289 return maxI; 1290 } 1291 1292 public static final int maxImperfectScore(byte[] baseScores){ 1293 // int maxQ=maxQuality(numBases); 1294 //// maxImperfectSwScore=maxQ-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB); 1295 // int maxI=maxQ+POINTS_DEL; 1296 // maxI=Tools.max(maxI, maxQ+POINTS_INS-POINTS_MATCH2); 1297 // maxI=Tools.min(maxI, maxQ-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB)); 1298 1299 int maxQ=maxQuality(baseScores); 1300 int maxI=maxQ+Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2); 1301 assert(maxI<(maxQ-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB))); 1302 return maxI; 1303 } 1304 1305 public static final String toString(int[] a){ 1306 1307 int width=7; 1308 1309 StringBuilder sb=new StringBuilder((a.length+1)*width+2); 1310 for(int num : a){ 1311 String s=" "+num; 1312 int spaces=width-s.length(); 1313 assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; 1314 for(int i=0; i<spaces; i++){sb.append(' ');} 1315 sb.append(s); 1316 } 1317 1318 return sb.toString(); 1319 } 1320 1321 public static final String toTimePacked(int[] a){ 1322 int width=7; 1323 1324 StringBuilder sb=new StringBuilder((a.length+1)*width+2); 1325 for(int num_ : a){ 1326 int num=num_&TIMEMASK; 1327 String s=" "+num; 1328 int spaces=width-s.length(); 1329 assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; 1330 for(int i=0; i<spaces; i++){sb.append(' ');} 1331 sb.append(s); 1332 } 1333 1334 return sb.toString(); 1335 } 1336 1337 public static final String toScorePacked(int[] a){ 1338 int width=7; 1339 1340 String minString=" -"; 1341 String maxString=" "; 1342 while(minString.length()<width){minString+='9';} 1343 while(maxString.length()<width){maxString+='9';} 1344 1345 StringBuilder sb=new StringBuilder((a.length+1)*width+2); 1346 for(int num_ : a){ 1347 int num=num_>>SCOREOFFSET; 1348 String s=" "+num; 1349 if(s.length()>width){s=num>0 ? maxString : minString;} 1350 int spaces=width-s.length(); 1351 assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; 1352 for(int i=0; i<spaces; i++){sb.append(' ');} 1353 sb.append(s); 1354 } 1355 1356 return sb.toString(); 1357 } 1358 1359 public static final String toString(byte[] a){ 1360 1361 int width=6; 1362 1363 StringBuilder sb=new StringBuilder((a.length+1)*width+2); 1364 for(int num : a){ 1365 String s=" "+num; 1366 int spaces=width-s.length(); 1367 assert(spaces>=0); 1368 for(int i=0; i<spaces; i++){sb.append(' ');} 1369 sb.append(s); 1370 } 1371 1372 return sb.toString(); 1373 } 1374 1375 public static final String toString(byte[] ref, int startLoc, int stopLoc){ 1376 StringBuilder sb=new StringBuilder(stopLoc-startLoc+1); 1377 for(int i=startLoc; i<=stopLoc; i++){sb.append((char)ref[i]);} 1378 return sb.toString(); 1379 } 1380 1381 public static int calcDelScore(int len){ 1382 if(len<=0){return 0;} 1383 int score=POINTS_DEL; 1384 1385 if(len>LIMIT_FOR_COST_4){ 1386 score+=(len-LIMIT_FOR_COST_4)*POINTS_DEL4; 1387 len=LIMIT_FOR_COST_4; 1388 } 1389 if(len>LIMIT_FOR_COST_3){ 1390 score+=(len-LIMIT_FOR_COST_3)*POINTS_DEL3; 1391 len=LIMIT_FOR_COST_3; 1392 } 1393 if(len>1){ 1394 score+=(len-1)*POINTS_DEL2; 1395 } 1396 return score; 1397 } 1398 1399 private static int calcDelScoreOffset(int len){ 1400 if(len<=0){return 0;} 1401 int score=POINTSoff_DEL; 1402 1403 if(len>LIMIT_FOR_COST_4){ 1404 score+=(len-LIMIT_FOR_COST_4)*POINTSoff_DEL4; 1405 len=LIMIT_FOR_COST_4; 1406 } 1407 if(len>LIMIT_FOR_COST_3){ 1408 score+=(len-LIMIT_FOR_COST_3)*POINTSoff_DEL3; 1409 len=LIMIT_FOR_COST_3; 1410 } 1411 if(len>1){ 1412 score+=(len-1)*POINTSoff_DEL2; 1413 } 1414 return score; 1415 } 1416 1417 private static int calcMatchScoreOffset(int len){ 1418 if(len<=0){return 0;} 1419 int score=POINTSoff_MATCH; 1420 1421 if(len>1){ 1422 score+=(len-1)*POINTSoff_MATCH2; 1423 } 1424 return score; 1425 } 1426 1427 private static int calcSubScoreOffset(int len){ 1428 if(len<=0){return 0;} 1429 int score=POINTSoff_SUB; 1430 1431 if(len>LIMIT_FOR_COST_3){ 1432 score+=(len-LIMIT_FOR_COST_3)*POINTSoff_SUB3; 1433 len=LIMIT_FOR_COST_3; 1434 } 1435 if(len>1){ 1436 score+=(len-1)*POINTSoff_SUB2; 1437 } 1438 return score; 1439 } 1440 1441 public static int calcInsScore(int len){ 1442 if(len<=0){return 0;} 1443 int score=POINTS_INS; 1444 1445 if(len>LIMIT_FOR_COST_4){ 1446 score+=(len-LIMIT_FOR_COST_4)*POINTS_INS4; 1447 len=LIMIT_FOR_COST_4; 1448 } 1449 if(len>LIMIT_FOR_COST_3){ 1450 score+=(len-LIMIT_FOR_COST_3)*POINTS_INS3; 1451 len=LIMIT_FOR_COST_3; 1452 } 1453 if(len>1){ 1454 score+=(len-1)*POINTS_INS2; 1455 } 1456 return score; 1457 } 1458 1459 private static int calcInsScoreOffset(int len){ 1460 if(len<=0){return 0;} 1461 int score=POINTSoff_INS; 1462 if(len>LIMIT_FOR_COST_4){ 1463 score+=(len-LIMIT_FOR_COST_4)*POINTSoff_INS4; 1464 len=LIMIT_FOR_COST_4; 1465 } 1466 if(len>LIMIT_FOR_COST_3){ 1467 score+=(len-LIMIT_FOR_COST_3)*POINTSoff_INS3; 1468 len=LIMIT_FOR_COST_3; 1469 } 1470 if(len>1){ 1471 score+=(len-1)*POINTSoff_INS2; 1472 } 1473 return score; 1474 } 1475 1476 1477 private final int maxRows; 1478 private final int maxColumns; 1479 1480 private final int[][][] packed; 1481 1482 private final int[] vertLimit; 1483 private final int[] horizLimit; 1484 1485 private final int[] insScoreArray; 1486 private final int[] delScoreArray; 1487 private final int[] matchScoreArray; 1488 private final int[] subScoreArray; 1489 1490 CharSequence showVertLimit(){ 1491 StringBuilder sb=new StringBuilder(); 1492 for(int i=0; i<=rows; i++){sb.append(vertLimit[i]>>SCOREOFFSET).append(",");} 1493 return sb; 1494 } 1495 CharSequence showHorizLimit(){ 1496 StringBuilder sb=new StringBuilder(); 1497 for(int i=0; i<=columns; i++){sb.append(horizLimit[i]>>SCOREOFFSET).append(",");} 1498 return sb; 1499 } 1500 1501 // public static final int MODEBITS=2; 1502 public static final int TIMEBITS=12; 1503 public static final int SCOREBITS=32-TIMEBITS; 1504 public static final int MAX_TIME=((1<<TIMEBITS)-1); 1505 public static final int MAX_SCORE=((1<<(SCOREBITS-1))-1)-2000; 1506 public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD". 1507 1508 // public static final int MODEOFFSET=0; //Always zero. 1509 // public static final int TIMEOFFSET=0; 1510 public static final int SCOREOFFSET=TIMEBITS; 1511 1512 // public static final int MODEMASK=~((-1)<<MODEBITS); 1513 // public static final int TIMEMASK=(~((-1)<<TIMEBITS))<<TIMEOFFSET; 1514 public static final int TIMEMASK=~((-1)<<TIMEBITS); 1515 public static final int SCOREMASK=(~((-1)<<SCOREBITS))<<SCOREOFFSET; 1516 1517 private static final byte MODE_MS=0; 1518 private static final byte MODE_DEL=1; 1519 private static final byte MODE_INS=2; 1520 private static final byte MODE_SUB=3; 1521 1522 public static final int POINTS_NOREF=-10; 1523 public static final int POINTS_NOCALL=-10; 1524 public static final int POINTS_MATCH=90; 1525 public static final int POINTS_MATCH2=100; //Note: Changing to 90 substantially reduces false positives 1526 public static final int POINTS_COMPATIBLE=50; 1527 public static final int POINTS_SUB=-143; 1528 public static final int POINTS_SUBR=-161; //increased penalty if prior match streak was at most 1 1529 public static final int POINTS_SUB2=-54; 1530 public static final int POINTS_SUB3=-35; 1531 public static final int POINTS_MATCHSUB=-10; 1532 public static final int POINTS_INS=-207; 1533 public static final int POINTS_INS2=-51; 1534 public static final int POINTS_INS3=-37; 1535 public static final int POINTS_INS4=-15; 1536 public static final int POINTS_DEL=-273; 1537 public static final int POINTS_DEL2=-38; 1538 public static final int POINTS_DEL3=-27; 1539 public static final int POINTS_DEL4=-15; 1540 public static final int POINTS_DEL_REF_N=-10; 1541 1542 1543 public static final int LIMIT_FOR_COST_3=5; 1544 public static final int LIMIT_FOR_COST_4=30; 1545 1546 public static final int BAD=MIN_SCORE-1; 1547 1548 1549 public static final int POINTSoff_NOREF=(POINTS_NOREF<<SCOREOFFSET); 1550 public static final int POINTSoff_NOCALL=(POINTS_NOCALL<<SCOREOFFSET); 1551 public static final int POINTSoff_MATCH=(POINTS_MATCH<<SCOREOFFSET); 1552 public static final int POINTSoff_MATCH2=(POINTS_MATCH2<<SCOREOFFSET); 1553 public static final int POINTSoff_COMPATIBLE=(POINTS_COMPATIBLE<<SCOREOFFSET); 1554 public static final int POINTSoff_SUB=(POINTS_SUB<<SCOREOFFSET); 1555 public static final int POINTSoff_SUBR=(POINTS_SUBR<<SCOREOFFSET); 1556 public static final int POINTSoff_SUB2=(POINTS_SUB2<<SCOREOFFSET); 1557 public static final int POINTSoff_SUB3=(POINTS_SUB3<<SCOREOFFSET); 1558 public static final int POINTSoff_MATCHSUB=(POINTS_MATCHSUB<<SCOREOFFSET); 1559 public static final int POINTSoff_INS=(POINTS_INS<<SCOREOFFSET); 1560 public static final int POINTSoff_INS2=(POINTS_INS2<<SCOREOFFSET); 1561 public static final int POINTSoff_INS3=(POINTS_INS3<<SCOREOFFSET); 1562 public static final int POINTSoff_INS4=(POINTS_INS4<<SCOREOFFSET); 1563 public static final int POINTSoff_DEL=(POINTS_DEL<<SCOREOFFSET); 1564 public static final int POINTSoff_DEL2=(POINTS_DEL2<<SCOREOFFSET); 1565 public static final int POINTSoff_DEL3=(POINTS_DEL3<<SCOREOFFSET); 1566 public static final int POINTSoff_DEL4=(POINTS_DEL4<<SCOREOFFSET); 1567 public static final int POINTSoff_DEL_REF_N=(POINTS_DEL_REF_N<<SCOREOFFSET); 1568 public static final int BADoff=(BAD<<SCOREOFFSET); 1569 public static final int MAXoff_SCORE=MAX_SCORE<<SCOREOFFSET; 1570 public static final int MINoff_SCORE=MIN_SCORE<<SCOREOFFSET; 1571 1572 private int rows; 1573 private int columns; 1574 1575 public long iterationsLimited=0; 1576 public long iterationsUnlimited=0; 1577 1578 public boolean verbose=false; 1579 public boolean verbose2=false; 1580 1581 } 1582