1 package aligner; 2 3 import dna.AminoAcid; 4 import shared.KillSwitch; 5 import shared.Tools; 6 7 /** 8 * Based on SSAFlat, but with previous state pointers removed. */ 9 public final class SingleStateAlignerFlat2Amino implements Aligner { 10 11 SingleStateAlignerFlat2Amino()12 public SingleStateAlignerFlat2Amino(){} 13 prefillTopRow()14 private void prefillTopRow(){ 15 final int[] header=packed[0]; 16 final int qlen=rows; 17 for(int i=0; i<=columns; i++){ 18 int x=columns-i+1; 19 int qbases=qlen-x; 20 21 //Minimal points to prefer a leftmost alignment 22 header[i]=qbases<=0 ? 0 : -qbases; 23 24 //Forces consumption of query, but does not allow for insertions... 25 // header[i]=qbases<=0 ? 0 : calcDelScoreOffset(qbases); 26 } 27 } 28 prefillLeftColumnStartingAt(int i)29 private void prefillLeftColumnStartingAt(int i){ 30 packed[0][0]=MODE_MATCH; 31 i=Tools.max(1, i); 32 for(int score=MODE_INS+(POINTS_INS*i); i<=maxRows; i++){//Fill column 0 with insertions 33 score+=POINTS_INS; 34 packed[i][0]=score; 35 } 36 } 37 initialize(int rows_, int columns_)38 private void initialize(int rows_, int columns_){ 39 rows=rows_; 40 columns=columns_; 41 if(rows<=maxRows && columns<=maxColumns){ 42 prefillTopRow(); 43 // prefillLeftColumn(); 44 return; 45 } 46 47 final int maxRows0=maxRows; 48 final int maxColumns0=maxColumns; 49 final int[][] packed0=packed; 50 51 //Monotonic increase 52 maxRows=Tools.max(maxRows, rows+10); 53 maxColumns=Tools.max(maxColumns, columns+10); 54 55 if(packed==null || maxColumns>maxColumns0){//Make a new matrix 56 packed=KillSwitch.allocInt2D(maxRows+1, maxColumns+1); 57 prefillLeftColumnStartingAt(1); 58 }else{//Copy old rows 59 assert(maxRows0>0 && maxColumns0>0); 60 assert(maxRows>maxRows0 && maxColumns<=maxColumns0); 61 packed=KillSwitch.allocInt2D(maxRows+1); 62 for(int i=0; i<packed.length; i++){ 63 if(i<packed0.length){ 64 packed[i]=packed0[i]; 65 }else{ 66 packed[i]=KillSwitch.allocInt1D(maxColumns+1); 67 } 68 } 69 //Fill column 0 with insertions 70 prefillLeftColumnStartingAt(maxRows0); 71 } 72 prefillTopRow(); 73 } 74 75 /** return new int[] {rows, maxCol, maxState, maxScore, maxStart}; 76 * Will not fill areas that cannot match minScore */ 77 @Override fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore)78 public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){ 79 return fillUnlimited(read, ref, refStartLoc, refEndLoc, minScore); 80 } 81 82 @Override fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc)83 public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){ 84 return fillUnlimited(read, ref, refStartLoc, refEndLoc, -999999); 85 } 86 87 /** return new int[] {rows, maxCol, maxState, maxScore, maxStart}; 88 * Min score is optional */ 89 @Override fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore)90 public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){ 91 initialize(read.length, refEndLoc-refStartLoc+1); 92 93 //temporary, for finding a bug 94 if(rows>maxRows || columns>maxColumns){ 95 throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n"); 96 } 97 98 assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; 99 assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; 100 101 assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; 102 assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length; 103 104 final int refOffset=refStartLoc-1; 105 for(int row=1; row<=rows; row++){ 106 107 final byte qBase=read[row-1]; 108 for(int col=1; col<=columns; col++){ 109 110 final byte rBase=ref[refOffset+col]; 111 112 final boolean match=(qBase==rBase); 113 final boolean defined=(AminoAcid.isFullyDefinedAA(qBase) && AminoAcid.isFullyDefinedAA(rBase)); 114 115 final int scoreFromDiag=packed[row-1][col-1]; 116 final int scoreFromDel=packed[row][col-1]; 117 final int scoreFromIns=packed[row-1][col]; 118 119 final int diagScoreM=POINTS_MATCH; 120 final int diagScoreS=POINTS_SUB; 121 final int delScore=scoreFromDel+POINTS_DEL; 122 final int insScore=scoreFromIns+POINTS_INS; 123 124 // final int diagScore=scoreFromDiag+(defined ? (match ? diagScoreM : diagScoreS) : POINTS_NOREF); 125 int diagScore=(match ? diagScoreM : diagScoreS); 126 diagScore=scoreFromDiag+(defined ? diagScore : POINTS_NOREF); 127 128 int score=diagScore>=delScore ? diagScore : delScore; 129 score=score>=insScore ? score : insScore; 130 131 packed[row][col]=score; 132 } 133 //iterationsUnlimited+=columns; 134 } 135 136 137 int maxCol=-1; 138 int maxState=-1; 139 int maxStart=-1; 140 int maxScore=Integer.MIN_VALUE; 141 142 for(int col=1; col<=columns; col++){ 143 int x=packed[rows][col]; 144 if(x>maxScore){ 145 maxScore=x; 146 maxCol=col; 147 148 // assert(rows-1<read.length) : (rows-1)+", "+read.length; 149 // assert(refOffset+col<ref.length) : refOffset+", "+col+", "+ref.length; 150 maxState=getState(rows, col, read[rows-1], ref[refOffset+col]); 151 maxStart=x; 152 } 153 } 154 155 // System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore+"; minScore="+minScore); 156 return maxScore<minScore ? null : new int[] {rows, maxCol, maxState, maxScore, maxStart}; 157 } 158 159 int getState(int row, int col, byte q, byte r){//zxvzxcv TODO: Fix - needs to find max 160 final boolean match=(q==r); 161 final boolean defined=(AminoAcid.isFullyDefinedAA(q) && AminoAcid.isFullyDefinedAA(r)); 162 163 final int scoreFromDiag=packed[row-1][col-1]; 164 final int scoreFromDel=packed[row][col-1]; 165 final int scoreFromIns=packed[row-1][col]; 166 // final int score=packed[row][col]; 167 168 final int diagScoreM=POINTS_MATCH; 169 final int diagScoreS=POINTS_SUB; 170 final int delScore=scoreFromDel+POINTS_DEL; 171 final int insScore=scoreFromIns+POINTS_INS; 172 173 final int diagScore=scoreFromDiag+(defined ? (match ? diagScoreM : diagScoreS) : POINTS_NOREF); 174 175 // int score2=diagScore>=delScore ? diagScore : delScore; 176 // score2=score>=insScore ? score : insScore; 177 178 // assert(score==score2) : score+", "+score2; 179 180 if(diagScore>=delScore && diagScore>=insScore){ 181 return defined ? match ? MODE_MATCH : MODE_SUB : MODE_N; 182 }else if(delScore>=insScore){ 183 return MODE_DEL; 184 } 185 return MODE_INS; 186 } 187 188 /** Generates the match string */ 189 @Override traceback(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state)190 public final byte[] traceback(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){ 191 // assert(false); 192 assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc; 193 assert(row==rows); 194 195 byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1". 196 int outPos=0; 197 198 // assert(state==(packed[row][col]&MODEMASK)); 199 200 while(row>0 && col>0){ 201 byte q=query[row-1]; 202 byte r=ref[refStartLoc+col-1]; 203 boolean defined=(AminoAcid.isFullyDefinedAA(q) && AminoAcid.isFullyDefinedAA(r)); 204 state=getState(row, col, q, r); 205 // assert(defined) : state+", "+(int)q+", "+(int)r+", "+new String(query); 206 // assert(state!=MODE_N) : state+", "+Character.toString(q)+", "+Character.toString(r)+", "+new String(query); 207 if(state==MODE_MATCH){ 208 col--; 209 row--; 210 out[outPos]=defined ? (byte)'m' : (byte)'N'; 211 }else if(state==MODE_SUB){ 212 col--; 213 row--; 214 out[outPos]=defined ? (byte)'S' : (byte)'N'; 215 }else if(state==MODE_N){ 216 col--; 217 row--; 218 out[outPos]='N'; 219 }else if(state==MODE_DEL){ 220 col--; 221 out[outPos]='D'; 222 }else if(state==MODE_INS){ 223 row--; 224 // out[outPos]='I'; 225 if(col>=0 && col<columns){ 226 out[outPos]='I'; 227 }else{ 228 out[outPos]='C'; 229 col--; 230 } 231 }else{ 232 assert(false) : state; 233 } 234 outPos++; 235 } 236 237 assert(row==0 || col==0); 238 if(col!=row){//Not sure what this is doing 239 while(row>0){ 240 out[outPos]='C'; 241 outPos++; 242 row--; 243 col--; 244 } 245 if(col>0){ 246 //do nothing 247 } 248 } 249 250 //Shrink and reverse the string 251 byte[] out2=new byte[outPos]; 252 for(int i=0; i<outPos; i++){ 253 out2[i]=out[outPos-i-1]; 254 } 255 out=null; 256 257 return out2; 258 } 259 260 @Override 261 /** Generates identity; 262 * fills 'extra' with {match, sub, del, ins, N, clip} if present */ tracebackIdentity(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, int[] extra)263 public float tracebackIdentity(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, int[] extra){ 264 265 // assert(false); 266 assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc; 267 assert(row==rows); 268 269 // assert(state==(packed[row][col]&MODEMASK)); 270 int match=0, sub=0, del=0, ins=0, noref=0, clip=0; 271 272 while(row>0 && col>0){ 273 byte q=query[row-1]; 274 byte r=ref[refStartLoc+col-1]; 275 boolean defined=(AminoAcid.isFullyDefinedAA(q) && AminoAcid.isFullyDefinedAA(r)); 276 state=getState(row, col, q, r); 277 if(state==MODE_MATCH){ 278 col--; 279 row--; 280 match+=(defined ? 1 : 0); 281 noref+=(defined ? 0 : 1); 282 }else if(state==MODE_SUB){ 283 col--; 284 row--; 285 sub+=(defined ? 1 : 0); 286 noref+=(defined ? 0 : 1); 287 }else if(state==MODE_N){ 288 col--; 289 row--; 290 noref++; 291 }else if(state==MODE_DEL){ 292 col--; 293 del++; 294 }else if(state==MODE_INS){ 295 row--; 296 boolean edge=(col<=1 || col>=columns); 297 ins+=(edge ? 0 : 1); 298 clip+=(edge ? 1 : 0); 299 }else{ 300 assert(false) : state; 301 } 302 } 303 304 assert(row==0 || col==0); 305 if(col!=row){//Not sure what this is doing 306 while(row>0){ 307 clip++; 308 row--; 309 col--; 310 } 311 if(col>0){ 312 //do nothing 313 } 314 } 315 316 if(extra!=null){ 317 assert(extra.length==5); 318 extra[0]=match; 319 extra[1]=sub; 320 extra[2]=del; 321 extra[3]=ins; 322 extra[4]=noref; 323 extra[5]=clip; 324 } 325 326 float len=match+sub+ins+del+noref*0.1f; 327 float id=match/Tools.max(1.0f, len); 328 return id; 329 } 330 331 /** Generates identity; 332 * fills 'extra' with {match, sub, del, ins, N, clip} if present */ tracebackIdentityAmino(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, int[] extra)333 public float tracebackIdentityAmino(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, int[] extra){ 334 335 // assert(false); 336 assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc; 337 assert(row==rows); 338 339 // assert(state==(packed[row][col]&MODEMASK)); 340 int match=0, sub=0, del=0, ins=0, noref=0, clip=0; 341 342 while(row>0 && col>0){ 343 byte q=query[row-1]; 344 byte r=ref[refStartLoc+col-1]; 345 boolean defined=(AminoAcid.isFullyDefinedAA(q) && AminoAcid.isFullyDefinedAA(r)); 346 state=getState(row, col, q, r); 347 if(state==MODE_MATCH){ 348 col--; 349 row--; 350 match+=(defined ? 1 : 0); 351 noref+=(defined ? 0 : 1); 352 }else if(state==MODE_SUB){ 353 col--; 354 row--; 355 sub+=(defined ? 1 : 0); 356 noref+=(defined ? 0 : 1); 357 }else if(state==MODE_N){ 358 col--; 359 row--; 360 noref++; 361 }else if(state==MODE_DEL){ 362 col--; 363 del++; 364 }else if(state==MODE_INS){ 365 row--; 366 boolean edge=(col<=1 || col>=columns); 367 ins+=(edge ? 0 : 1); 368 clip+=(edge ? 1 : 0); 369 }else{ 370 assert(false) : state; 371 } 372 } 373 374 assert(row==0 || col==0); 375 if(col!=row){//Not sure what this is doing 376 while(row>0){ 377 clip++; 378 row--; 379 col--; 380 } 381 if(col>0){ 382 //do nothing 383 } 384 } 385 386 if(extra!=null){ 387 assert(extra.length==5); 388 extra[0]=match; 389 extra[1]=sub; 390 extra[2]=del; 391 extra[3]=ins; 392 extra[4]=noref; 393 extra[5]=clip; 394 } 395 396 float len=match+sub+ins+del+noref*0.1f; 397 float id=match/Tools.max(1.0f, len); 398 return id; 399 } 400 401 /** @return {score, bestRefStart, bestRefStop} */ 402 @Override score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc, final int maxRow, final int maxCol, final int maxState )403 public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc, 404 final int maxRow, final int maxCol, final int maxState/*, final int maxScore, final int maxStart*/){ 405 406 int row=maxRow; 407 int col=maxCol; 408 int state=maxState; 409 410 assert(maxState>=0 && maxState<packed.length) : 411 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc); 412 assert(maxRow>=0 && maxRow<packed.length) : 413 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc); 414 assert(maxCol>=0 && maxCol<packed[0].length) : 415 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc); 416 417 int score=packed[maxRow][maxCol]; //Or zero, if it is to be recalculated 418 419 if(row<rows){ 420 int difR=rows-row; 421 int difC=columns-col; 422 423 while(difR>difC){ 424 score+=POINTS_NOREF; 425 difR--; 426 } 427 428 row+=difR; 429 col+=difR; 430 431 } 432 433 assert(refStartLoc<=refEndLoc); 434 assert(row==rows); 435 436 437 final int bestRefStop=refStartLoc+col-1; 438 439 while(row>0 && col>0){ 440 final byte q=read[row-1]; 441 final byte r=ref[refStartLoc+col-1]; 442 // final boolean defined=(AminoAcid.isFullyDefinedAA(q) && AminoAcid.isFullyDefinedAA(r)); 443 state=getState(row, col, q, r); 444 if(state==MODE_MATCH){ 445 col--; 446 row--; 447 }else if(state==MODE_SUB){ 448 col--; 449 row--; 450 }else if(state==MODE_N){ 451 col--; 452 row--; 453 }else if(state==MODE_DEL){ 454 col--; 455 }else if(state==MODE_INS){ 456 row--; 457 }else{ 458 assert(false) : state; 459 } 460 } 461 // assert(false) : row+", "+col; 462 if(row>col){ 463 col-=row; 464 } 465 466 final int bestRefStart=refStartLoc+col; 467 468 // System.err.println("t2\t"+score+", "+maxScore+", "+maxStart+", "+bestRefStart); 469 int[] rvec; 470 if(bestRefStart<refStartLoc || bestRefStop>refEndLoc){ //Suggest extra padding in cases of overflow 471 int padLeft=Tools.max(0, refStartLoc-bestRefStart); 472 int padRight=Tools.max(0, bestRefStop-refEndLoc); 473 rvec=new int[] {score, bestRefStart, bestRefStop, padLeft, padRight}; 474 }else{ 475 rvec=new int[] {score, bestRefStart, bestRefStop}; 476 } 477 return rvec; 478 } 479 480 481 /** Will not fill areas that cannot match minScore. 482 * @return {score, bestRefStart, bestRefStop} */ 483 @Override fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore)484 public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){ 485 int a=Tools.max(0, refStartLoc); 486 int b=Tools.min(ref.length-1, refEndLoc); 487 assert(b>=a); 488 489 if(b-a>=maxColumns){ 490 System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns); 491 assert(false) : refStartLoc+", "+refEndLoc; 492 b=Tools.min(ref.length-1, a+maxColumns-1); 493 } 494 int[] max=fillLimited(read, ref, a, b, minScore); 495 // return max==null ? null : new int[] {max[3], 0, max[1]}; 496 497 int[] score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2]/*, max[3], max[4]*/)); 498 499 return score; 500 } 501 toString(byte[] ref, int startLoc, int stopLoc)502 public static final String toString(byte[] ref, int startLoc, int stopLoc){ 503 StringBuilder sb=new StringBuilder(stopLoc-startLoc+1); 504 for(int i=startLoc; i<=stopLoc; i++){sb.append((char)ref[i]);} 505 return sb.toString(); 506 } 507 508 // public static int calcDelScore(int len){ 509 // if(len<=0){return 0;} 510 // int score=POINTS_DEL; 511 // if(len>1){ 512 // score+=(len-1)*POINTS_DEL2; 513 // } 514 // return score; 515 // } 516 517 // public int maxScoreByIdentity(int len, float identity){ 518 // assert(identity>=0 && identity<=1); 519 // return (int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_SUB)); 520 // } 521 522 @Override minScoreByIdentity(int len, float identity)523 public int minScoreByIdentity(int len, float identity){ 524 assert(identity>=0 && identity<=1); 525 526 int a=(int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_SUB)); 527 int b=(int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_INS)); 528 int c=(int)(len*(1*POINTS_MATCH+((1/(Tools.max(identity, 0.000001f)))-1)*POINTS_DEL)); 529 return Tools.min(a, b, c); 530 } 531 calcDelScore(int len)532 private static int calcDelScore(int len){ 533 if(len<=0){return 0;} 534 int score=POINTS_DEL*len; 535 return score; 536 } 537 538 @Override rows()539 public int rows(){return rows;} 540 @Override columns()541 public int columns(){return columns;} 542 543 544 private int maxRows; 545 private int maxColumns; 546 547 private int[][] packed; 548 549 public static final int MAX_SCORE=Integer.MAX_VALUE-2000; 550 public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD". 551 552 //For some reason changing MODE_DEL from 1 to 0 breaks everything 553 private static final byte MODE_DEL=1; 554 private static final byte MODE_INS=2; 555 private static final byte MODE_SUB=3; 556 private static final byte MODE_MATCH=4; 557 private static final byte MODE_N=5; 558 559 public static final int POINTS_NOREF=-15; 560 public static final int POINTS_MATCH=100; 561 public static final int POINTS_SUB=-50; 562 public static final int POINTS_INS=-121; 563 public static final int POINTS_DEL=-111; 564 565 // public static final int POINTS_NOREF=-100000; 566 // public static final int POINTS_MATCH=100; 567 // public static final int POINTS_SUB=-100; 568 // public static final int POINTS_INS=-100; 569 // public static final int POINTS_DEL=-100; 570 571 public static final int BAD=MIN_SCORE-1; 572 573 private int rows; 574 private int columns; 575 576 // public long iterationsLimited=0; 577 // public long iterationsUnlimited=0; 578 579 public boolean verbose=false; 580 public boolean verbose2=false; 581 582 } 583