1 package aligner;
2 
3 import dna.AminoAcid;
4 import shared.KillSwitch;
5 import shared.Tools;
6 
7 /**
8  * Based on SSAFlat, but with previous state pointers removed. */
9 public final class SingleStateAlignerFlat2Amino implements Aligner {
10 
11 
SingleStateAlignerFlat2Amino()12 	public SingleStateAlignerFlat2Amino(){}
13 
prefillTopRow()14 	private void prefillTopRow(){
15 		final int[] header=packed[0];
16 		final int qlen=rows;
17 		for(int i=0; i<=columns; i++){
18 			int x=columns-i+1;
19 			int qbases=qlen-x;
20 
21 			//Minimal points to prefer a leftmost alignment
22 			header[i]=qbases<=0 ? 0 : -qbases;
23 
24 			//Forces consumption of query, but does not allow for insertions...
25 //			header[i]=qbases<=0 ? 0 : calcDelScoreOffset(qbases);
26 		}
27 	}
28 
prefillLeftColumnStartingAt(int i)29 	private void prefillLeftColumnStartingAt(int i){
30 		packed[0][0]=MODE_MATCH;
31 		i=Tools.max(1, i);
32 		for(int score=MODE_INS+(POINTS_INS*i); i<=maxRows; i++){//Fill column 0 with insertions
33 			score+=POINTS_INS;
34 			packed[i][0]=score;
35 		}
36 	}
37 
initialize(int rows_, int columns_)38 	private void initialize(int rows_, int columns_){
39 		rows=rows_;
40 		columns=columns_;
41 		if(rows<=maxRows && columns<=maxColumns){
42 			prefillTopRow();
43 //			prefillLeftColumn();
44 			return;
45 		}
46 
47 		final int maxRows0=maxRows;
48 		final int maxColumns0=maxColumns;
49 		final int[][] packed0=packed;
50 
51 		//Monotonic increase
52 		maxRows=Tools.max(maxRows, rows+10);
53 		maxColumns=Tools.max(maxColumns, columns+10);
54 
55 		if(packed==null || maxColumns>maxColumns0){//Make a new matrix
56 			packed=KillSwitch.allocInt2D(maxRows+1, maxColumns+1);
57 			prefillLeftColumnStartingAt(1);
58 		}else{//Copy old rows
59 			assert(maxRows0>0 && maxColumns0>0);
60 			assert(maxRows>maxRows0 && maxColumns<=maxColumns0);
61 			packed=KillSwitch.allocInt2D(maxRows+1);
62 			for(int i=0; i<packed.length; i++){
63 				if(i<packed0.length){
64 					packed[i]=packed0[i];
65 				}else{
66 					packed[i]=KillSwitch.allocInt1D(maxColumns+1);
67 				}
68 			}
69 			//Fill column 0 with insertions
70 			prefillLeftColumnStartingAt(maxRows0);
71 		}
72 		prefillTopRow();
73 	}
74 
75 	/** return new int[] {rows, maxCol, maxState, maxScore, maxStart};
76 	 * Will not fill areas that cannot match minScore */
77 	@Override
fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore)78 	public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
79 		return fillUnlimited(read, ref, refStartLoc, refEndLoc, minScore);
80 	}
81 
82 	@Override
fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc)83 	public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){
84 		return fillUnlimited(read, ref, refStartLoc, refEndLoc, -999999);
85 	}
86 
87 	/** return new int[] {rows, maxCol, maxState, maxScore, maxStart};
88 	 * Min score is optional */
89 	@Override
fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore)90 	public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
91 		initialize(read.length, refEndLoc-refStartLoc+1);
92 
93 		//temporary, for finding a bug
94 		if(rows>maxRows || columns>maxColumns){
95 			throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n");
96 		}
97 
98 		assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
99 		assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
100 
101 		assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
102 		assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
103 
104 		final int refOffset=refStartLoc-1;
105 		for(int row=1; row<=rows; row++){
106 
107 			final byte qBase=read[row-1];
108 			for(int col=1; col<=columns; col++){
109 
110 				final byte rBase=ref[refOffset+col];
111 
112 				final boolean match=(qBase==rBase);
113 				final boolean defined=(AminoAcid.isFullyDefinedAA(qBase) && AminoAcid.isFullyDefinedAA(rBase));
114 
115 				final int scoreFromDiag=packed[row-1][col-1];
116 				final int scoreFromDel=packed[row][col-1];
117 				final int scoreFromIns=packed[row-1][col];
118 
119 				final int diagScoreM=POINTS_MATCH;
120 				final int diagScoreS=POINTS_SUB;
121 				final int delScore=scoreFromDel+POINTS_DEL;
122 				final int insScore=scoreFromIns+POINTS_INS;
123 
124 //				final int diagScore=scoreFromDiag+(defined ? (match ? diagScoreM : diagScoreS) : POINTS_NOREF);
125 				int diagScore=(match ? diagScoreM : diagScoreS);
126 				diagScore=scoreFromDiag+(defined ? diagScore : POINTS_NOREF);
127 
128 				int score=diagScore>=delScore ? diagScore : delScore;
129 				score=score>=insScore ? score : insScore;
130 
131 				packed[row][col]=score;
132 			}
133 			//iterationsUnlimited+=columns;
134 		}
135 
136 
137 		int maxCol=-1;
138 		int maxState=-1;
139 		int maxStart=-1;
140 		int maxScore=Integer.MIN_VALUE;
141 
142 		for(int col=1; col<=columns; col++){
143 			int x=packed[rows][col];
144 			if(x>maxScore){
145 				maxScore=x;
146 				maxCol=col;
147 
148 //				assert(rows-1<read.length) : (rows-1)+", "+read.length;
149 //				assert(refOffset+col<ref.length) : refOffset+", "+col+", "+ref.length;
150 				maxState=getState(rows, col, read[rows-1], ref[refOffset+col]);
151 				maxStart=x;
152 			}
153 		}
154 
155 //		System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore+"; minScore="+minScore);
156 		return maxScore<minScore ? null : new int[] {rows, maxCol, maxState, maxScore, maxStart};
157 	}
158 
159 	int getState(int row, int col, byte q, byte r){//zxvzxcv TODO: Fix - needs to find max
160 		final boolean match=(q==r);
161 		final boolean defined=(AminoAcid.isFullyDefinedAA(q) && AminoAcid.isFullyDefinedAA(r));
162 
163 		final int scoreFromDiag=packed[row-1][col-1];
164 		final int scoreFromDel=packed[row][col-1];
165 		final int scoreFromIns=packed[row-1][col];
166 //		final int score=packed[row][col];
167 
168 		final int diagScoreM=POINTS_MATCH;
169 		final int diagScoreS=POINTS_SUB;
170 		final int delScore=scoreFromDel+POINTS_DEL;
171 		final int insScore=scoreFromIns+POINTS_INS;
172 
173 		final int diagScore=scoreFromDiag+(defined ? (match ? diagScoreM : diagScoreS) : POINTS_NOREF);
174 
175 //		int score2=diagScore>=delScore ? diagScore : delScore;
176 //		score2=score>=insScore ? score : insScore;
177 
178 //		assert(score==score2) : score+", "+score2;
179 
180 		if(diagScore>=delScore && diagScore>=insScore){
181 			return defined ? match ? MODE_MATCH : MODE_SUB : MODE_N;
182 		}else if(delScore>=insScore){
183 			return MODE_DEL;
184 		}
185 		return MODE_INS;
186 	}
187 
188 	/** Generates the match string */
189 	@Override
traceback(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state)190 	public final byte[] traceback(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){
191 //		assert(false);
192 		assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
193 		assert(row==rows);
194 
195 		byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1".
196 		int outPos=0;
197 
198 //		assert(state==(packed[row][col]&MODEMASK));
199 
200 		while(row>0 && col>0){
201 			byte q=query[row-1];
202 			byte r=ref[refStartLoc+col-1];
203 			boolean defined=(AminoAcid.isFullyDefinedAA(q) && AminoAcid.isFullyDefinedAA(r));
204 			state=getState(row, col, q, r);
205 //			assert(defined) : state+", "+(int)q+", "+(int)r+", "+new String(query);
206 //			assert(state!=MODE_N) : state+", "+Character.toString(q)+", "+Character.toString(r)+", "+new String(query);
207 			if(state==MODE_MATCH){
208 				col--;
209 				row--;
210 				out[outPos]=defined ? (byte)'m' : (byte)'N';
211 			}else if(state==MODE_SUB){
212 				col--;
213 				row--;
214 				out[outPos]=defined ? (byte)'S' : (byte)'N';
215 			}else if(state==MODE_N){
216 				col--;
217 				row--;
218 				out[outPos]='N';
219 			}else if(state==MODE_DEL){
220 				col--;
221 				out[outPos]='D';
222 			}else if(state==MODE_INS){
223 				row--;
224 //				out[outPos]='I';
225 				if(col>=0 && col<columns){
226 					out[outPos]='I';
227 				}else{
228 					out[outPos]='C';
229 					col--;
230 				}
231 			}else{
232 				assert(false) : state;
233 			}
234 			outPos++;
235 		}
236 
237 		assert(row==0 || col==0);
238 		if(col!=row){//Not sure what this is doing
239 			while(row>0){
240 				out[outPos]='C';
241 				outPos++;
242 				row--;
243 				col--;
244 			}
245 			if(col>0){
246 				//do nothing
247 			}
248 		}
249 
250 		//Shrink and reverse the string
251 		byte[] out2=new byte[outPos];
252 		for(int i=0; i<outPos; i++){
253 			out2[i]=out[outPos-i-1];
254 		}
255 		out=null;
256 
257 		return out2;
258 	}
259 
260 	@Override
261 	/** Generates identity;
262 	 * fills 'extra' with {match, sub, del, ins, N, clip} if present */
tracebackIdentity(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, int[] extra)263 	public float tracebackIdentity(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, int[] extra){
264 
265 //		assert(false);
266 		assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
267 		assert(row==rows);
268 
269 //		assert(state==(packed[row][col]&MODEMASK));
270 		int match=0, sub=0, del=0, ins=0, noref=0, clip=0;
271 
272 		while(row>0 && col>0){
273 			byte q=query[row-1];
274 			byte r=ref[refStartLoc+col-1];
275 			boolean defined=(AminoAcid.isFullyDefinedAA(q) && AminoAcid.isFullyDefinedAA(r));
276 			state=getState(row, col, q, r);
277 			if(state==MODE_MATCH){
278 				col--;
279 				row--;
280 				match+=(defined ? 1 : 0);
281 				noref+=(defined ? 0 : 1);
282 			}else if(state==MODE_SUB){
283 				col--;
284 				row--;
285 				sub+=(defined ? 1 : 0);
286 				noref+=(defined ? 0 : 1);
287 			}else if(state==MODE_N){
288 				col--;
289 				row--;
290 				noref++;
291 			}else if(state==MODE_DEL){
292 				col--;
293 				del++;
294 			}else if(state==MODE_INS){
295 				row--;
296 				boolean edge=(col<=1 || col>=columns);
297 				ins+=(edge ? 0 : 1);
298 				clip+=(edge ? 1 : 0);
299 			}else{
300 				assert(false) : state;
301 			}
302 		}
303 
304 		assert(row==0 || col==0);
305 		if(col!=row){//Not sure what this is doing
306 			while(row>0){
307 				clip++;
308 				row--;
309 				col--;
310 			}
311 			if(col>0){
312 				//do nothing
313 			}
314 		}
315 
316 		if(extra!=null){
317 			assert(extra.length==5);
318 			extra[0]=match;
319 			extra[1]=sub;
320 			extra[2]=del;
321 			extra[3]=ins;
322 			extra[4]=noref;
323 			extra[5]=clip;
324 		}
325 
326 		float len=match+sub+ins+del+noref*0.1f;
327 		float id=match/Tools.max(1.0f, len);
328 		return id;
329 	}
330 
331 	/** Generates identity;
332 	 * fills 'extra' with {match, sub, del, ins, N, clip} if present */
tracebackIdentityAmino(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, int[] extra)333 	public float tracebackIdentityAmino(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, int[] extra){
334 
335 //		assert(false);
336 		assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
337 		assert(row==rows);
338 
339 //		assert(state==(packed[row][col]&MODEMASK));
340 		int match=0, sub=0, del=0, ins=0, noref=0, clip=0;
341 
342 		while(row>0 && col>0){
343 			byte q=query[row-1];
344 			byte r=ref[refStartLoc+col-1];
345 			boolean defined=(AminoAcid.isFullyDefinedAA(q) && AminoAcid.isFullyDefinedAA(r));
346 			state=getState(row, col, q, r);
347 			if(state==MODE_MATCH){
348 				col--;
349 				row--;
350 				match+=(defined ? 1 : 0);
351 				noref+=(defined ? 0 : 1);
352 			}else if(state==MODE_SUB){
353 				col--;
354 				row--;
355 				sub+=(defined ? 1 : 0);
356 				noref+=(defined ? 0 : 1);
357 			}else if(state==MODE_N){
358 				col--;
359 				row--;
360 				noref++;
361 			}else if(state==MODE_DEL){
362 				col--;
363 				del++;
364 			}else if(state==MODE_INS){
365 				row--;
366 				boolean edge=(col<=1 || col>=columns);
367 				ins+=(edge ? 0 : 1);
368 				clip+=(edge ? 1 : 0);
369 			}else{
370 				assert(false) : state;
371 			}
372 		}
373 
374 		assert(row==0 || col==0);
375 		if(col!=row){//Not sure what this is doing
376 			while(row>0){
377 				clip++;
378 				row--;
379 				col--;
380 			}
381 			if(col>0){
382 				//do nothing
383 			}
384 		}
385 
386 		if(extra!=null){
387 			assert(extra.length==5);
388 			extra[0]=match;
389 			extra[1]=sub;
390 			extra[2]=del;
391 			extra[3]=ins;
392 			extra[4]=noref;
393 			extra[5]=clip;
394 		}
395 
396 		float len=match+sub+ins+del+noref*0.1f;
397 		float id=match/Tools.max(1.0f, len);
398 		return id;
399 	}
400 
401 	/** @return {score, bestRefStart, bestRefStop} */
402 	@Override
score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc, final int maxRow, final int maxCol, final int maxState )403 	public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
404 			final int maxRow, final int maxCol, final int maxState/*, final int maxScore, final int maxStart*/){
405 
406 		int row=maxRow;
407 		int col=maxCol;
408 		int state=maxState;
409 
410 		assert(maxState>=0 && maxState<packed.length) :
411 			maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
412 		assert(maxRow>=0 && maxRow<packed.length) :
413 			maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
414 		assert(maxCol>=0 && maxCol<packed[0].length) :
415 			maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
416 
417 		int score=packed[maxRow][maxCol]; //Or zero, if it is to be recalculated
418 
419 		if(row<rows){
420 			int difR=rows-row;
421 			int difC=columns-col;
422 
423 			while(difR>difC){
424 				score+=POINTS_NOREF;
425 				difR--;
426 			}
427 
428 			row+=difR;
429 			col+=difR;
430 
431 		}
432 
433 		assert(refStartLoc<=refEndLoc);
434 		assert(row==rows);
435 
436 
437 		final int bestRefStop=refStartLoc+col-1;
438 
439 		while(row>0 && col>0){
440 			final byte q=read[row-1];
441 			final byte r=ref[refStartLoc+col-1];
442 //			final boolean defined=(AminoAcid.isFullyDefinedAA(q) && AminoAcid.isFullyDefinedAA(r));
443 			state=getState(row, col, q, r);
444 			if(state==MODE_MATCH){
445 				col--;
446 				row--;
447 			}else if(state==MODE_SUB){
448 				col--;
449 				row--;
450 			}else if(state==MODE_N){
451 				col--;
452 				row--;
453 			}else if(state==MODE_DEL){
454 				col--;
455 			}else if(state==MODE_INS){
456 				row--;
457 			}else{
458 				assert(false) : state;
459 			}
460 		}
461 //		assert(false) : row+", "+col;
462 		if(row>col){
463 			col-=row;
464 		}
465 
466 		final int bestRefStart=refStartLoc+col;
467 
468 //		System.err.println("t2\t"+score+", "+maxScore+", "+maxStart+", "+bestRefStart);
469 		int[] rvec;
470 		if(bestRefStart<refStartLoc || bestRefStop>refEndLoc){ //Suggest extra padding in cases of overflow
471 			int padLeft=Tools.max(0, refStartLoc-bestRefStart);
472 			int padRight=Tools.max(0, bestRefStop-refEndLoc);
473 			rvec=new int[] {score, bestRefStart, bestRefStop, padLeft, padRight};
474 		}else{
475 			rvec=new int[] {score, bestRefStart, bestRefStop};
476 		}
477 		return rvec;
478 	}
479 
480 
481 	/** Will not fill areas that cannot match minScore.
482 	 * @return {score, bestRefStart, bestRefStop}  */
483 	@Override
fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore)484 	public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
485 		int a=Tools.max(0, refStartLoc);
486 		int b=Tools.min(ref.length-1, refEndLoc);
487 		assert(b>=a);
488 
489 		if(b-a>=maxColumns){
490 			System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns);
491 			assert(false) : refStartLoc+", "+refEndLoc;
492 			b=Tools.min(ref.length-1, a+maxColumns-1);
493 		}
494 		int[] max=fillLimited(read, ref, a, b, minScore);
495 //		return max==null ? null : new int[] {max[3], 0, max[1]};
496 
497 		int[] score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2]/*, max[3], max[4]*/));
498 
499 		return score;
500 	}
501 
toString(byte[] ref, int startLoc, int stopLoc)502 	public static final String toString(byte[] ref, int startLoc, int stopLoc){
503 		StringBuilder sb=new StringBuilder(stopLoc-startLoc+1);
504 		for(int i=startLoc; i<=stopLoc; i++){sb.append((char)ref[i]);}
505 		return sb.toString();
506 	}
507 
508 //	public static int calcDelScore(int len){
509 //		if(len<=0){return 0;}
510 //		int score=POINTS_DEL;
511 //		if(len>1){
512 //			score+=(len-1)*POINTS_DEL2;
513 //		}
514 //		return score;
515 //	}
516 
517 //	public int maxScoreByIdentity(int len, float identity){
518 //		assert(identity>=0 && identity<=1);
519 //		return (int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_SUB));
520 //	}
521 
522 	@Override
minScoreByIdentity(int len, float identity)523 	public int minScoreByIdentity(int len, float identity){
524 		assert(identity>=0 && identity<=1);
525 
526 		int a=(int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_SUB));
527 		int b=(int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_INS));
528 		int c=(int)(len*(1*POINTS_MATCH+((1/(Tools.max(identity, 0.000001f)))-1)*POINTS_DEL));
529 		return Tools.min(a, b, c);
530 	}
531 
calcDelScore(int len)532 	private static int calcDelScore(int len){
533 		if(len<=0){return 0;}
534 		int score=POINTS_DEL*len;
535 		return score;
536 	}
537 
538 	@Override
rows()539 	public int rows(){return rows;}
540 	@Override
columns()541 	public int columns(){return columns;}
542 
543 
544 	private int maxRows;
545 	private int maxColumns;
546 
547 	private int[][] packed;
548 
549 	public static final int MAX_SCORE=Integer.MAX_VALUE-2000;
550 	public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD".
551 
552 	//For some reason changing MODE_DEL from 1 to 0 breaks everything
553 	private static final byte MODE_DEL=1;
554 	private static final byte MODE_INS=2;
555 	private static final byte MODE_SUB=3;
556 	private static final byte MODE_MATCH=4;
557 	private static final byte MODE_N=5;
558 
559 	public static final int POINTS_NOREF=-15;
560 	public static final int POINTS_MATCH=100;
561 	public static final int POINTS_SUB=-50;
562 	public static final int POINTS_INS=-121;
563 	public static final int POINTS_DEL=-111;
564 
565 //	public static final int POINTS_NOREF=-100000;
566 //	public static final int POINTS_MATCH=100;
567 //	public static final int POINTS_SUB=-100;
568 //	public static final int POINTS_INS=-100;
569 //	public static final int POINTS_DEL=-100;
570 
571 	public static final int BAD=MIN_SCORE-1;
572 
573 	private int rows;
574 	private int columns;
575 
576 //	public long iterationsLimited=0;
577 //	public long iterationsUnlimited=0;
578 
579 	public boolean verbose=false;
580 	public boolean verbose2=false;
581 
582 }
583