1 package prok;
2 
3 import java.io.PrintStream;
4 import java.util.ArrayList;
5 import java.util.Arrays;
6 import java.util.Collections;
7 
8 import aligner.SingleStateAlignerFlat2;
9 import aligner.SingleStateAlignerFlat3;
10 import aligner.SingleStateAlignerFlatFloat;
11 import dna.AminoAcid;
12 import shared.KillSwitch;
13 import shared.Tools;
14 import stream.Read;
15 import structures.FloatList;
16 import structures.IntList;
17 import structures.LongHashSet;
18 
19 
20 /**
21  * This class calls genes within a single thread.
22  * @author Brian Bushnell
23  * @date Sep 24, 2018
24  *
25  */
26 public class GeneCaller extends ProkObject {
27 
28 	/*--------------------------------------------------------------*/
29 	/*----------------             Init             ----------------*/
30 	/*--------------------------------------------------------------*/
31 
GeneCaller(int minLen_, int maxOverlapSameStrand_, int maxOverlapOppositeStrand_, float minStartScore_, float minStopScore_, float minInnerScore_, float minOrfScore_, float minAvgScore_, GeneModel pgm_)32 	GeneCaller(int minLen_, int maxOverlapSameStrand_, int maxOverlapOppositeStrand_,
33 			float minStartScore_, float minStopScore_, float minInnerScore_,
34 			float minOrfScore_, float minAvgScore_, GeneModel pgm_){
35 		minLen=minLen_;
36 		maxOverlapSameStrand=maxOverlapSameStrand_;
37 		maxOverlapOppositeStrand=maxOverlapOppositeStrand_;
38 		pgm=pgm_;
39 
40 		minStartScore=minStartScore_;
41 		minStopScore=minStopScore_;
42 		minInnerScore=minInnerScore_;
43 		minOrfScore=minOrfScore_;
44 		minAvgScore=minAvgScore_;
45 	}
46 
47 	/*--------------------------------------------------------------*/
48 	/*----------------         Outer Methods        ----------------*/
49 	/*--------------------------------------------------------------*/
50 
callGenes(Read r)51 	public ArrayList<Orf> callGenes(Read r){
52 		return callGenes(r, pgm);
53 	}
54 
callGenes(Read r, GeneModel pgm_)55 	public ArrayList<Orf> callGenes(Read r, GeneModel pgm_){
56 		pgm=pgm_;
57 
58 		final String name=r.id;
59 		final byte[] bases=r.bases;
60 
61 		//Lists of all longest orfs per frame
62 		ArrayList<Orf>[] frameLists=makeOrfs(name, bases, minLen);
63 		//Lists of all high-scoring orfs per frame, with potentially multiple orfs sharing stops.
64 		ArrayList<Orf>[] brokenLists=breakOrfs(frameLists, bases);
65 
66 		ArrayList<Orf>[] rnaLists=null;
67 		final int rlen=r.length();
68 		if(calltRNA || (call16S && rlen>800) || (call23S && rlen>1500) || call5S || (call18S && rlen>1000)){
69 			rnaLists=makeRnas(name, bases);
70 
71 			brokenLists[0].addAll(rnaLists[0]);
72 			brokenLists[3].addAll(rnaLists[1]);
73 			Collections.sort(brokenLists[0]);
74 			Collections.sort(brokenLists[3]);
75 		}
76 
77 		boolean printAllOrfs=false;
78 		boolean printRnas=false;
79 		if(printAllOrfs){
80 			ArrayList<Orf> temp=new ArrayList<Orf>();
81 			for(ArrayList<Orf> broken : brokenLists){
82 				temp.addAll(broken);
83 			}
84 			Collections.sort(temp);
85 			return temp;
86 		}
87 
88 		if(printRnas && rnaLists!=null){
89 			ArrayList<Orf> temp=new ArrayList<Orf>();
90 			for(ArrayList<Orf> list : rnaLists){
91 				temp.addAll(list);
92 			}
93 			Collections.sort(temp);
94 			return temp;
95 		}
96 
97 		stCds2.add(brokenLists);
98 
99 		//Find the optimal path through Orfs
100 		ArrayList<Orf> path=findPath(brokenLists, bases);
101 //		geneStartsOut+=path.size();
102 
103 		if(callCDS){stCdsPass.add(path);}
104 		if(calltRNA){sttRNA.add(path);}
105 		if(call16S){st16s.add(path);}
106 		if(call23S){st23s.add(path);}
107 		if(call5S){st5s.add(path);}
108 		if(call18S){st18s.add(path);}
109 
110 		return path;
111 	}
112 
113 	/*--------------------------------------------------------------*/
114 	/*----------------         Inner Methods        ----------------*/
115 	/*--------------------------------------------------------------*/
116 
117 	/**
118 	 * Generates lists of all max-length non-overlapping Orfs per frame.
119 	 * There IS overlap between frames.
120 	 * All Orfs come out flipped to + orientation.
121 	 * */
makeRnas(String name, byte[] bases)122 	ArrayList<Orf>[] makeRnas(String name, byte[] bases){
123 		@SuppressWarnings("unchecked")
124 		ArrayList<Orf>[] array=new ArrayList[2];
125 		array[0]=new ArrayList<Orf>();
126 		array[1]=new ArrayList<Orf>();
127 		final float[] scores=new float[bases.length];
128 		final int[] kmersSeen=(lsuKmers==null && ssuKmers==null && trnaKmers==null && r5SKmers==null) ? null : new int[bases.length];
129 		for(int strand=0; strand<2; strand++){
130 			for(StatsContainer sc : pgm.rnaContainers){
131 				if(ProkObject.callType(sc.type)){
132 					ArrayList<Orf> list=makeRnasForStrand(name, bases, strand, sc, scores, (sc.kmerSet()==null ? null : kmersSeen), false, -1);//TODO: Make this loop through all RNA types
133 					if(strand==1 && list!=null){
134 						for(Orf orf : list){
135 							assert(orf.strand==strand);
136 							orf.flip();
137 						}
138 					}
139 					if(list!=null){array[strand].addAll(list);}
140 				}
141 			}
142 			Collections.sort(array[strand]);
143 			AminoAcid.reverseComplementBasesInPlace(bases);
144 		}
145 		return array;
146 	}
147 
148 	/** Designed for quickly calling a single SSU */
makeRna(String name, byte[] bases, int type)149 	public Orf makeRna(String name, byte[] bases, int type){
150 		final float[] scores=new float[bases.length];//TODO: Big and slow; make a FloatList?
151 		StatsContainer sc=pgm.allContainers[type];
152 		final int[] kmersSeen=(sc.kmerSet()==null ? null : new int[bases.length]);//TODO: IntList?
153 
154 		int strand=0;
155 		ArrayList<Orf> list=makeRnasForStrand(name, bases, strand, sc, scores, kmersSeen, true, -1);
156 		final Orf best1=pickBest(list);
157 		assert(best1==null || best1.start>=0 && best1.stop<bases.length) : bases.length+"\n"+best1;
158 		if(best1!=null && best1.orfScore>-999){return best1;}
159 
160 		strand++;
161 		AminoAcid.reverseComplementBasesInPlace(bases);
162 		list=makeRnasForStrand(name, bases, strand, sc, scores, kmersSeen, true, -1);
163 		AminoAcid.reverseComplementBasesInPlace(bases);
164 		if(strand==1 && list!=null){
165 			for(Orf orf : list){
166 				assert(orf.strand==strand);
167 				orf.flip();
168 			}
169 		}
170 		final Orf best2=pickBest(list);
171 		assert(best2==null || best2.start>=0 && best2.stop<bases.length) : bases.length+"\n"+best2;
172 		if(best2!=null && best2.orfScore>-999){return best2;}
173 		return best1!=null ? best1 : best2;
174 	}
175 
pickBest(ArrayList<Orf> list)176 	final Orf pickBest(ArrayList<Orf> list){
177 		if(list==null){return null;}
178 		Orf best=null;
179 		for(Orf orf : list){
180 			if(best==null || orf.orfScore>best.orfScore){
181 				best=orf;
182 			}
183 		}
184 		return best;
185 	}
186 
187 	/**
188 	 * Generates lists of all max-length non-overlapping Orfs per frame.
189 	 * There IS overlap between frames.
190 	 * All Orfs come out flipped to + orientation.
191 	 * */
makeOrfs(String name, byte[] bases, int minlen)192 	static ArrayList<Orf>[] makeOrfs(String name, byte[] bases, int minlen){
193 		@SuppressWarnings("unchecked")
194 		ArrayList<Orf>[] array=new ArrayList[6];
195 		for(int strand=0; strand<2; strand++){
196 			for(int frame=0; frame<3; frame++){
197 				ArrayList<Orf> list=makeOrfsForFrame(name, bases, frame, strand, minlen);
198 				array[frame+3*strand]=list;
199 				if(strand==1 && list!=null){
200 					for(Orf orf : list){
201 						assert(orf.frame==frame);
202 						assert(orf.strand==strand);
203 						orf.flip();
204 					}
205 				}
206 			}
207 			AminoAcid.reverseComplementBasesInPlace(bases);
208 		}
209 		return array;
210 	}
211 
212 	/**
213 	 * Dynamic programming phase.
214 	 * @param frameLists
215 	 * @param bases
216 	 * @return
217 	 */
findPath(ArrayList<Orf>[] frameLists, byte[] bases)218 	private ArrayList<Orf> findPath(ArrayList<Orf>[] frameLists, byte[] bases){
219 		ArrayList<Orf> all=new ArrayList<Orf>();
220 		for(ArrayList<Orf> list : frameLists){all.addAll(list);}
221 		if(all.isEmpty()){return all;}
222 		Collections.sort(all);
223 
224 		for(Orf orf : all){
225 			orf.pathScorePlus=-999999;
226 			orf.pathScoreMinus=-999999;
227 		}
228 
229 		int[] lastPositionScored=KillSwitch.allocInt1D(6);
230 		Arrays.fill(lastPositionScored, -1);
231 
232 		//Index of highest-scoring ORF in this frame, with prev on the plus strand
233 		int[] bestIndexPlus=KillSwitch.allocInt1D(6);
234 		//Index of highest-scoring ORF in this frame, with prev on the minus strand
235 		int[] bestIndexMinus=KillSwitch.allocInt1D(6);
236 		//Highest-scoring ORF in this frame, with prev on the plus strand
237 		Orf[] bestOrfPlus=new Orf[6];
238 		//Highest-scoring ORF in this frame, with prev on the minus strand
239 		Orf[] bestOrfMinus=new Orf[6];
240 
241 		int[][] bestIndex=new int[][] {bestIndexPlus, bestIndexMinus};
242 		Orf[][] bestOrf=new Orf[][] {bestOrfPlus, bestOrfMinus};
243 
244 		for(Orf orf : all){
245 			final int myListNum=3*orf.strand+orf.frame;
246 			calcPathScore(orf, frameLists, lastPositionScored, bestIndex);
247 			if(bestOrfPlus[myListNum]==null || orf.pathScorePlus>=bestOrfPlus[myListNum].pathScorePlus){
248 				bestOrfPlus[myListNum]=orf;
249 				bestIndexPlus[myListNum]=lastPositionScored[myListNum];
250 				assert(frameLists[myListNum].get(lastPositionScored[myListNum])==orf);
251 			}
252 			if(bestOrfMinus[myListNum]==null || orf.pathScoreMinus>=bestOrfMinus[myListNum].pathScoreMinus){
253 				bestOrfMinus[myListNum]=orf;
254 				bestIndexMinus[myListNum]=lastPositionScored[myListNum];
255 				assert(frameLists[myListNum].get(lastPositionScored[myListNum])==orf);
256 			}
257 		}
258 
259 		Orf best=bestOrf[0][0];
260 		for(Orf[] array : bestOrf){
261 			for(Orf orf : array){
262 				if(best==null || (orf!=null && orf.pathScore()>best.pathScore())){
263 					best=orf;
264 				}
265 			}
266 		}
267 		ArrayList<Orf> bestPath=new ArrayList<Orf>();
268 		for(Orf orf=best; orf!=null; orf=orf.prev()){
269 			bestPath.add(orf);
270 			if(orf.type==CDS){geneStartsOut++;}
271 			else if(orf.type==tRNA){tRNAOut++;}
272 			else if(orf.type==r16S){r16SOut++;}
273 			else if(orf.type==r23S){r23SOut++;}
274 			else if(orf.type==r5S){r5SOut++;}
275 			else if(orf.type==r18S){r18SOut++;}
276 		}
277 		Collections.sort(bestPath);
278 		return bestPath;
279 	}
280 
281 	/**
282 	 * Calculate the best path to this ORF.
283 	 * @param orf
284 	 * @param frameLists
285 	 * @param lastPositionScored
286 	 * @param bestIndex
287 	 */
calcPathScore(Orf orf, ArrayList<Orf>[] frameLists, int[] lastPositionScored, int[][] bestIndex)288 	private void calcPathScore(Orf orf, ArrayList<Orf>[] frameLists, int[] lastPositionScored, int[][] bestIndex){
289 		final int myListNum=3*orf.strand+orf.frame;
290 
291 //		System.err.println("* "+orf);
292 //		System.err.println("* "+Arrays.toString(lastPositionScored));
293 //		System.err.println();
294 
295 		for(int listStrand=0; listStrand<2; listStrand++){
296 			for(int listFrame=0; listFrame<3; listFrame++){
297 				int listNum=listFrame+3*listStrand;
298 				ArrayList<Orf> list=frameLists[listNum];
299 				int lastPos=lastPositionScored[listNum];
300 				int bestPos=bestIndex[listStrand][listNum];
301 				if(listStrand==0){
302 					calcPathScorePlus(orf, list, listStrand, lastPos, bestPos);
303 				}else{
304 					calcPathScoreMinus(orf, list, listStrand, lastPos, bestPos);
305 				}
306 			}
307 		}
308 
309 //		System.err.println(myListNum+", "+Arrays.toString(lastPositionScored)+", "+frameLists[myListNum].size());
310 
311 		lastPositionScored[myListNum]++;
312 		assert(frameLists[myListNum].get(lastPositionScored[myListNum])==orf) : myListNum+"\n"+orf+"\n"+frameLists[myListNum].get(lastPositionScored[myListNum])+"\n"
313 			+Arrays.toString(lastPositionScored)+"\n"+frameLists[myListNum].get(lastPositionScored[myListNum]+1);
314 
315 		//These are sanity checks to make sure that the path did not break in the middle.
316 		//Safe to disable.
317 //		assert(orf.prevPlus!=null || orf.stop<100000);
318 //		assert(orf.prevMinus!=null || orf.stop<100000);
319 //		assert(orf.pathScore>-10) : orf.pathScore+"\n"+orf+"\n"+orf.prev+"\n";
320 	}
321 
322 	/**
323 	 * Calculate the best path to this ORF from a plus-strand previous ORF.
324 	 * @param orf
325 	 * @param list
326 	 * @param listStrand
327 	 * @param lastPos
328 	 * @param bestPos
329 	 */
calcPathScorePlus(final Orf orf, final ArrayList<Orf> list, final int listStrand, final int lastPos, final int bestPos)330 	private void calcPathScorePlus(final Orf orf, final ArrayList<Orf> list, final int listStrand, final int lastPos, final int bestPos){
331 		assert(listStrand==0);
332 		if(lastPos<0){
333 			if(orf.prevPlus==null){
334 				orf.pathScorePlus=orf.orfScore;
335 				orf.pathLengthPlus=1;
336 			}
337 			return;
338 		}
339 		if(list.isEmpty()){return;}
340 
341 //		System.err.println("\nExamining   \t"+orf+"\nlastPos="+lastPos+", bestPos="+bestPos+", sameFrame="+sameFrame);
342 		boolean found=false;
343 		final boolean sameStrand=(orf.strand==listStrand);
344 		final int maxOverlap=(sameStrand ? maxOverlapSameStrand : maxOverlapOppositeStrand);
345 		for(int i=lastPos, min=Tools.max(0, bestPos-lookbackPlus); i>=min || (i>0 && !found); i--){
346 			Orf prev=list.get(i);
347 			assert(prev!=orf) : prev;
348 //			System.err.println("Comparing to \t"+prev);
349 			if(orf.isValidPrev(prev, maxOverlap)){
350 				int overlap=Tools.max(0, prev.stop-orf.start+1);
351 				float orfScore=overlap==0 ? orf.orfScore : orf.calcOrfScore(overlap);
352 
353 				final float prevScore=prev.pathScore();
354 				final int prevLength=prev.pathLength();
355 
356 				float pathScore;
357 				final int pathLength;
358 				if(sameStrand){
359 					pathLength=prevLength+1;
360 					pathScore=prevScore+orfScore;
361 					pathScore+=p0+p1*(Tools.mid(p5*(p2+pathLength), p6*(p3-pathLength), p4));
362 				}else{
363 					pathLength=1;
364 					pathScore=prev.pathScore()+orfScore;
365 					pathScore+=q1+Tools.mid(q2*prevLength, q3+q4*prevLength, q5);
366 				}
367 
368 				if(overlap<1 && prevScore>0){found=true;}
369 				if(pathScore>=orf.pathScorePlus){
370 					orf.pathScorePlus=pathScore;
371 					orf.prevPlus=prev;
372 					orf.pathLengthPlus=pathLength;
373 //					System.err.println("Set as best");
374 				}
375 			}
376 			if(found && prev.stop<maxOverlap-2000 && orf.prevPlus!=null){
377 				System.err.println("Breaking");
378 				break;
379 			}
380 		}
381 	}
382 
383 	/**
384 	 * Calculate the best path to this ORF from a minus-strand previous ORF.
385 	 * @param orf
386 	 * @param list
387 	 * @param listStrand
388 	 * @param lastPos
389 	 * @param bestPos
390 	 */
calcPathScoreMinus(final Orf orf, final ArrayList<Orf> list, final int listStrand, final int lastPos, final int bestPos)391 	private void calcPathScoreMinus(final Orf orf, final ArrayList<Orf> list, final int listStrand, final int lastPos, final int bestPos){
392 		assert(listStrand==1);
393 		if(lastPos<0){
394 			if(orf.prevMinus==null){
395 				orf.pathScoreMinus=orf.orfScore;
396 				orf.pathLengthMinus=1;
397 			}
398 			return;
399 		}
400 		if(list.isEmpty()){return;}
401 
402 //		System.err.println("\nExamining   \t"+orf+"\nlastPos="+lastPos+", bestPos="+bestPos+", sameFrame="+sameFrame);
403 		boolean found=false;
404 		final boolean sameStrand=(orf.strand==listStrand);
405 		final int maxOverlap=(sameStrand ? maxOverlapSameStrand : maxOverlapOppositeStrand);
406 		for(int i=lastPos, min=Tools.max(0, bestPos-lookbackMinus); i>=min || (i>0 && !found); i--){
407 			Orf prev=list.get(i);
408 			assert(prev!=orf) : prev;
409 //			System.err.println("Comparing to \t"+prev);
410 			if(orf.isValidPrev(prev, maxOverlap)){
411 				int overlap=Tools.max(0, prev.stop-orf.start+1);
412 				float orfScore=overlap==0 ? orf.orfScore : orf.calcOrfScore(overlap);
413 
414 				final float prevScore=prev.pathScore();
415 				final int prevLength=prev.pathLength();
416 
417 				float pathScore;
418 				final int pathLength;
419 				if(sameStrand){
420 					pathLength=prevLength+1;
421 					pathScore=prevScore+orfScore;
422 					pathScore+=p0+p1*(Tools.mid(p5*(p2+pathLength), p6*(p3-pathLength), p4));
423 				}else{
424 					pathLength=1;
425 					pathScore=prev.pathScore()+orfScore;
426 					pathScore+=q1+Tools.mid(q2*prevLength, q3+q4*prevLength, q5);
427 				}
428 				if(overlap<1 && prevScore>0){found=true;}
429 				if(pathScore>=orf.pathScoreMinus){
430 					orf.pathScoreMinus=pathScore;
431 					orf.prevMinus=prev;
432 					orf.pathLengthMinus=pathLength;
433 //					System.err.println("Set as best");
434 				}
435 			}
436 			if(found && prev.stop<maxOverlap-2000 && orf.prevMinus!=null){
437 				System.err.println("Breaking");
438 				break;
439 			}
440 		}
441 	}
442 
443 	/**
444 	 * Generates a list of maximal-length Orfs only (non-overlapping).
445 	 * All Orfs come out in native orientation (unflipped).
446 	 * */
makeOrfsForFrame(String name, byte[] bases, int startFrame, int strand, int minlen)447 	static ArrayList<Orf> makeOrfsForFrame(String name, byte[] bases, int startFrame, int strand, int minlen){
448 //		assert(false) : "TODO";
449 		assert(minlen>=3);
450 		if(bases==null || bases.length<minlen){return null;}
451 		ArrayList<Orf> orfs=new ArrayList<Orf>();
452 		if(!ProkObject.callCDS){return orfs;}
453 //		int mask=63;
454 		int code=0;
455 		int start=-2;
456 		int frame=0;
457 		int pos=startFrame;
458 
459 
460 		for(; pos<bases.length; pos++){
461 			byte b=bases[pos];
462 			int x=AminoAcid.baseToNumber[b];
463 //			code=((code<<2)|x)&mask;
464 			code=((code<<2)|x);
465 			frame++;
466 			if(frame==3){
467 				frame=0;
468 				if(start>=0){
469 					if(GeneModel.isStopCodon(code) || code<0){//NOTE: This adds a stop codon wherever there are Ns.
470 						int len=pos-start+1;
471 						if(len>=minlen){
472 							Orf f=new Orf(name, start, pos, strand, startFrame, bases, true, CDS);
473 							orfs.add(f);
474 						}
475 						start=-1;
476 					}
477 				}else{
478 					if(start==-2 || (start<0 && GeneModel.isStartCodon(code))){
479 						start=pos-2;
480 					}
481 				}
482 				code=0;
483 			}
484 		}
485 
486 		//Add a stop codon at the sequence end.
487 		if(start>=0){
488 			pos--;
489 			while(frame!=3 && frame!=-1){
490 				pos--;
491 				frame--;
492 			}
493 			int len=pos-start+1;
494 			if(len>=minlen){
495 				assert(pos<bases.length) : start+", "+pos+", "+bases.length;
496 				Orf f=new Orf(name, start, pos, strand, startFrame, bases, true, CDS);
497 				orfs.add(f);
498 			}
499 		}
500 
501 		return orfs;
502 	}
503 
504 	/**
505 	 * Generates a list of maximal-length RNAs (non-overlapping).
506 	 * All RNAs come out in native orientation (unflipped).
507 	 * */
508 	ArrayList<Orf> makeRnasForStrand(String name, byte[] bases, int strand, StatsContainer sc, float[] scores, int[] kmersSeen, boolean quitEarly, float bias){
509 		final int window=sc.lengthAvg;
510 		if(bases==null || bases.length*2<window){return null;}
511 		ArrayList<Orf> orfs=new ArrayList<Orf>(sc.type==tRNA ? 32 : 8);
512 
513 		final FrameStats inner=sc.inner;
514 //		final FrameStats start=sc.start;
515 //		final FrameStats stop=sc.stop;
516 
517 		final int k=inner.k;
518 		final int mask=inner.mask;
519 //		final float invLen=sc.invLengthAvg;
520 		final int halfWindow=window/2;
521 		final int maxWindow=(int)(window*1.5f);
522 		final int maxWindow2=(int)(window*2.5f);
523 //		final int slop=Tools.max(50, window/8);
524 		int len=0;
525 		int kmer=0;
526 		float currentScore=0;
527 		float currentScoreAbs=0;
528 		bias=(bias>-1 ? bias : biases[sc.type]);
529 		final float maxBias=biases[sc.type]*1.45f;
530 
531 		float thresh=cutoff1[sc.type];
532 		float prevScore=0;
533 		int lastStart=0;
534 
535 		float max=0;
536 		int maxPos=0;
537 
538 		for(int pos=0; pos<bases.length; pos++){
539 			final byte b=bases[pos];
540 			assert(b>=0 && b<128) : "Invalid base b="+((int)b)+"; pos="+pos+"\n"+new String(bases)+"\n";
541 			final int x=AminoAcid.baseToNumber[b];
542 			kmer=((kmer<<2)|x)&mask;
543 
544 			if(x>=0){
545 				len++;
546 				if(len>=k){
547 					float prob=inner.probs[0][kmer];
548 					float dif=prob-bias;//Prob above 1 is more likely than average
549 					currentScoreAbs+=prob;
550 					currentScore=Tools.max(0, currentScore+dif);
551 				}
552 
553 				if(currentScore>0){
554 					if(currentScore>max){
555 						max=currentScore;
556 						maxPos=pos;
557 					}
558 					if(prevScore<=0){
559 						lastStart=pos;
560 					}
561 				}else{
562 					int rnaLen=maxPos-lastStart;
563 					if(max>thresh && rnaLen>=halfWindow){
564 						if(rnaLen>maxWindow){
565 							if(bias<=maxBias){
566 								orfs=null;
567 								float biasMult=(rnaLen>8*window ? 1.2f : rnaLen>4*window ? 1.1f : 1.05f);
568 								return makeRnasForStrand(name, bases, strand, sc, scores, kmersSeen, quitEarly, bias*biasMult);
569 							}
570 						}
571 						if(rnaLen<=maxWindow2){
572 							Orf orf=new Orf(name, lastStart, maxPos, strand, 0, bases, false, sc.type);
573 							orfs.add(orf);
574 							orf.orfScore=max;
575 							if(verbose){
576 								final int start2=(strand==0 ? lastStart : bases.length-maxPos-1);
577 								final int stop2=(strand==0 ? maxPos : bases.length-lastStart-1);
578 								System.err.println("Made Orf "+start2+"\t"+stop2+"\t"+max);
579 							}
580 						}
581 					}
582 					max=0;
583 					lastStart=pos;
584 				}
585 //				System.err.println("i="+i+"\tscore="+score+"\tmax="+max+"\tmaxPos="+maxPos+"\tprevScore="+prevScore+"\tlastStart="+lastStart);
586 				prevScore=currentScore;
587 
588 //				if(pos>=223000 && pos<232000){
589 ////					System.err.println("i="+i+"\tscore="+score+"\tmax="+max+"\tmaxPos="+maxPos+"\tprevScore="+prevScore+"\tlastStart="+lastStart);
590 //					System.out.println(pos+"\t"+currentScore);
591 //				}
592 
593 			}else{
594 				len=0;
595 				kmer=0;
596 			}
597 			scores[pos]=currentScoreAbs;
598 		}
599 
600 //		System.err.println("size="+orfs.size()+", type="+Orf.typeStrings[sc.type]);
601 
602 
603 		{
604 			int rnaLen=maxPos-lastStart;
605 			if(max>thresh && rnaLen>=halfWindow){
606 				if(rnaLen>maxWindow){
607 					if(bias<=maxBias){
608 						orfs=null;
609 						float biasMult=(rnaLen>8*window ? 1.2f : rnaLen>4*window ? 1.1f : 1.05f);
610 						return makeRnasForStrand(name, bases, strand, sc, scores, kmersSeen, quitEarly, bias*biasMult);
611 					}
612 				}
613 				if(rnaLen<=maxWindow2){
614 					Orf orf=new Orf(name, lastStart, maxPos, strand, 0, bases, false, sc.type);
615 					orfs.add(orf);
616 					orf.orfScore=max;
617 					if(verbose){
618 						final int start2=(strand==0 ? lastStart : bases.length-maxPos-1);
619 						final int stop2=(strand==0 ? maxPos : bases.length-lastStart-1);
620 						System.err.println(start2+"\t"+stop2+"\t"+max);
621 					}
622 				}
623 			}
624 		}
625 
626 		if(kmersSeen!=null && orfs.size()>0 && sc.kmerSet()!=null){
fillKmersSeen(bases, kmersSeen, sc.kmerSet(), sc.kLongLen())627 			fillKmersSeen(bases, kmersSeen, sc.kmerSet(), sc.kLongLen());
628 		}
629 
630 		float cutoff=cutoff2[sc.type];
631 
632 		for(int i=0; i<orfs.size(); i++){
633 			Orf orf=orfs.get(i);
634 //			System.err.println(orf.orfScore);
635 			boolean good=refineRna(orf, bases, strand, sc, scores, kmersSeen);
636 			if(orf.orfScore<cutoff || !good){
637 				if(verbose){System.err.println("REJECT: "+orf.toStringFlipped());}
orfs.set(i, null)638 				orfs.set(i, null);
639 			}else{
640 				if(verbose){System.err.println("ACCEPT: "+orf.toStringFlipped());}
641 				if(quitEarly){
orfs.clear()642 					orfs.clear();
643 					orfs.add(orf);
644 					return orfs;
645 				}
646 			}
647 		}
Tools.condenseStrict(orfs)648 		Tools.condenseStrict(orfs);
649 
650 //		assert(false);
651 
652 //		for(int pos=0; pos<bases.length; pos++){
653 //			final byte b=bases[pos];
654 //			final int x=AminoAcid.baseToNumber[b];
655 //			kmer=((kmer<<2)|x)&mask;
656 //
657 //			if(x>=0){
658 //				len++;
659 //				if(len>=k){
660 //					float prob=inner.probs[0][kmer];
661 //					float dif=prob-1.2f;//Prob above 1 is more likely than average
662 //					currentScore=Tools.max(0, currentScore+dif);
663 //					if(currentScore>0){
664 //						currentStreak++;
665 //					}else{
666 //						currentStreak=0;
667 //					}
668 //					if(currentScore>200 && currentStreak>1500 && currentStreak<1700){
669 //						Orf orf=new Orf(name, pos-currentStreak-1, pos, strand, 0, bases, false);
670 //						orfs.add(orf);
671 //						orf.orfScore=currentScore;
672 //						orf.startScore=start.scorePoint(orf.start, bases);
673 //						orf.stopScore=stop.scorePoint(orf.stop, bases);
674 //						currentStreak=0;
675 //						currentScore=0;
676 //					}
677 //				}
678 //			}else{
679 //				len=0;
680 //				kmer=0;
681 //			}
682 //		}
683 
684 		return orfs;
685 	}
686 
fillKmersSeen(byte[] bases, int[] kmersSeen, LongHashSet set, int k)687 	void fillKmersSeen(byte[] bases, int[] kmersSeen, LongHashSet set, int k){
688 		final long mask=~((-1L)<<(2*k));
689 		long kmer=0;
690 		int len=0;
691 		int seen=0;
692 		for(int i=0; i<bases.length; i++){
693 			final byte b=bases[i];
694 			final int num=AminoAcid.baseToNumber[b];
695 			if(num>=0){
696 				len++;
697 				kmer=((kmer<<2)|num)&mask;
698 				if(len>=k && set.contains(kmer)){seen++;}
699 			}else{
700 				len=0;
701 			}
702 			kmersSeen[i]=seen;
703 		}
704 	}
705 
refineRna(Orf orf, byte[] bases, int strand, StatsContainer sc, float[] scores, int[] kmersSeen)706 	boolean refineRna(Orf orf, byte[] bases, int strand, StatsContainer sc, float[] scores, int[] kmersSeen){
707 		if(orf==null){return false;}
708 		if(verbose){System.err.println("REFINE: "+orf.toStringFlipped());}
709 		final int window=sc.lengthAvg;
710 //		final int halfWindow=window/2;
711 //		final int maxWindow=(int)(window*1.5f);
712 		final int slop=Tools.max(60, 10+window/8);
713 		final float invWindow=sc.invLengthAvg;
714 		final float oldScore=orf.orfScore;
715 		IntList starts=new IntList();
716 		IntList stops=new IntList();
717 		FloatList startScores=new FloatList();
718 		FloatList stopScores=new FloatList();
719 
720 		final int leftmost=Tools.max(0, orf.start-slop);
721 		final int rightmost=Tools.min(bases.length-1, orf.stop+slop);
722 		if(kmersSeen!=null){
723 			if(kmersSeen[leftmost]>=kmersSeen[rightmost]){
724 //				System.err.println("Bad: "+oldScore);
725 				orf.orfScore=-999;
726 				return false;
727 			}else{
728 //				System.err.println("Good: "+oldScore);
729 			}
730 		}
731 		if(verbose){System.err.println("REFINE2");}
732 
733 		{//starts
734 			final int left=leftmost;
735 			final int right=Tools.min(bases.length-1, orf.stop+slop-window);
736 			final float thresh=cutoff3[sc.type];
737 			fillPoints(left, right, bases, sc.start, thresh, starts, startScores);
738 		}
739 		if(verbose){System.err.println("starts: "+starts.size);}
740 //		if((orf.start+"").startsWith("146") || true){System.err.println(starts);}
741 		if(verbose){System.err.println(startScores);}
742 
743 		{//stops
744 			final int left=Tools.max(0, orf.start-slop+window);
745 			final int right=rightmost;
746 			final float thresh=cutoff4[sc.type];
747 			fillPoints(left, right, bases, sc.stop, thresh, stops, stopScores);
748 		}
749 		if(verbose){System.err.println("stops: "+stops.size);}
750 //		if((orf.start+"").startsWith("146") || true){System.err.println(stops);}
751 		if(verbose){System.err.println(stopScores);}
752 
753 		final float innerThresh=cutoff5[sc.type];
754 
755 		final int minlen=Tools.max(window/2, window-slop);
756 		final int maxlen=window+slop;
757 
758 		orf.orfScore=0;
759 		int lastStop=0;
760 		for(int i=0; i<starts.size; i++){
761 			final int start=starts.get(i);
762 			final int startSeen=kmersSeen==null ? 0 : kmersSeen[start];
763 			final float startScore=startScores.get(i);
764 //			System.err.println("start="+start);
765 			for(int j=lastStop; j<stops.size; j++){
766 				final int stop=stops.get(j);
767 				final int rnalen=stop-start+1;
768 //				System.err.println("stop="+stop);
769 				if(rnalen<minlen){
770 					lastStop=j+1;
771 				}else if(rnalen>maxlen){
772 //					System.err.println("broke");
773 					break;
774 				}else if(kmersSeen!=null && kmersSeen[stop]<=startSeen){//TODO: Test this
775 //					//skip
776 				}else{
777 					final int len=stop-start+1;
778 					final float stopScore=stopScores.get(j);
779 					final float innerScore=(scores[stop]-scores[start])/len;
780 //					System.err.println("innerScore="+innerScore);
781 					assert(rnalen<=maxlen) : "start="+start+", stop="+stop+", rnalen="+rnalen+", minlen="+minlen+", maxlen="+maxlen;
782 					if(innerScore>=innerThresh){
783 						final float a=Tools.max(startScore+0.25f, 0.25f);
784 						final float b=Tools.max(stopScore+0.25f, 0.25f);
785 						final float c=innerScore*innerScore;
786 						final float d=(window-(2.4f*Tools.absdif(len, window)))*invWindow;
787 						final float score=c*d*(float)Math.sqrt(a*b);
788 						if(verbose && score>=0.2f*orf.orfScore){
789 							final int start2=(strand==0 ? start : bases.length-stop-1);
790 							final int stop2=(strand==0 ? stop : bases.length-start-1);
791 							System.err.println(start2+"-"+stop2+", "+startScore+", "+stopScore+", "+innerScore+", "+(score*scoreMult[sc.type])+", "+oldScore);
792 						}
793 						if(score>orf.orfScore){
794 							orf.start=start;
795 							orf.stop=stop;
796 							orf.kmerScore=innerScore*len;
797 //							if(verbose){
798 //								final int start2=(strand==0 ? start : bases.length-stop-1);
799 //								final int stop2=(strand==0 ? stop : bases.length-start-1);
800 //								System.err.println(start2+"-"+stop2+", "+startScore+", "+stopScore+", "+innerScore+", "+(score*scoreMult[sc.type])+", "+oldScore);
801 //							}
802 							orf.startScore=startScore;
803 							orf.stopScore=stopScore;
804 							orf.orfScore=score;
805 						}
806 					}else{
807 						assert(true);
808 					}
809 				}
810 			}
811 		}
812 		orf.orfScore*=scoreMult[sc.type];
813 
814 		if(starts.isEmpty() || stops.isEmpty()){
815 			if(verbose){System.err.println("No starts or stops.");}
816 			orf.orfScore=Tools.min(-999, orf.orfScore-9999);
817 			return false;
818 		}
819 		return refineByAlignment(orf, bases, strand, sc);//Returns true if no consensus is present
820 	}
821 
refineByAlignment(Orf orf, byte[] bases, int strand, StatsContainer sc)822 	boolean refineByAlignment(Orf orf, byte[] bases, int strand, StatsContainer sc){
823 		if(verbose){System.err.println("ALIGN");}
824 		Read[] consensus=ProkObject.consensusReads(sc.type);
825 		if(consensus==null || consensus.length==0){return true;}
826 		boolean refined=false;
827 //		System.err.println("Initial: "+orf.start+", "+orf.stop);
828 		for(Read r : consensus){
829 //			refined=refineByAlignment(orf, bases, strand, sc, r.bases, 15, 15, 2);
830 			refined=refineByAlignment(orf, bases, strand, sc, r.bases, sc.startSlop(), sc.stopSlop(), 2);
831 			if(refined || sc.type==r18S || true){break;}
832 		}
833 		if(refined){
834 			if(verbose){System.err.println("Aligned to: "+orf.start+", "+orf.stop);}
835 		}else{
836 			if(verbose){System.err.println("Alignment failed.");}
837 			orf.orfScore=Tools.min(-999, orf.orfScore-9999);
838 		}
839 		return refined;
840 	}
841 
refineByAlignment(Orf orf, byte[] bases, int strand, StatsContainer sc, byte[] consensus, final int startSlop, final int stopSlop, int recurLimit)842 	boolean refineByAlignment(Orf orf, byte[] bases, int strand, StatsContainer sc, byte[] consensus,
843 			final int startSlop, final int stopSlop, int recurLimit){
844 		final int start0=orf.start;
845 		final int stop0=orf.stop;
846 
847 		assert(start0>=0 && start0<bases.length) : start0+", "+stop0;
848 		assert(stop0>=0 && stop0<bases.length) : start0+", "+stop0;
849 
850 		final float minID=sc.minIdentity();
851 		final int padding=Tools.min(alignmentPadding, 30+sc.lengthAvg/4);
852 		final int a=Tools.max(0, orf.start-padding);
853 		final int b=Tools.min(bases.length-1, orf.stop+padding);
854 		final int reflen=b-a+1;
855 		if(reflen>10*sc.lengthAvg && reflen>20000){
856 			System.err.println("Skipped reflen "+reflen+"/"+sc.lengthAvg+" for "
857 					+ "seqlen="+bases.length+", orf="+orf.toString());
858 			assert(false);
859 			//TODO: Possibly change return to -1, 0, 1 ("can't align")
860 			//Should be a limit on window size...
861 			//Also consider shrinking matrix after jumbo alignments
862 			return false;
863 		}
864 		assert(a>=0 && b<bases.length) : a+", "+b;
865 		SingleStateAlignerFlat2 ssa=getSSA();
866 		final int minScore=ssa.minScoreByIdentity(consensus.length, minID);
867 		int[] max=ssa.fillUnlimited(consensus, bases, a, b, minScore);
868 		if(max==null){return false;}
869 
870 		final int rows=max[0];
871 		final int maxCol=max[1];
872 		final int maxState=max[2];
873 //		final int maxScore=max[3];
874 //		final int maxStart=max[4];
875 
876 		//returns {score, bestRefStart, bestRefStop}
877 		//padded: {score, bestRefStart, bestRefStop, padLeft, padRight};
878 		final int[] score=ssa.score(consensus, bases, a, b, rows, maxCol, maxState);
879 		final int rstart=Tools.max(score[1], 0);
880 		final int rstop=Tools.min(score[2], bases.length-1);
881 		final float id=ssa.tracebackIdentity(consensus, bases, a, b, rows, maxCol, maxState, null);
882 
883 		assert(rstart>=0 && rstart<bases.length) : "bases="+bases.length+
884 			", rstart="+rstart+", rstop="+rstop+", a="+a+", b="+b+"\n"+"score="+Arrays.toString(score)+", id="+id;
885 		assert(rstop>=0 && rstop<bases.length) : rstart+", "+rstop;
886 
887 		if(score.length>3 && recurLimit>0){
888 			final int padLeft=score[3];
889 			final int padRight=score[4];
890 			//TODO:  This takes extra memory.  It may be better to cap the width or ignore start0/stop0 here.
891 			int rstart2=Tools.max(0, Tools.min(start0, rstart)-10);
892 			int rstop2=Tools.min(bases.length-1, Tools.max(stop0, rstop)+10);
893 			assert(rstart2>=0 && rstart2<bases.length) : rstart2+", "+rstop2;
894 			assert(rstop2>=0 && rstop2<bases.length) : rstart2+", "+rstop2;
895 			orf.start=rstart2;
896 			orf.stop=rstop2;
897 			if(orf.start<a || orf.stop>b){
898 				boolean ret=refineByAlignment(orf, bases, strand, sc, consensus, 0, 0, recurLimit-1);
899 				if(ret){return true;}
900 			}
901 			orf.start=start0;
902 			orf.stop=stop0;
903 //			assert(false) : "Don't traceback after recur.";
904 		}
905 		if(verbose){
906 			System.err.println("Identity: "+id);
907 		}
908 //		assert(score.length==3) : "TODO: Handle padding requests.";
909 
910 //		System.err.println("Identity: "+String.format("%.2f", 100*id)+"; location: "+rstart+"-"+rstop);
911 		if(id<minID){return false;}
912 
913 
914 		if(Tools.absdif(rstart, start0)>startSlop){orf.start=rstart;}
915 		if(Tools.absdif(rstop, stop0)>stopSlop){orf.stop=rstop;}
916 		return true;
917 	}
918 
fillPoints(final int left, final int right, final byte[] bases, final FrameStats fs, float thresh, final IntList points, final FloatList scores)919 	void fillPoints(final int left, final int right, final byte[] bases, final FrameStats fs, float thresh, final IntList points, final FloatList scores){
920 		points.clear();
921 		scores.clear();
922 		final float minThresh=thresh;//thresh*0.05f;
923 //		System.err.println(left+", "+right+", "+thresh+", "+minThresh);
924 		while(points.size()<8 && thresh>=minThresh){
925 //			System.err.println("Running with thresh="+thresh);
926 			points.clear();
927 			scores.clear();
928 			for(int i=left; i<right; i++){
929 				float score=fs.scorePoint(i, bases);
930 //				System.err.println(i+", "+score);
931 				if(score>=thresh){
932 					points.add(i);
933 					scores.add(score);
934 				}
935 			}
936 			thresh=thresh*0.75f;
937 		}
938 	}
939 
940 	/**
941 	 * Generate all possible genes from each Orf, and return them in a new set of lists.
942 	 * @param frameLists
943 	 * @param bases
944 	 * @return Lists of orfs.
945 	 */
breakOrfs(ArrayList<Orf>[] frameLists, byte[] bases)946 	private ArrayList<Orf>[] breakOrfs(ArrayList<Orf>[] frameLists, byte[] bases){
947 
948 		@SuppressWarnings("unchecked")
949 		ArrayList<Orf>[] brokenLists=new ArrayList[6];
950 		for(int strand=0; strand<2; strand++){
951 			for(int frame=0; frame<3; frame++){
952 				int fnum=frame+3*strand;
953 				ArrayList<Orf> longest=frameLists[fnum]; //Longest Orf per stop
954 				ArrayList<Orf> broken=new ArrayList<Orf>(); //All high scoring Orfs, including multiple per stop, with different starts
955 				if(longest!=null) {
956 					for(Orf orf : longest){
957 						assert(orf.frame==frame);
958 						assert(orf.strand==strand);
959 						ArrayList<Orf> temp=breakOrf(orf, bases);
960 						if(temp!=null){
961 							broken.addAll(temp);
962 						}
963 					}
964 				}
965 				Collections.sort(broken);
966 				brokenLists[fnum]=broken;
967 			}
968 			//Reverse-complement bases after processing each strand
969 			AminoAcid.reverseComplementBasesInPlace(bases);
970 		}
971 		return brokenLists;
972 	}
973 
974 	/**
975 	 * Generate an Orf for each possible start codon.
976 	 * Retain only the high-scoring ones.
977 	 * @param longest Longest open reading frame for a given stop.
978 	 * @param bases Bases, oriented for this Orf.
979 	 * @return List of Orfs.
980 	 */
breakOrf(Orf longest, byte[] bases)981 	private ArrayList<Orf> breakOrf(Orf longest, byte[] bases){
982 		assert(longest.start<longest.stop);
983 		final int flipped=longest.flipped();
984 		if(flipped==1){longest.flip();}//Now the orf is aligned to its native strand
985 
986 		geneStopsMade++;
987 
988 		final FrameStats innerStats=pgm.statsCDS.inner;
989 		final FrameStats startStats=pgm.statsCDS.start;
990 		final FrameStats stopStats=pgm.statsCDS.stop;
991 
992 		final String name=longest.scafName;
993 		final int start=longest.start;
994 		final int stop=longest.stop;
995 		final int strand=longest.strand;
996 		final int max=Tools.min(longest.stop-2, longest.stop-minLen+4);
997 
998 		assert(pgm!=null) : pgm;
999 		assert(pgm.statsCDS!=null) : pgm;
1000 		assert(pgm.statsCDS.inner!=null) : pgm.statsCDS;
1001 		assert(pgm.statsCDS.inner.k>0) : pgm.statsCDS.inner;
1002 
1003 		final int k=innerStats.k;
1004 		final int mask=~((-1)<<(2*k));
1005 
1006 		final float stopScore=stopStats.scorePoint(longest.stop, bases);
1007 		stCds.geneStopScoreSum+=stopScore;
1008 		stCds.geneStopScoreCount++;
1009 
1010 		ArrayList<Orf> broken=new ArrayList<Orf>();
1011 		int created=0;
1012 
1013 		int codon=0;
1014 		int kmer=0;
1015 		int len=0;
1016 		int numKmers=0;
1017 		float currentScore=0;
1018 		for(int pos=start, currentFrame=0; pos<=stop; pos++){
1019 			final byte b=bases[pos];
1020 			final int x=AminoAcid.baseToNumber[b];
1021 			codon=((codon<<2)|x);
1022 			kmer=((kmer<<2)|x)&mask;
1023 
1024 			if(x>=0){
1025 				len++;
1026 				if(len>=k){
1027 					float prob=innerStats.probs[currentFrame][kmer];
1028 					float dif=prob-0.99f;//Prob above 1 is more likely than average
1029 					currentScore+=dif;
1030 //					outstream.println("pos="+pos+"\tdif="+String.format(Locale.ROOT, "%.4f", dif)+",\tscore="+String.format(Locale.ROOT, "%.4f", currentScore)+
1031 //							"\tasStart="+String.format(Locale.ROOT, "%.4f", pgm.calcStartScore(pos-2, bases))+"\tasStop="+String.format(Locale.ROOT, "%.4f", stopStats.scorePoint(pos, bases))+
1032 //							"\tcodon="+AminoAcid.kmerToString(kmer, 3)+" frame="+(currentFrame));
1033 				}else{
1034 //					outstream.println("pos="+pos+"\tdif="+String.format(Locale.ROOT, "%.4f", 0.0)+",\tscore="+String.format(Locale.ROOT, "%.4f", 0.0)+
1035 //							"\tasStart="+String.format(Locale.ROOT, "%.4f", pgm.calcStartScore(pos-2, bases))+"\tasStop="+String.format(Locale.ROOT, "%.4f", stopStats.scorePoint(pos, bases))+
1036 //							"\tcodon="+AminoAcid.kmerToString(kmer, 3)+" frame="+(currentFrame));
1037 				}
1038 			}else{
1039 				len=0;
1040 				kmer=0;
1041 			}
1042 
1043 			currentFrame++;
1044 //			outstream.println("pos="+pos+", codon="+AminoAcid.kmerToString(kmer, 3)+", frame="+currentFrame+", start="+start+", isStartCodon="+pgm.isStartCodon(codon));
1045 			if(currentFrame>2){
1046 				currentFrame=0;
1047 				if(pos<max && created<breakLimit && (pos==start+2 || pgm.isStartCodon(codon))){
1048 //					outstream.println(x);
1049 					int glen=stop-pos+3;
1050 					assert(glen>=minLen) : "glen="+glen+", minLen="+minLen+", pos="+pos+", max="+max+", start="+start;
1051 
1052 					int oStart=pos-2;
1053 					float startScore=startStats.scorePoint(oStart, bases);
1054 
1055 					stCds.geneStartScoreSum+=startScore;
1056 					stCds.geneStartScoreCount++;
1057 
1058 					stCds.lengthSum+=(stop-(pos-2)+1);
1059 					stCds.lengthCount++;
1060 
1061 					if((startScore>=minStartScore || pos<6) /* && stopScore>=minStopScore /*|| broken.isEmpty()*/){
1062 						Orf orf=new Orf(name, pos-2, stop, strand, longest.frame, bases, false, longest.type);
1063 
1064 						geneStartsMade++;
1065 						orf.kmerScore=currentScore;
1066 						orf.startScore=startScore;
1067 						orf.stopScore=stopScore;
1068 
1069 						assert(orf.frame==longest.frame);
1070 						assert(orf.strand==strand);
1071 
1072 						if(strand==1){orf.flip();}
1073 						broken.add(orf);
1074 						created++;
1075 					}
1076 				}
1077 				codon=0;
1078 			}
1079 		}
1080 
1081 		final int size=broken.size();
1082 		final int sizeCutoff=Tools.max(5, size/2);
1083 		if(size<1){return broken;}
1084 		Orf best=broken.get(0);
1085 		Orf bestStart=broken.get(0);
1086 		for(int i=0; i<size; i++){
1087 			Orf orf=broken.get(i);
1088 
1089 			//This fixes scores because they were generated together, from start to stop, to make this O(N) instead of O(N^2).
1090 			orf.kmerScore=currentScore-orf.kmerScore;
1091 			orf.orfScore=orf.calcOrfScore();
1092 			if(orf.orfScore>=best.orfScore){best=orf;}
1093 			if(orf.startScore>=bestStart.startScore){bestStart=orf;}
1094 
1095 			stCds.geneInnerScoreSum+=orf.averageKmerScore();
1096 			stCds.geneInnerScoreCount++;
1097 		}
1098 
1099 		//Sort to by score descending to eliminate low-scoring copies.
1100 		if(broken.size()>1){Collections.sort(broken, Feature.featureComparatorScore);}
1101 
1102 		int removed=0;
1103 		for(int i=0; i<size; i++){//Fix scores because they were generated together, from start to stop, to make this O(N) instead of O(N^2).
1104 			Orf orf=broken.get(i);
1105 			if(orf.averageKmerScore()<minInnerScore || orf.orfScore<minOrfScore ||
1106 					orf.orfScore/orf.length()<minAvgScore ||
1107 					orf.orfScore<0.5f*best.orfScore-10 || (orf.startScore<bestStart.startScore-0.55f && orf.kmerScore<best.kmerScore*1.1f && orf!=best)){
1108 				broken.set(i, null);
1109 				removed++;
1110 			}else if(i>sizeCutoff){
1111 				broken.set(i, null);
1112 				removed++;
1113 			}
1114 		}
1115 		if(removed>0){
1116 			Tools.condenseStrict(broken);
1117 		}
1118 
1119 		geneStartsRetained+=broken.size();
1120 		geneStopsRetained+=(broken.size()>0 ? 1 : 0);
1121 
1122 		if(flipped==1){longest.flip();}
1123 		return broken;
1124 	}
1125 
1126 	/*--------------------------------------------------------------*/
1127 	/*----------------            Fields            ----------------*/
1128 	/*--------------------------------------------------------------*/
1129 
1130 	/**
1131 	 * Current gene model.
1132 	 * TODO: Dynamically swap this as needed for contigs with varying GC.
1133 	 */
1134 	GeneModel pgm;
1135 
1136 	//Gene-calling cutoffs
1137 	final int minLen;
1138 	final int maxOverlapSameStrand;
1139 	final int maxOverlapOppositeStrand;
1140 	final float minStartScore;
1141 	final float minStopScore;
1142 	final float minInnerScore;
1143 	final float minOrfScore;
1144 	final float minAvgScore;
1145 
1146 //	static float[] cutoff1=new float[] {0, 40f, 200f, 150f, 45f};
1147 //	static float[] cutoff2=new float[] {0, 44f, 300f, 150f, 60f};
1148 //	static float[] cutoff3=new float[] {0, 1.7f, 1.5f, 1.4f, 1.8f};
1149 //	static float[] cutoff4=new float[] {0, 1.6f, 1.5f, 0.6f, 1.1f};
1150 //	static float[] cutoff5=new float[] {0, 1.0f, 1.0f, 1.0f, 1.55f};//inner score
1151 //	static float[] scoreMult=new float[] {1f, 8f, 200f, 175f, 12f};
1152 //	static float[] biases=new float[] {1f, 1.17f, 1.2f, 1.2f, 1.15f};
1153 
1154 //	static float[] cutoff1=new float[] {0, 40f, 140f, 150f, 45f};
1155 //	static float[] cutoff2=new float[] {0, 44f, 300f, 150f, 45f};
1156 //	static float[] cutoff3=new float[] {0, 1.7f, 1.1f, 1.3f, 1.8f};
1157 //	static float[] cutoff4=new float[] {0, 1.6f, 1.3f, 0.5f, 1.1f};
1158 //	static float[] cutoff5=new float[] {0, 1.0f, 1.0f, 1.0f, 1.3f};//inner score
1159 //	static float[] scoreMult=new float[] {1f, 8f, 200f, 175f, 12f};
1160 //	static float[] biases=new float[] {1f, 1.17f, 1.2f, 1.2f, 1.15f};
1161 
1162 	//for k=4,2,2
1163 //	static float[] cutoff1=new float[] {0, 40f, 140f, 150f, 40f};
1164 //	static float[] cutoff2=new float[] {0, 44f, 300f, 150f, 45f};
1165 //	static float[] cutoff3=new float[] {0, 1.7f, 1.1f, 1.1f, 1.8f};
1166 //	static float[] cutoff4=new float[] {0, 1.6f, 1.3f, 0.45f, 1.1f};
1167 //	static float[] cutoff5=new float[] {0, 1.0f, 1.0f, 1.0f, 1.3f};//inner score
1168 //	static float[] scoreMult=new float[] {1f, 8f, 200f, 175f, 12f};
1169 //	static float[] biases=new float[] {1f, 1.17f, 1.2f, 1.2f, 1.22f};
1170 
1171 //	//for k=5,3,3
1172 //	static float[] cutoff1=new float[] {0, 40f, 300f, 300f, 135f};//sum score
1173 //	static float[] cutoff2=new float[] {0, 44f, 300f, 400f, 100f};//orf score
1174 //	static float[] cutoff3=new float[] {0, 1.7f, 1.5f, 1.5f, 3.5f};//start score
1175 //	static float[] cutoff4=new float[] {0, 1.6f, 1.5f, 1.4f, 1.8f};//stop score
1176 //	static float[] cutoff5=new float[] {0, 1.0f, 1.1f, 1.15f, 1.4f};//inner score
1177 //	static float[] scoreMult=new float[] {1f, 8f, 80f, 120f, 5f};//score mult
1178 //	static float[] biases=new float[] {1f, 1.17f, 1.2f, 1.2f, 1.22f};//bias for sum score
1179 
1180 	//for k=6,3,3 - this has low scores for correct 23s stops; may try k=2 or k=4 for that end
1181 	//Also, 5S seems to score low very for archaea
1182 //	public static int CDS=0, tRNA=1, /*r16S=2,*/ r23S=3, r5S=4, r18S=5, r28S=6, RNA=7;
1183 //	static float[] cutoff1=new float[] {0, 100f, 600f, 400f, 300f};//sum score
1184 //	static float[] cutoff2=new float[] {0, 48f, 300f, 300f, 32f};//orf score
1185 //	static float[] cutoff3=new float[] {0, 3.0f, 1.8f, 1.6f, 4.0f};//start score
1186 //	static float[] cutoff4=new float[] {0, 2.8f, 2.0f, 0.90f, 1.9f};//stop score
1187 //	static float[] cutoff5=new float[] {0, 2.8f, 1.1f, 1.55f, 1.4f};//inner score
1188 //	static float[] scoreMult=new float[] {1f, 1f, 35f, 80f, 1.0f};//score mult
1189 //	static float[] biases=new float[] {1f, 1.25f, 1.30f, 1.30f, 1.22f};//16S bias for sum score: 1.25 best for archs, 1.4 best for bacts
1190 
1191 ////	{"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"};
1192 //	static float[] cutoff1=new float[] {0, 90f, 300f, 400f, 270f, 300f};//sum score
1193 //	static float[] cutoff2=new float[] {0, 48f, 300f, 300f, 32f, 300f};//orf score
1194 //	static float[] cutoff3=new float[] {0, 2.8f, 1.65f, 1.6f, 3.8f, 1.65f};//start score
1195 //	static float[] cutoff4=new float[] {0, 2.6f, 1.70f, 0.90f, 1.8f, 1.70f};//stop score
1196 //	static float[] cutoff5=new float[] {0, 2.8f, 1.70f, 1.55f, 1.4f, 1.70f};//inner score
1197 //	static float[] scoreMult=new float[] {1f, 1f, 35f, 80f, 1.0f, 35f};//score mult
1198 //	static float[] biases=new float[] {1f, 1.50f, 1.30f, 1.30f, 1.30f, 1.30f};
1199 
1200 ////	{"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"};
1201 //	static float[] cutoff1=new float[] {0, 90f, 300f, 400f, 90f, 300f};//sum score
1202 //	static float[] cutoff2=new float[] {0, 48f, 300f, 300f, 24f, 300f};//orf score
1203 //	static float[] cutoff3=new float[] {0, 2.8f, 1.65f, 1.6f, 2.0f, 1.65f};//start score
1204 //	static float[] cutoff4=new float[] {0, 2.6f, 1.70f, 0.90f, 1.0f, 1.70f};//stop score
1205 //	static float[] cutoff5=new float[] {0, 2.8f, 1.70f, 1.55f, 2.6f, 1.70f};//inner score
1206 //	static float[] scoreMult=new float[] {1f, 1f, 35f, 80f, 1.25f, 35f};//score mult
1207 //	static float[] biases=new float[] {1f, 1.50f, 1.30f, 1.30f, 1.50f, 1.30f};
1208 
1209 ////	{"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"};
1210 //	static float[] cutoff1=new float[] {0, 90f, 300f, 400f, 100f, 300f};//sum score
1211 //	static float[] cutoff2=new float[] {0, 48f, 300f, 300f, 32f, 300f};//orf score
1212 //	static float[] cutoff3=new float[] {0, 2.8f, 1.65f, 1.6f, 1.8f, 1.65f};//start score
1213 //	static float[] cutoff4=new float[] {0, 2.6f, 1.70f, 0.90f, 1.0f, 1.70f};//stop score
1214 //	static float[] cutoff5=new float[] {0, 2.8f, 1.70f, 1.55f, 2.6f, 1.70f};//inner score
1215 //	static float[] scoreMult=new float[] {1f, 1f, 35f, 80f, 1.25f, 35f};//score mult
1216 //	static float[] biases=new float[] {1f, 1.50f, 1.30f, 1.30f, 1.55f, 1.30f};
1217 
1218 //	{"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"};
1219 	static float[] cutoff1=new float[] {0, 20f, 300f, 400f, 100f, 400f};//sum score
1220 	static float[] cutoff2=new float[] {0, 36f, 300f, 300f, 32f, 300f};//orf score //tRNA is very sensitive here
1221 	static float[] cutoff3=new float[] {0, 2.4f, 1.65f, 1.6f, 1.8f, 1.5f};//start score
1222 	static float[] cutoff4=new float[] {0, 1.5f, 1.70f, 0.90f, 1.0f, 1.1f};//stop score
1223 	static float[] cutoff5=new float[] {0, 2.2f, 1.70f, 1.55f, 2.6f, 1.5f};//inner score
1224 	static float[] scoreMult=new float[] {1f, 1.0f, 35f, 80f, 1.25f, 35f};//score mult
1225 	static float[] biases=new float[] {1f, 1.45f, 1.30f, 1.30f, 1.55f, 1.50f};
1226 
1227 	long geneStopsMade=0;
1228 	long geneStartsMade=0;
1229 	long geneStartsRetained=0;
1230 	long geneStopsRetained=0;
1231 	long geneStartsOut=0;
1232 
1233 	long tRNAOut=0;
1234 	long r16SOut=0;
1235 	long r23SOut=0;
1236 	long r5SOut=0;
1237 	long r18SOut=0;
1238 
1239 	ScoreTracker stCds=new ScoreTracker(CDS);
1240 	ScoreTracker stCds2=new ScoreTracker(CDS);
1241 	ScoreTracker stCdsPass=new ScoreTracker(CDS);
1242 	ScoreTracker sttRNA=new ScoreTracker(tRNA);
1243 	ScoreTracker st16s=new ScoreTracker(r16S);
1244 	ScoreTracker st23s=new ScoreTracker(r23S);
1245 	ScoreTracker st5s=new ScoreTracker(r5S);
1246 	ScoreTracker st18s=new ScoreTracker(r18S);
1247 
1248 	ScoreTracker[] trackers=new ScoreTracker[] {stCdsPass, sttRNA, st16s, st23s, st5s, st18s};
1249 
getSSA()1250 	public static SingleStateAlignerFlat2 getSSA(){
1251 		SingleStateAlignerFlat2 ssa=localSSA.get();
1252 		if(ssa==null){
1253 			synchronized(localSSA){
1254 				ssa=localSSA.get();
1255 				if(ssa==null){
1256 					ssa=new SingleStateAlignerFlat2();
1257 					localSSA.set(ssa);
1258 				}
1259 			}
1260 		}
1261 		return ssa;
1262 	}
1263 
getSSA3()1264 	public static SingleStateAlignerFlat3 getSSA3(){
1265 		SingleStateAlignerFlat3 ssa=localSSA3.get();
1266 		if(ssa==null){
1267 			synchronized(localSSA3){
1268 				ssa=localSSA3.get();
1269 				if(ssa==null){
1270 					ssa=new SingleStateAlignerFlat3();
1271 					localSSA3.set(ssa);
1272 				}
1273 			}
1274 		}
1275 		return ssa;
1276 	}
1277 
getSSAF()1278 	public static SingleStateAlignerFlatFloat getSSAF(){
1279 		SingleStateAlignerFlatFloat ssa=localSSAF.get();
1280 		if(ssa==null){
1281 			synchronized(localSSAF){
1282 				ssa=localSSAF.get();
1283 				if(ssa==null){
1284 					ssa=new SingleStateAlignerFlatFloat();
1285 					localSSAF.set(ssa);
1286 				}
1287 			}
1288 		}
1289 		return ssa;
1290 	}
1291 
1292 	private static ThreadLocal<SingleStateAlignerFlat2> localSSA=new ThreadLocal<SingleStateAlignerFlat2>();
1293 	private static ThreadLocal<SingleStateAlignerFlat3> localSSA3=new ThreadLocal<SingleStateAlignerFlat3>();
1294 	private static ThreadLocal<SingleStateAlignerFlatFloat> localSSAF=new ThreadLocal<SingleStateAlignerFlatFloat>();
1295 //	public static int maxAlignmentEndpointDifference=15;
1296 	public static int alignmentPadding=300;
1297 
1298 	public static int breakLimit=12;
1299 	public static int lookbackPlus=70;
1300 	public static int lookbackMinus=25;
1301 
1302 //	pathScore+=p0+p1*(Tools.mid(p5*(p2+pathLength), p6*(p3-pathLength), p4));
1303 
1304 	//Operon length modifiers for same strand
1305 	public static float p0=-30f;
1306 	public static float p1=-0.35f; //Sensitive - higher decreases FP and TP
1307 	public static float p2=4.0f;//Insensitive - higher positive decreases FP and TP
1308 	public static float p3=12f; //Higher decreases FP (substantially) and TP (slightly)
1309 	public static float p4=-10f; //Lower decreases FP (weakly) and TP (greatly)
1310 	public static float p5=2.0f; //Insensitive - lower increases FP and TP
1311 	public static float p6=2f; //Greater increases FP and TP
1312 
1313 //	pathScore+=q1+Tools.mid(q2*prevLength, q3+q4*prevLength, q5);
1314 
1315 	//Operon length modifiers for opposite strand
1316 	public static float q1=-36f;
1317 	public static float q2=-1.6f; //q2 and q4 must have opposite signs
1318 	public static float q3=-12f;
1319 	public static float q4=3.0f; //(Lower [even negative] decreases FP and TP)
1320 	public static float q5=-40f;
1321 
1322 	private static PrintStream outstream=System.err;
1323 	static boolean verbose;
1324 
1325 }
1326