1 package prok;
2 
3 import java.io.PrintStream;
4 import java.util.ArrayList;
5 import java.util.BitSet;
6 import java.util.HashMap;
7 
8 import aligner.SingleStateAlignerFlat2;
9 import dna.AminoAcid;
10 import fileIO.FileFormat;
11 import gff.GffLine;
12 import shared.KillSwitch;
13 import shared.Shared;
14 import shared.Tools;
15 import stream.Read;
16 import stream.ReadInputStream;
17 import structures.ByteBuilder;
18 import structures.IntList;
19 
20 /**
21  * This class is designed to store kmer frequencies related to gene
22  * starts, stops, and interiors.  It can be loaded from a pgm file.
23  *
24  * It's possible to use multiple GeneModels; for example, one for
25  * each of several GC ranges or clades.
26  * @author Brian Bushnell
27  * @date Sep 24, 2018
28  *
29  */
30 public class GeneModel extends ProkObject {
31 
32 	/*--------------------------------------------------------------*/
33 	/*----------------        Initialization        ----------------*/
34 	/*--------------------------------------------------------------*/
35 
GeneModel(boolean fill)36 	public GeneModel(boolean fill){
37 		if(fill){
38 			fillContainers();
39 		}
40 	}
41 
fillContainers()42 	void fillContainers(){
43 		statsCDS.setInner(kInnerCDS, 3);
44 		statsCDS.setStart(kStartCDS, startFrames, startLeftOffset);
45 		statsCDS.setStop(kStopCDS, stopFrames, stopLeftOffset);
46 
47 		for(int i=0; i<rnaContainers.length; i++){
48 			StatsContainer sc=rnaContainers[i];
49 			sc.setInner(kInnerRNA, 1);
50 		}
51 
52 		statstRNA.setStart(kStartRNA, 14, 4);
53 		statstRNA.setStop(kStopRNA, 14, 6);
54 
55 		stats16S.setStart(kStartRNA, 20, 7);
56 		stats16S.setStop(kStopRNA, 12, 16);
57 
58 		stats23S.setStart(kStartRNA, 17, 3);
59 		stats23S.setStop(kStopRNA, 15, 12);
60 
61 		stats5S.setStart(kStartRNA, 20, 5);
62 		stats5S.setStop(kStopRNA, 15, 5);
63 
64 		stats18S.setStart(kStartRNA, 20, 7);//TODO: 18S bounds are untested and should be empirically determined
65 		stats18S.setStop(kStopRNA, 12, 16);
66 	}
67 
68 	/*--------------------------------------------------------------*/
69 	/*----------------        Public Methods        ----------------*/
70 	/*--------------------------------------------------------------*/
71 
process(String genomeFname, String gffFname)72 	public boolean process(String genomeFname, String gffFname){
73 //		fnames.add(ReadWrite.stripPath(genomeFname));
74 		numFiles++;
75 		FileFormat fnaFile=FileFormat.testInput(genomeFname, FileFormat.FA, null, true, true);
76 		FileFormat gffFile=FileFormat.testInput(gffFname, FileFormat.GFF, null, true, true);
77 
78 		if(fnaFile==null || gffFile==null){
79 			errorState=true;
80 			return true;
81 		}
82 		filesProcessed++;
83 
84 		ArrayList<ScafData> scafList;
85 		{//Scoped to save memory
86 			ArrayList<Read> reads=ReadInputStream.toReads(fnaFile, maxReads);
87 			readsProcessed+=reads.size();
88 			scafList=new ArrayList<ScafData>(reads.size());
89 			for(Read r : reads){
90 				basesProcessed+=r.length();
91 				scafList.add(new ScafData(r));
92 			}
93 		}
94 		{//Scoped to save memory
95 			ArrayList<GffLine>[] allGffLines=GffLine.loadGffFileByType(gffFile, "CDS,rRNA,tRNA", true);
96 			ArrayList<GffLine> cds=allGffLines[0];
97 			ArrayList<GffLine> rrna=allGffLines[1];
98 			ArrayList<GffLine> trna=allGffLines[2];
99 			genesProcessed+=cds.size();
100 			genesProcessed+=(rrna==null ? 0 : rrna.size());
101 			genesProcessed+=(trna==null ? 0 : trna.size());
102 
103 			HashMap<String, ScafData> scafMap=makeScafMap(scafList);
104 			fillScafDataCDS(cds, scafMap);
105 			fillScafDataRNA(rrna, scafMap);
106 			fillScafDataRNA(trna, scafMap);
107 		}
108 
109 		countBases(scafList);
110 		if(PROCESS_PLUS_STRAND){
111 			processStrand(scafList, Shared.PLUS);
112 		}
113 		if(PROCESS_MINUS_STRAND){
114 			for(ScafData sd : scafList){
115 				sd.clear();
116 				sd.reverseComplement();
117 			}
118 			processStrand(scafList, Shared.MINUS);
119 			for(ScafData sd : scafList){
120 				sd.clear();
121 				sd.reverseComplement();
122 			}
123 		}
124 		return false;
125 	}
126 
add(GeneModel pgm)127 	public void add(GeneModel pgm){
128 		for(int i=0; i<allContainers.length; i++){
129 //			System.err.println("merging "+allContainers[i].name);
130 			allContainers[i].add(pgm.allContainers[i]);
131 		}
132 
133 		readsProcessed+=pgm.readsProcessed;
134 		basesProcessed+=pgm.basesProcessed;
135 		genesProcessed+=pgm.genesProcessed;
136 		filesProcessed+=pgm.filesProcessed;
137 
138 //		geneStartsProcessed+=pgm.geneStartsProcessed;
139 //		tRNAProcessed+=pgm.tRNAProcessed;
140 //		r16SProcessed+=pgm.r16SProcessed;
141 //		r23SProcessed+=pgm.r23SProcessed;
142 //		r5SProcessed+=pgm.r5SProcessed;
143 //		r18SProcessed+=pgm.r18SProcessed;
144 
145 //		fnames.addAll(pgm.fnames);
146 		numFiles+=pgm.numFiles;
147 		taxIds.addAll(pgm.taxIds);
148 		Tools.add(baseCounts, pgm.baseCounts);
149 	}
150 
multiplyBy(double mult)151 	public void multiplyBy(double mult) {
152 		for(int i=0; i<allContainers.length; i++){
153 			allContainers[i].multiplyBy(mult);
154 		}
155 
156 		readsProcessed=Math.round(readsProcessed*mult);
157 		basesProcessed=Math.round(basesProcessed*mult);
158 		genesProcessed=Math.round(genesProcessed*mult);
159 		filesProcessed=Math.round(filesProcessed*mult);
160 
161 		for(int i=0; i<baseCounts.length; i++){
162 			baseCounts[i]=Math.round(baseCounts[i]*mult);
163 		}
164 	}
165 
166 	/*--------------------------------------------------------------*/
167 	/*----------------        Outer Methods         ----------------*/
168 	/*--------------------------------------------------------------*/
169 
gc()170 	public float gc(){
171 		long a=baseCounts[0];
172 		long c=baseCounts[1];
173 		long g=baseCounts[2];
174 		long t=baseCounts[3];
175 		return (float)((g+c)/Tools.max(1.0, a+t+g+c));
176 	}
177 
makeScafMap(ArrayList<ScafData> scafList)178 	HashMap<String, ScafData> makeScafMap(ArrayList<ScafData> scafList){
179 		HashMap<String, ScafData> scafMap=new HashMap<String, ScafData>(scafList.size()*3);
180 		for(ScafData sd : scafList){scafMap.put(sd.name, sd);}
181 		for(ScafData sd : scafList){
182 			String name=sd.name;
183 			int idx=name.indexOf(' ');
184 			if(idx>=0){
185 				String prefix=name.substring(0, idx);
186 				if(scafMap.containsKey(prefix)){
187 					assert(false) : "Duplicate degenerate name: '"+name+"', '"+prefix+"'";
188 				}else{
189 					scafMap.put(prefix, sd);
190 				}
191 			}
192 		}
193 		return scafMap;
194 	}
195 
fillScafDataCDS(ArrayList<GffLine> cdsLines, HashMap<String, ScafData> scafMap)196 	public void fillScafDataCDS(ArrayList<GffLine> cdsLines, HashMap<String, ScafData> scafMap){
197 		if(!callCDS){return;}
198 		for(GffLine gline : cdsLines){
199 			ScafData sd=scafMap.get(gline.seqid);
200 			assert(sd!=null) : "Can't find scaffold for GffLine "+gline.seqid;
201 			sd.addCDS(gline);
202 		}
203 	}
204 
fillScafDataRNA(ArrayList<GffLine> rnaLines, HashMap<String, ScafData> scafMap)205 	public void fillScafDataRNA(ArrayList<GffLine> rnaLines, HashMap<String, ScafData> scafMap){
206 		for(GffLine gline : rnaLines){
207 			ScafData sd=scafMap.get(gline.seqid);
208 			assert(sd!=null) : "Can't find scaffold for GffLine "+gline.seqid;
209 			if(processType(gline.prokType())){
210 				sd.addRNA(gline);
211 			}
212 		}
213 	}
214 
processStrand(ArrayList<ScafData> scafList, int strand)215 	public void processStrand(ArrayList<ScafData> scafList, int strand){
216 		for(ScafData sd : scafList){
217 			processCDS(sd, strand);
218 			processRNA(sd, strand);
219 		}
220 	}
221 
222 	/*--------------------------------------------------------------*/
223 	/*----------------        Inner Methods         ----------------*/
224 	/*--------------------------------------------------------------*/
225 
countBases(ArrayList<ScafData> scafList)226 	private void countBases(ArrayList<ScafData> scafList){
227 		for(ScafData sd : scafList){
228 			countBases(sd.bases);
229 		}
230 	}
231 
countBases(byte[] bases)232 	private void countBases(byte[] bases){
233 		for(byte b : bases){
234 			int x=AminoAcid.baseToNumberACGTother[b];
235 			baseCounts[x]++;
236 		}
237 	}
238 
239 	/*--------------------------------------------------------------*/
240 	/*----------------        Finding Codons        ----------------*/
241 	/*--------------------------------------------------------------*/
242 
findStopCodons(byte[] bases, IntList list, BitSet valid)243 	private static void findStopCodons(byte[] bases, IntList list, BitSet valid){
244 		final int k=3;
245 		final int mask=~((-1)<<(2*k));
246 		int kmer=0;
247 		int len=0;
248 
249 		for(int i=0; i<bases.length; i++){
250 			byte b=bases[i];
251 			int x=AminoAcid.baseToNumber[b];
252 			kmer=((kmer<<2)|x)&mask;
253 			if(x>=0){
254 				len++;
255 				if(len>=k){
256 					int point=i;//End of the stop codon
257 					if(isStopCodon(kmer) && !valid.get(point)){
258 						list.add(point);
259 						valid.set(point);
260 					}
261 				}
262 			}else{len=0;}
263 		}
264 
265 		for(int i=50; i<bases.length-3; i+=2000){//Add some non-canonical sites, aka noise
266 			if(!valid.get(i)){
267 				list.add(i);
268 			}
269 		}
270 	}
271 
findStartCodons(byte[] bases, IntList list, BitSet valid)272 	private static void findStartCodons(byte[] bases, IntList list, BitSet valid){
273 		final int k=3;
274 		final int mask=~((-1)<<(2*k));
275 		int kmer=0;
276 		int len=0;
277 
278 		for(int i=0; i<bases.length; i++){
279 			byte b=bases[i];
280 			int x=AminoAcid.baseToNumber[b];
281 			kmer=((kmer<<2)|x)&mask;
282 			if(x>=0){
283 				len++;
284 				if(len>=k){
285 					int point=i-k+1;//Start of the start codon
286 					if(isStartCodon(kmer) && !valid.get(point)){
287 						list.add(point);
288 						valid.set(point);
289 					}
290 				}
291 			}else{len=0;}
292 		}
293 
294 		for(int i=50; i<bases.length-3; i+=2000){//Add some non-canonical sites, aka noise
295 			if(!valid.get(i)){
296 				list.add(i);
297 			}
298 		}
299 	}
300 
301 	/*--------------------------------------------------------------*/
302 	/*----------------     Processing GffLines      ----------------*/
303 	/*--------------------------------------------------------------*/
304 
processGene(GffLine gline, ScafData sd)305 	private static void processGene(GffLine gline, ScafData sd){
306 		if(gline.length()<2){return;}
307 		final int strand=gline.strand;
308 		assert(strand==sd.strand());
309 		final byte[] frames=sd.frames;
310 		int start=gline.start-1, stop=gline.stop-1;
311 		if(start<0 || stop>=sd.length()){return;}
312 //		assert(start<stop) : gline; //Not always true for euks...
313 		if(strand==Shared.MINUS){
314 			int x=sd.length()-start-1;
315 			int y=sd.length()-stop-1;
316 			start=y;
317 			stop=x;
318 
319 //			String a=new String(sd.bases, start, 3);
320 //			String b=new String(sd.bases, stop-2, 3);
321 ////			assert(false) : start+", "+stop+"\n"+gline+"\n"+new String(sd.bases, start, 3)+", "+new String(sd.bases, stop-2, 3);
322 //			outstream.println(a+", "+b+", "+start+", "+stop);
323 		}
324 		assert(start>=0) : gline.toString()+"\n"+sd.length()+"\n"+sd.name;
325 		markFrames(start, stop, frames, kInnerCDS);
326 		sd.starts.add(start);
327 		sd.stops.add(stop);
328 //		assert(gline.start!=337) : gline+"\n"+start+", "+stop;
329 	}
330 
processRnaLine(final GffLine gline, final ScafData sd, final int type)331 	private boolean processRnaLine(final GffLine gline, final ScafData sd, final int type){
332 		final int strand=gline.strand;
333 		assert(strand==sd.strand());
334 		final byte[] frames=sd.frames;
335 		int start=gline.start-1, stop=gline.stop-1;
336 		if(start<0 || stop>=sd.length()){return false;}
337 		assert(start<=stop);
338 		if(strand==Shared.MINUS){
339 			int x=sd.length()-start-1;
340 			int y=sd.length()-stop-1;
341 			start=y;
342 			stop=x;
343 		}
344 
345 		if(AnalyzeGenes.alignRibo){
346 //			byte[] seq=sd.fetch(start, stop);
347 			Read[] consensusReads=ProkObject.consensusReads(type);
348 			byte[] universal=(consensusReads!=null && consensusReads.length>0 ? consensusReads[0].bases : null);
349 			float minIdentity=ProkObject.minID(type);
350 			if(universal!=null){
351 				int[] coords=KillSwitch.allocInt1D(2);
352 				final int a=Tools.max(0, start-(AnalyzeGenes.adjustEndpoints ? 200 : 50));
353 				final int b=Tools.min(sd.bases.length-1, stop+(AnalyzeGenes.adjustEndpoints ? 200 : 50));
354 				float id1=align(universal, sd.bases, a, b, minIdentity, coords);
355 				final int rstart=coords[0], rstop=coords[1];
356 //				assert(false) : rstart+", "+rstop+", "+(rstop-rstart+1)+", "+start+", "+stop;
357 				if(id1<minIdentity){
358 //					System.err.println("Low identity: "+String.format("%.2s", 100*id1));
359 					return false;
360 				}else{
361 //					System.err.println("Good identity: "+String.format("%.2s", 100*id1));
362 				}
363 				if(AnalyzeGenes.adjustEndpoints){
364 					int startSlop=startSlop(type);
365 					int stopSlop=stopSlop(type);
366 					if(Tools.absdif(start, rstart)>startSlop){
367 //						System.err.println("rstart:\t"+start+" -> "+rstart);
368 						start=rstart;
369 					}
370 					if(Tools.absdif(stop, rstop)>stopSlop){
371 //						System.err.println("rstop: \t"+stop+" -> "+rstop);
372 						stop=rstop;
373 					}
374 				}
375 			}
376 		}
377 
378 		StatsContainer sc=allContainers[type];
379 		sc.start.processPoint(sd.bases, start, 1);
380 		sc.stop.processPoint(sd.bases, stop, 1);
381 		assert(sc!=statsCDS);
382 
383 		assert(start>=0) : gline.toString()+"\n"+sd.length()+"\n"+sd.name;
384 		final byte flag=typeToFlag(type);
385 		for(int i=start+kInnerRNA-1; i<=stop; i++){
386 			frames[i]|=flag;
387 		}
388 		return true;
389 	}
390 
align(byte[] query, byte[] ref, int start, int stop, float minIdentity, int[] coords)391 	private float align(byte[] query, byte[] ref, int start, int stop, float minIdentity, int[] coords){
392 //		final int a=0, b=ref.length-1;
393 		SingleStateAlignerFlat2 ssa=GeneCaller.getSSA();
394 		final int minScore=ssa.minScoreByIdentity(query.length, minIdentity);
395 		int[] max=ssa.fillUnlimited(query, ref, start, stop, minScore);
396 		if(max==null){return 0;}
397 
398 		final int rows=max[0];
399 		final int maxCol=max[1];
400 		final int maxState=max[2];
401 //		final int maxScore=max[3];
402 //		final int maxStart=max[4];
403 
404 		//returns {score, bestRefStart, bestRefStop}
405 		//padded: {score, bestRefStart, bestRefStop, padLeft, padRight};
406 		final int[] score=ssa.score(query, ref, start, stop, rows, maxCol, maxState);
407 		final int rstart=Tools.max(score[1], 0);
408 		final int rstop=Tools.min(score[2], ref.length-1);
409 		if(coords!=null){
410 			coords[0]=rstart;
411 			coords[1]=rstop;
412 		}
413 		final float id=ssa.tracebackIdentity(query, ref, start, stop, rows, maxCol, maxState, null);
414 		return id;
415 	}
416 
417 	/**
418 	 * Each frame byte has a bit marked for valid coding frames.
419 	 * For example, if frames[23]=0b100, then base 23 is the last base in a kmer starting at the 3rd base in a codon.
420 	 * If frames[23]=0, then no coding kmer end at that location on this strand.
421 	 * @param start
422 	 * @param stop
423 	 * @param frames
424 	 * @param k
425 	 */
markFrames(int start, int stop, byte[] frames, int k)426 	private static void markFrames(int start, int stop, byte[] frames, int k){
427 		assert(start<=stop) : start+", "+stop;
428 		for(int i=start+k-1, frameBit=(1<<((k-1)%3)), max=Tools.min(stop-3, frames.length-1); i<=max; i++){
429 			frames[i]=(byte)(frames[i]|frameBit);
430 			frameBit<<=1;
431 			if(frameBit>4){frameBit=1;}
432 		}
433 //		assert(false) : Arrays.toString(Arrays.copyOfRange(frames, start, start+20))+"\n"+start; //This is correct
434 	}
435 
436 	/*--------------------------------------------------------------*/
437 	/*----------------        Counting Kmers        ----------------*/
438 	/*--------------------------------------------------------------*/
439 
processCDS(ScafData sd, int strand)440 	private void processCDS(ScafData sd, int strand){
441 		if(!callCDS){return;}
442 		ArrayList<GffLine> glines=sd.cdsLines[strand];
443 		for(GffLine gline : glines){
444 			assert(gline.strand==strand);
445 			processGene(gline, sd);
446 			statsCDS.addLength(gline.length());
447 		}
448 
449 		statsCDS.inner.processCDSFrames(sd.bases, sd.frames);
450 		BitSet startSet=processEnds(sd.bases, statsCDS.start, sd.starts, 1);
451 		BitSet stopSet=processEnds(sd.bases, statsCDS.stop, sd.stops, 1);
452 //		outstream.println("Processed "+sd.starts.size+" valid starts and "+sd.stops.size+" stops.");
453 		sd.clear();
454 		findStartCodons(sd.bases, sd.starts, startSet);
455 		findStopCodons(sd.bases, sd.stops, stopSet);
456 //		outstream.println("Found "+sd.starts.size+" invalid starts and "+sd.stops.size+" stops.");
457 		processEnds(sd.bases, statsCDS.start, sd.starts, 0);
458 		processEnds(sd.bases, statsCDS.stop, sd.stops, 0);
459 	}
460 
getGlineType(GffLine gline, ScafData sd)461 	private static int getGlineType(GffLine gline, ScafData sd){
462 		if(!gline.inbounds(sd.length()) || gline.partial()){return -1;}
463 
464 		final int length=gline.length();
465 		final int type=gline.prokType();
466 		if(type<0){
467 			return type;
468 		}else if(type==CDS){
469 			return type;
470 		}else if(type==tRNA && length>=40 && length<=120){
471 			return type;
472 		}else if(type==r16S && length>=1440 && length<=1620){
473 			return type;
474 		}else if(type==r23S && length>=2720 && length<=3170){
475 			return type;
476 		}else if(type==r5S && length>=90 && length<=150){
477 			return type;
478 		}else if(type==r18S && length>=1400 && length<=2000){ //TODO: Check length range
479 			return type;
480 		}
481 		return -1;
482 	}
483 
processRNA(ScafData sd, int strand)484 	private void processRNA(ScafData sd, int strand){
485 		sd.clear();
486 		ArrayList<GffLine> lines=sd.rnaLines[strand];
487 		for(GffLine gline : lines){
488 			assert(gline.strand==strand);
489 			final int type=getGlineType(gline, sd);
490 			if(type>0){
491 				StatsContainer sc=allContainers[type];
492 				sc.addLength(gline.length());
493 				processRnaLine(gline, sd, type);
494 			}
495 		}
496 		processRnaInner(sd);
497 		processRnaEnds(sd);
498 	}
499 
processRnaInner(ScafData sd)500 	void processRnaInner(ScafData sd){
501 		byte[] bases=sd.bases;
502 		byte[] frames=sd.frames;
503 		final int k=kInnerRNA;//TODO: Note! This is linked to a single static variable for all RNAs.
504 		final int mask=~((-1)<<(2*k));
505 		int kmer=0;
506 		int len=0;
507 
508 		for(int i=0; i<bases.length; i++){
509 			byte b=bases[i];
510 			int x=AminoAcid.baseToNumber[b];
511 			kmer=((kmer<<2)|x)&mask;
512 			if(x>=0){
513 				len++;
514 				if(len>=k){
515 					int vf=frames[i];
516 					for(int type=0; type<5; type++){
517 						int valid=vf&1;
518 						rnaContainers[type].inner.add(kmer, 0, valid);
519 						vf=(vf>>1);
520 					}
521 				}
522 			}else{len=0;}
523 		}
524 	}
525 
processRnaEnds(ScafData sd)526 	void processRnaEnds(ScafData sd){
527 		byte[] bases=sd.bases;
528 
529 		final int k=stats16S.start.k;
530 		final int kMax=stats16S.start.kMax;
531 		final int mask=stats16S.start.mask;
532 		final long[] counts=new long[kMax];//TODO: Slow
533 
534 		int kmer=0;
535 		int len=0;
536 
537 		for(int i=0; i<bases.length; i++){
538 			byte b=bases[i];
539 			int x=AminoAcid.baseToNumber[b];
540 			kmer=((kmer<<2)|x)&mask;
541 
542 			if(x>=0){
543 				len++;
544 				if(len>=k){
545 					counts[kmer]++;
546 				}
547 			}else{len=0;}
548 		}
549 		for(StatsContainer sc : rnaContainers){
550 			FrameStats fs=sc.start;
551 			for(long[] array : fs.countsFalse){
552 				Tools.add(array, counts);
553 			}
554 			fs=sc.stop;
555 			for(long[] array : fs.countsFalse){
556 				Tools.add(array, counts);
557 			}
558 		}
559 	}
560 
processEnds(byte[] bases, FrameStats stats, IntList list, int valid)561 	private static BitSet processEnds(byte[] bases, FrameStats stats, IntList list, int valid){
562 		BitSet points=new BitSet(bases.length);
563 		for(int i=0; i<list.size; i++){
564 			int point=list.get(i);
565 			stats.processPoint(bases, list.get(i), valid);
566 			points.set(point);
567 		}
568 		return points;
569 	}
570 
571 	/*--------------------------------------------------------------*/
572 	/*----------------           Scoring            ----------------*/
573 	/*--------------------------------------------------------------*/
574 
575 //	//Assumes bases are in the correct strand
576 //	public float calcStartScore(int start, int stop, byte[] bases){
577 //		float f=scorePoint(start, bases, startStats);
578 ////		float ss=scoreStart2(start, bases, stop, innerKmerStats);
579 ////		if(ss>0){f=(f+0.0005f*ss);} //Does not seem to help; needs more study.
580 //		return f;
581 //	}
582 //
583 //	//Assumes bases are in the correct strand
584 //	public float calcStopScore(int stop, byte[] bases){
585 //		float f=scorePoint(stop, bases, stopStats);
586 //		return f;
587 //	}
588 //
589 //	//Assumes bases are in the correct strand
590 //	public float calcRnaStartScore(int start, int stop, byte[] bases){
591 //		float f=scorePoint(start, bases, rrnaStartStats);
592 //		return f;
593 //	}
594 //
595 //	//Assumes bases are in the correct strand
596 //	public float calcRnaStopScore(int stop, byte[] bases){
597 //		float f=scorePoint(stop, bases, rrnaStopStats);
598 //		return f;
599 //	}
600 
601 //	public static float calcKmerScore(int start, int stop, int startFrame, byte[] bases, FrameStats stats){
602 //
603 //		assert(stats.frames==3);
604 //		final int k=stats.k;
605 //		final int mask=~((-1)<<(2*k));
606 //
607 //		int kmer=0;
608 //		int len=0;
609 //		float score=0;
610 //		int numKmers=0;
611 //
612 //		for(int pos=start, currentFrame=startFrame; pos<stop; pos++){
613 //			final byte b=bases[pos];
614 //			final int x=AminoAcid.baseToNumber[b];
615 //
616 //			if(x>=0){
617 //				kmer=((kmer<<2)|x)&mask;
618 //				len++;
619 //				if(len>=k){
620 //					float prob=stats.probs[currentFrame][kmer];
621 //					float dif=prob-0.99f;//Prob above 1 is more likely than average
622 //					score+=dif;
623 //					numKmers++;
624 //				}
625 //			}else{
626 //				len=0;
627 //				kmer=0;
628 //			}
629 //
630 //			currentFrame++;
631 //			if(currentFrame>2){currentFrame=0;}
632 //		}
633 //		return score/Tools.max(1f, numKmers);
634 //	}
635 //
636 //	/**
637 //	 * TODO
638 //	 * Evaluate the relative difference between left and right frequencies.
639 //	 * The purpose is to find locations where the left side looks noncoding and the right side looks coding.
640 //	 * Does not currently yield useful results.
641 //	 */
642 //	public static float scoreStart2(int point, byte[] bases, int stop, FrameStats stats){
643 //		final int k=stats.k;
644 //
645 //		int start=point-45;
646 //		if(start<0 || stop>bases.length){return 0.5f;}
647 //
648 //		float left=calcKmerScore(start, Tools.min(point+k-2, bases.length), 0, bases, stats);
649 //		float right=calcKmerScore(point, stop-3, 0, bases, stats);
650 //		return right-left; //High numbers are likely to be starts; non-starts should be near 0.
651 //	}
652 
653 	/*--------------------------------------------------------------*/
654 	/*----------------           toString           ----------------*/
655 	/*--------------------------------------------------------------*/
656 
657 	@Override
toString()658 	public String toString(){
659 		return appendTo(new ByteBuilder()).toString();
660 	}
661 
appendTo(ByteBuilder bb)662 	public ByteBuilder appendTo(ByteBuilder bb){
663 
664 //		Collections.sort(fnames);
665 		taxIds.sort();
666 
667 		bb.append("#BBMap "+Shared.BBMAP_VERSION_STRING+" Prokaryotic Gene Model\n");
668 		bb.append("#files");
669 		bb.tab().append(numFiles);
670 //		if(fnames.size()>5){
671 //			bb.tab().append(fnames.size());
672 //		}else{
673 //			for(String fname : fnames){
674 //				bb.tab().append(fname);
675 //			}
676 //		}
677 		bb.nl();
678 		bb.append("#taxIDs");
679 		for(int i=0; i<taxIds.size; i++){
680 			bb.tab().append(taxIds.get(i));
681 		}
682 		bb.nl();
683 //		bb.append("#k_inner\t").append(innerKmerLength).nl();
684 //		bb.append("#k_end\t").append(endKmerLength).nl();
685 //		bb.append("#start_left_offset\t").append(startLeftOffset).nl();
686 //		bb.append("#start_right_offset\t").append(startRightOffset).nl();
687 //		bb.append("#stop_left_offset\t").append(stopLeftOffset).nl();
688 //		bb.append("#stop_right_offset\t").append(stopRightOffset).nl();
689 		bb.append("#scaffolds\t").append(readsProcessed).nl();
690 		bb.append("#bases\t").append(basesProcessed).nl();
691 		bb.append("#genes\t").append(genesProcessed).nl();
692 		bb.append("#GC\t").append(gc(),2).nl();
693 		bb.append("#ACGTN");
694 		for(long x : baseCounts){
695 			bb.tab().append(x);
696 		}
697 		bb.nl();
698 
699 		for(StatsContainer sc : allContainers){
700 			sc.appendTo(bb);
701 		}
702 		assert(allContainers.length>5) : allContainers.length;
703 
704 		return bb;
705 	}
706 
707 	/*--------------------------------------------------------------*/
708 	/*----------------            Stats             ----------------*/
709 	/*--------------------------------------------------------------*/
710 
711 	public final StatsContainer statsCDS=new StatsContainer(CDS);
712 	public final StatsContainer statstRNA=new StatsContainer(tRNA);
713 	public final StatsContainer stats16S=new StatsContainer(r16S);
714 	public final StatsContainer stats23S=new StatsContainer(r23S);
715 	public final StatsContainer stats5S=new StatsContainer(r5S);
716 	public final StatsContainer stats18S=new StatsContainer(r18S);
717 
718 	final StatsContainer[] rnaContainers=new StatsContainer[] {statstRNA, stats16S, stats23S, stats5S, stats18S};
719 	final StatsContainer[] allContainers=new StatsContainer[] {statsCDS, statstRNA, stats16S, stats23S, stats5S, stats18S};
720 	//public static int CDS=0, tRNA=1, r16S=2, r23S=3, r5S=4, r18S=5, r28S=6, RNA=7;
721 
722 //	public final FrameStats innerKmerStats=new FrameStats("innerKmerStats", innerKmerLength, 3, 0);
723 //	public final FrameStats startStats=new FrameStats("startStats", endKmerLength, startFrames, startLeftOffset);
724 //	public final FrameStats stopStats=new FrameStats("stopStats", endKmerLength, stopFrames, stopLeftOffset);
725 //
726 //	public final FrameStats rrnaStartStats=new FrameStats("rrnaStart", 2, 16, 8);
727 //	public final FrameStats rrnaStopStats=new FrameStats("rrnaStop", 2, 16, 8);
728 //
729 //	public final FrameStats trnaStats=new FrameStats("tRNA", rnaKmerLength, 1, 0);
730 //	public final FrameStats rrna16Sstats=new FrameStats("16S", rnaKmerLength, 1, 0);
731 //	public final FrameStats rrna23Sstats=new FrameStats("23S", rnaKmerLength, 1, 0);
732 //	public final FrameStats rrna5Sstats=new FrameStats("5S", rnaKmerLength, 1, 0);
733 //	public final FrameStats[] rnaKmerStats=new FrameStats[] {trnaStats, rrna16Sstats, rrna23Sstats, rrna5Sstats};
734 
735 	/*--------------------------------------------------------------*/
736 
737 //	public ArrayList<String> fnames=new ArrayList<String>();
738 	public int numFiles=0;
739 	public IntList taxIds=new IntList();
740 
741 
742 	/*--------------------------------------------------------------*/
743 	/*----------------            Fields            ----------------*/
744 	/*--------------------------------------------------------------*/
745 
746 	private long maxReads=-1;
747 
748 	long readsProcessed=0;
749 	long basesProcessed=0;
750 	long genesProcessed=0;
751 	long filesProcessed=0;
752 	long[] baseCounts=new long[5];
753 
754 	/*--------------------------------------------------------------*/
755 	/*----------------        Static Setters        ----------------*/
756 	/*--------------------------------------------------------------*/
757 
setStatics()758 	public synchronized void setStatics(){
759 //		assert(!setStatics);
760 		kInnerCDS=statsCDS.inner.k;
761 		kStartCDS=statsCDS.start.k;
762 		kStopCDS=statsCDS.stop.k;
763 
764 		setStartLeftOffset(statsCDS.start.leftOffset);
765 		setStartRightOffset(statsCDS.start.rightOffset());
766 
767 		setStopLeftOffset(statsCDS.stop.leftOffset);
768 		setStopRightOffset(statsCDS.stop.rightOffset());
769 
770 		kInnerRNA=stats16S.inner.k;//TODO: Why is 16S used here?
771 		kStartRNA=stats16S.start.k;
772 		kStopRNA=stats16S.stop.k;
773 		setStatics=true;
774 	}
775 
setInnerK(int k)776 	public static void setInnerK(int k){
777 		kInnerCDS=k;
778 	}
779 
setStartK(int k)780 	public static void setStartK(int k){
781 		kStartCDS=k;
782 	}
783 
setStopK(int k)784 	public static void setStopK(int k){
785 		kStopCDS=k;
786 	}
787 
setStartLeftOffset(int x)788 	public static void setStartLeftOffset(int x){
789 		startLeftOffset=x;
790 		startFrames=startLeftOffset+startRightOffset+1;
791 //		System.err.println("startLeftOffset="+startLeftOffset+", startRightOffset="+startRightOffset+", frames="+startFrames);
792 	}
793 
setStartRightOffset(int x)794 	public static void setStartRightOffset(int x){
795 		startRightOffset=x;
796 		startFrames=startLeftOffset+startRightOffset+1;
797 //		System.err.println("startLeftOffset="+startLeftOffset+", startRightOffset="+startRightOffset+", frames="+startFrames);
798 //		assert(false) : endLeftOffset+", "+endRightOffset+", "+endFrames;
799 	}
800 
setStopLeftOffset(int x)801 	public static void setStopLeftOffset(int x){
802 		stopLeftOffset=x;
803 		stopFrames=stopLeftOffset+stopRightOffset+1;
804 //		System.err.println("stopLeftOffset="+stopLeftOffset+", stopRightOffset="+stopRightOffset+", frames="+stopFrames);
805 	}
806 
setStopRightOffset(int x)807 	public static void setStopRightOffset(int x){
808 		stopRightOffset=x;
809 		stopFrames=stopLeftOffset+stopRightOffset+1;
810 //		System.err.println("stopLeftOffset="+stopLeftOffset+", stopRightOffset="+stopRightOffset+", frames="+stopFrames);
811 //		assert(false) : endLeftOffset+", "+endRightOffset+", "+endFrames;
812 	}
813 
isStartCodon(int code)814 	public static final boolean isStartCodon(int code){
815 		return code>=0 && code<=63 && isStartCodon[code];
816 	}
isStopCodon(int code)817 	public static final boolean isStopCodon(int code){
818 		return code>=0 && code<=63 && isStopCodon[code];
819 	}
820 
821 	/*--------------------------------------------------------------*/
822 	/*----------------          Class Init          ----------------*/
823 	/*--------------------------------------------------------------*/
824 
makeIsCodon(String[] codons)825 	private static boolean[] makeIsCodon(String[] codons){
826 		boolean[] array=new boolean[64];
827 		for(String s : codons){
828 			int x=AminoAcid.toNumber(s);
829 			array[x]=true;
830 		}
831 		return array;
832 	}
833 
834 	public static int kInnerCDS=6;
835 	public static int kStartCDS=3;
836 	public static int kStopCDS=3;
837 
startLeftOffset()838 	static int startLeftOffset(){return startLeftOffset;}
startRightOffset()839 	static int startRightOffset(){return startRightOffset;}
startFrames()840 	static int startFrames(){return startFrames;}
841 
842 	private static int startLeftOffset=21; //21 works well for k=4
843 	private static int startRightOffset=8; //10 works well for k=4
844 	private static int startFrames=startLeftOffset+startRightOffset+1;
845 
846 	private static int stopLeftOffset=9;
847 	private static int stopRightOffset=12;
848 	private static int stopFrames=stopLeftOffset+stopRightOffset+1;
849 
850 	private static boolean setStatics=false;
851 
852 	/*--------------------------------------------------------------*/
853 	/*----------------         More Statics         ----------------*/
854 	/*--------------------------------------------------------------*/
855 
856 	//E. coli uses 83% AUG (3542/4284), 14% (612) GUG, 3% (103) UUG[7] and one or two others (e.g., an AUU and possibly a CUG).[8][9]
857 	public static String[] startCodons=new String[] {"ATG", "GTG", "TTG"};
858 	public static String[] extendedStartCodons=new String[] {"ATG", "GTG", "TTG", "ATT", "CTG", "ATA"};
859 	public static String[] stopCodons=new String[] {"TAG", "TAA", "TGA"};
860 	public static boolean[] isStartCodon=makeIsCodon(startCodons);
861 	public static boolean[] isStopCodon=makeIsCodon(stopCodons);
862 
863 	/*--------------------------------------------------------------*/
864 
865 	private static PrintStream outstream=System.err;
866 	public static boolean verbose=false;
867 	public static boolean errorState=false;
868 
869 }
870