1 package prok; 2 3 import java.io.PrintStream; 4 import java.util.ArrayList; 5 import java.util.BitSet; 6 import java.util.HashMap; 7 8 import aligner.SingleStateAlignerFlat2; 9 import dna.AminoAcid; 10 import fileIO.FileFormat; 11 import gff.GffLine; 12 import shared.KillSwitch; 13 import shared.Shared; 14 import shared.Tools; 15 import stream.Read; 16 import stream.ReadInputStream; 17 import structures.ByteBuilder; 18 import structures.IntList; 19 20 /** 21 * This class is designed to store kmer frequencies related to gene 22 * starts, stops, and interiors. It can be loaded from a pgm file. 23 * 24 * It's possible to use multiple GeneModels; for example, one for 25 * each of several GC ranges or clades. 26 * @author Brian Bushnell 27 * @date Sep 24, 2018 28 * 29 */ 30 public class GeneModel extends ProkObject { 31 32 /*--------------------------------------------------------------*/ 33 /*---------------- Initialization ----------------*/ 34 /*--------------------------------------------------------------*/ 35 GeneModel(boolean fill)36 public GeneModel(boolean fill){ 37 if(fill){ 38 fillContainers(); 39 } 40 } 41 fillContainers()42 void fillContainers(){ 43 statsCDS.setInner(kInnerCDS, 3); 44 statsCDS.setStart(kStartCDS, startFrames, startLeftOffset); 45 statsCDS.setStop(kStopCDS, stopFrames, stopLeftOffset); 46 47 for(int i=0; i<rnaContainers.length; i++){ 48 StatsContainer sc=rnaContainers[i]; 49 sc.setInner(kInnerRNA, 1); 50 } 51 52 statstRNA.setStart(kStartRNA, 14, 4); 53 statstRNA.setStop(kStopRNA, 14, 6); 54 55 stats16S.setStart(kStartRNA, 20, 7); 56 stats16S.setStop(kStopRNA, 12, 16); 57 58 stats23S.setStart(kStartRNA, 17, 3); 59 stats23S.setStop(kStopRNA, 15, 12); 60 61 stats5S.setStart(kStartRNA, 20, 5); 62 stats5S.setStop(kStopRNA, 15, 5); 63 64 stats18S.setStart(kStartRNA, 20, 7);//TODO: 18S bounds are untested and should be empirically determined 65 stats18S.setStop(kStopRNA, 12, 16); 66 } 67 68 /*--------------------------------------------------------------*/ 69 /*---------------- Public Methods ----------------*/ 70 /*--------------------------------------------------------------*/ 71 process(String genomeFname, String gffFname)72 public boolean process(String genomeFname, String gffFname){ 73 // fnames.add(ReadWrite.stripPath(genomeFname)); 74 numFiles++; 75 FileFormat fnaFile=FileFormat.testInput(genomeFname, FileFormat.FA, null, true, true); 76 FileFormat gffFile=FileFormat.testInput(gffFname, FileFormat.GFF, null, true, true); 77 78 if(fnaFile==null || gffFile==null){ 79 errorState=true; 80 return true; 81 } 82 filesProcessed++; 83 84 ArrayList<ScafData> scafList; 85 {//Scoped to save memory 86 ArrayList<Read> reads=ReadInputStream.toReads(fnaFile, maxReads); 87 readsProcessed+=reads.size(); 88 scafList=new ArrayList<ScafData>(reads.size()); 89 for(Read r : reads){ 90 basesProcessed+=r.length(); 91 scafList.add(new ScafData(r)); 92 } 93 } 94 {//Scoped to save memory 95 ArrayList<GffLine>[] allGffLines=GffLine.loadGffFileByType(gffFile, "CDS,rRNA,tRNA", true); 96 ArrayList<GffLine> cds=allGffLines[0]; 97 ArrayList<GffLine> rrna=allGffLines[1]; 98 ArrayList<GffLine> trna=allGffLines[2]; 99 genesProcessed+=cds.size(); 100 genesProcessed+=(rrna==null ? 0 : rrna.size()); 101 genesProcessed+=(trna==null ? 0 : trna.size()); 102 103 HashMap<String, ScafData> scafMap=makeScafMap(scafList); 104 fillScafDataCDS(cds, scafMap); 105 fillScafDataRNA(rrna, scafMap); 106 fillScafDataRNA(trna, scafMap); 107 } 108 109 countBases(scafList); 110 if(PROCESS_PLUS_STRAND){ 111 processStrand(scafList, Shared.PLUS); 112 } 113 if(PROCESS_MINUS_STRAND){ 114 for(ScafData sd : scafList){ 115 sd.clear(); 116 sd.reverseComplement(); 117 } 118 processStrand(scafList, Shared.MINUS); 119 for(ScafData sd : scafList){ 120 sd.clear(); 121 sd.reverseComplement(); 122 } 123 } 124 return false; 125 } 126 add(GeneModel pgm)127 public void add(GeneModel pgm){ 128 for(int i=0; i<allContainers.length; i++){ 129 // System.err.println("merging "+allContainers[i].name); 130 allContainers[i].add(pgm.allContainers[i]); 131 } 132 133 readsProcessed+=pgm.readsProcessed; 134 basesProcessed+=pgm.basesProcessed; 135 genesProcessed+=pgm.genesProcessed; 136 filesProcessed+=pgm.filesProcessed; 137 138 // geneStartsProcessed+=pgm.geneStartsProcessed; 139 // tRNAProcessed+=pgm.tRNAProcessed; 140 // r16SProcessed+=pgm.r16SProcessed; 141 // r23SProcessed+=pgm.r23SProcessed; 142 // r5SProcessed+=pgm.r5SProcessed; 143 // r18SProcessed+=pgm.r18SProcessed; 144 145 // fnames.addAll(pgm.fnames); 146 numFiles+=pgm.numFiles; 147 taxIds.addAll(pgm.taxIds); 148 Tools.add(baseCounts, pgm.baseCounts); 149 } 150 multiplyBy(double mult)151 public void multiplyBy(double mult) { 152 for(int i=0; i<allContainers.length; i++){ 153 allContainers[i].multiplyBy(mult); 154 } 155 156 readsProcessed=Math.round(readsProcessed*mult); 157 basesProcessed=Math.round(basesProcessed*mult); 158 genesProcessed=Math.round(genesProcessed*mult); 159 filesProcessed=Math.round(filesProcessed*mult); 160 161 for(int i=0; i<baseCounts.length; i++){ 162 baseCounts[i]=Math.round(baseCounts[i]*mult); 163 } 164 } 165 166 /*--------------------------------------------------------------*/ 167 /*---------------- Outer Methods ----------------*/ 168 /*--------------------------------------------------------------*/ 169 gc()170 public float gc(){ 171 long a=baseCounts[0]; 172 long c=baseCounts[1]; 173 long g=baseCounts[2]; 174 long t=baseCounts[3]; 175 return (float)((g+c)/Tools.max(1.0, a+t+g+c)); 176 } 177 makeScafMap(ArrayList<ScafData> scafList)178 HashMap<String, ScafData> makeScafMap(ArrayList<ScafData> scafList){ 179 HashMap<String, ScafData> scafMap=new HashMap<String, ScafData>(scafList.size()*3); 180 for(ScafData sd : scafList){scafMap.put(sd.name, sd);} 181 for(ScafData sd : scafList){ 182 String name=sd.name; 183 int idx=name.indexOf(' '); 184 if(idx>=0){ 185 String prefix=name.substring(0, idx); 186 if(scafMap.containsKey(prefix)){ 187 assert(false) : "Duplicate degenerate name: '"+name+"', '"+prefix+"'"; 188 }else{ 189 scafMap.put(prefix, sd); 190 } 191 } 192 } 193 return scafMap; 194 } 195 fillScafDataCDS(ArrayList<GffLine> cdsLines, HashMap<String, ScafData> scafMap)196 public void fillScafDataCDS(ArrayList<GffLine> cdsLines, HashMap<String, ScafData> scafMap){ 197 if(!callCDS){return;} 198 for(GffLine gline : cdsLines){ 199 ScafData sd=scafMap.get(gline.seqid); 200 assert(sd!=null) : "Can't find scaffold for GffLine "+gline.seqid; 201 sd.addCDS(gline); 202 } 203 } 204 fillScafDataRNA(ArrayList<GffLine> rnaLines, HashMap<String, ScafData> scafMap)205 public void fillScafDataRNA(ArrayList<GffLine> rnaLines, HashMap<String, ScafData> scafMap){ 206 for(GffLine gline : rnaLines){ 207 ScafData sd=scafMap.get(gline.seqid); 208 assert(sd!=null) : "Can't find scaffold for GffLine "+gline.seqid; 209 if(processType(gline.prokType())){ 210 sd.addRNA(gline); 211 } 212 } 213 } 214 processStrand(ArrayList<ScafData> scafList, int strand)215 public void processStrand(ArrayList<ScafData> scafList, int strand){ 216 for(ScafData sd : scafList){ 217 processCDS(sd, strand); 218 processRNA(sd, strand); 219 } 220 } 221 222 /*--------------------------------------------------------------*/ 223 /*---------------- Inner Methods ----------------*/ 224 /*--------------------------------------------------------------*/ 225 countBases(ArrayList<ScafData> scafList)226 private void countBases(ArrayList<ScafData> scafList){ 227 for(ScafData sd : scafList){ 228 countBases(sd.bases); 229 } 230 } 231 countBases(byte[] bases)232 private void countBases(byte[] bases){ 233 for(byte b : bases){ 234 int x=AminoAcid.baseToNumberACGTother[b]; 235 baseCounts[x]++; 236 } 237 } 238 239 /*--------------------------------------------------------------*/ 240 /*---------------- Finding Codons ----------------*/ 241 /*--------------------------------------------------------------*/ 242 findStopCodons(byte[] bases, IntList list, BitSet valid)243 private static void findStopCodons(byte[] bases, IntList list, BitSet valid){ 244 final int k=3; 245 final int mask=~((-1)<<(2*k)); 246 int kmer=0; 247 int len=0; 248 249 for(int i=0; i<bases.length; i++){ 250 byte b=bases[i]; 251 int x=AminoAcid.baseToNumber[b]; 252 kmer=((kmer<<2)|x)&mask; 253 if(x>=0){ 254 len++; 255 if(len>=k){ 256 int point=i;//End of the stop codon 257 if(isStopCodon(kmer) && !valid.get(point)){ 258 list.add(point); 259 valid.set(point); 260 } 261 } 262 }else{len=0;} 263 } 264 265 for(int i=50; i<bases.length-3; i+=2000){//Add some non-canonical sites, aka noise 266 if(!valid.get(i)){ 267 list.add(i); 268 } 269 } 270 } 271 findStartCodons(byte[] bases, IntList list, BitSet valid)272 private static void findStartCodons(byte[] bases, IntList list, BitSet valid){ 273 final int k=3; 274 final int mask=~((-1)<<(2*k)); 275 int kmer=0; 276 int len=0; 277 278 for(int i=0; i<bases.length; i++){ 279 byte b=bases[i]; 280 int x=AminoAcid.baseToNumber[b]; 281 kmer=((kmer<<2)|x)&mask; 282 if(x>=0){ 283 len++; 284 if(len>=k){ 285 int point=i-k+1;//Start of the start codon 286 if(isStartCodon(kmer) && !valid.get(point)){ 287 list.add(point); 288 valid.set(point); 289 } 290 } 291 }else{len=0;} 292 } 293 294 for(int i=50; i<bases.length-3; i+=2000){//Add some non-canonical sites, aka noise 295 if(!valid.get(i)){ 296 list.add(i); 297 } 298 } 299 } 300 301 /*--------------------------------------------------------------*/ 302 /*---------------- Processing GffLines ----------------*/ 303 /*--------------------------------------------------------------*/ 304 processGene(GffLine gline, ScafData sd)305 private static void processGene(GffLine gline, ScafData sd){ 306 if(gline.length()<2){return;} 307 final int strand=gline.strand; 308 assert(strand==sd.strand()); 309 final byte[] frames=sd.frames; 310 int start=gline.start-1, stop=gline.stop-1; 311 if(start<0 || stop>=sd.length()){return;} 312 // assert(start<stop) : gline; //Not always true for euks... 313 if(strand==Shared.MINUS){ 314 int x=sd.length()-start-1; 315 int y=sd.length()-stop-1; 316 start=y; 317 stop=x; 318 319 // String a=new String(sd.bases, start, 3); 320 // String b=new String(sd.bases, stop-2, 3); 321 //// assert(false) : start+", "+stop+"\n"+gline+"\n"+new String(sd.bases, start, 3)+", "+new String(sd.bases, stop-2, 3); 322 // outstream.println(a+", "+b+", "+start+", "+stop); 323 } 324 assert(start>=0) : gline.toString()+"\n"+sd.length()+"\n"+sd.name; 325 markFrames(start, stop, frames, kInnerCDS); 326 sd.starts.add(start); 327 sd.stops.add(stop); 328 // assert(gline.start!=337) : gline+"\n"+start+", "+stop; 329 } 330 processRnaLine(final GffLine gline, final ScafData sd, final int type)331 private boolean processRnaLine(final GffLine gline, final ScafData sd, final int type){ 332 final int strand=gline.strand; 333 assert(strand==sd.strand()); 334 final byte[] frames=sd.frames; 335 int start=gline.start-1, stop=gline.stop-1; 336 if(start<0 || stop>=sd.length()){return false;} 337 assert(start<=stop); 338 if(strand==Shared.MINUS){ 339 int x=sd.length()-start-1; 340 int y=sd.length()-stop-1; 341 start=y; 342 stop=x; 343 } 344 345 if(AnalyzeGenes.alignRibo){ 346 // byte[] seq=sd.fetch(start, stop); 347 Read[] consensusReads=ProkObject.consensusReads(type); 348 byte[] universal=(consensusReads!=null && consensusReads.length>0 ? consensusReads[0].bases : null); 349 float minIdentity=ProkObject.minID(type); 350 if(universal!=null){ 351 int[] coords=KillSwitch.allocInt1D(2); 352 final int a=Tools.max(0, start-(AnalyzeGenes.adjustEndpoints ? 200 : 50)); 353 final int b=Tools.min(sd.bases.length-1, stop+(AnalyzeGenes.adjustEndpoints ? 200 : 50)); 354 float id1=align(universal, sd.bases, a, b, minIdentity, coords); 355 final int rstart=coords[0], rstop=coords[1]; 356 // assert(false) : rstart+", "+rstop+", "+(rstop-rstart+1)+", "+start+", "+stop; 357 if(id1<minIdentity){ 358 // System.err.println("Low identity: "+String.format("%.2s", 100*id1)); 359 return false; 360 }else{ 361 // System.err.println("Good identity: "+String.format("%.2s", 100*id1)); 362 } 363 if(AnalyzeGenes.adjustEndpoints){ 364 int startSlop=startSlop(type); 365 int stopSlop=stopSlop(type); 366 if(Tools.absdif(start, rstart)>startSlop){ 367 // System.err.println("rstart:\t"+start+" -> "+rstart); 368 start=rstart; 369 } 370 if(Tools.absdif(stop, rstop)>stopSlop){ 371 // System.err.println("rstop: \t"+stop+" -> "+rstop); 372 stop=rstop; 373 } 374 } 375 } 376 } 377 378 StatsContainer sc=allContainers[type]; 379 sc.start.processPoint(sd.bases, start, 1); 380 sc.stop.processPoint(sd.bases, stop, 1); 381 assert(sc!=statsCDS); 382 383 assert(start>=0) : gline.toString()+"\n"+sd.length()+"\n"+sd.name; 384 final byte flag=typeToFlag(type); 385 for(int i=start+kInnerRNA-1; i<=stop; i++){ 386 frames[i]|=flag; 387 } 388 return true; 389 } 390 align(byte[] query, byte[] ref, int start, int stop, float minIdentity, int[] coords)391 private float align(byte[] query, byte[] ref, int start, int stop, float minIdentity, int[] coords){ 392 // final int a=0, b=ref.length-1; 393 SingleStateAlignerFlat2 ssa=GeneCaller.getSSA(); 394 final int minScore=ssa.minScoreByIdentity(query.length, minIdentity); 395 int[] max=ssa.fillUnlimited(query, ref, start, stop, minScore); 396 if(max==null){return 0;} 397 398 final int rows=max[0]; 399 final int maxCol=max[1]; 400 final int maxState=max[2]; 401 // final int maxScore=max[3]; 402 // final int maxStart=max[4]; 403 404 //returns {score, bestRefStart, bestRefStop} 405 //padded: {score, bestRefStart, bestRefStop, padLeft, padRight}; 406 final int[] score=ssa.score(query, ref, start, stop, rows, maxCol, maxState); 407 final int rstart=Tools.max(score[1], 0); 408 final int rstop=Tools.min(score[2], ref.length-1); 409 if(coords!=null){ 410 coords[0]=rstart; 411 coords[1]=rstop; 412 } 413 final float id=ssa.tracebackIdentity(query, ref, start, stop, rows, maxCol, maxState, null); 414 return id; 415 } 416 417 /** 418 * Each frame byte has a bit marked for valid coding frames. 419 * For example, if frames[23]=0b100, then base 23 is the last base in a kmer starting at the 3rd base in a codon. 420 * If frames[23]=0, then no coding kmer end at that location on this strand. 421 * @param start 422 * @param stop 423 * @param frames 424 * @param k 425 */ markFrames(int start, int stop, byte[] frames, int k)426 private static void markFrames(int start, int stop, byte[] frames, int k){ 427 assert(start<=stop) : start+", "+stop; 428 for(int i=start+k-1, frameBit=(1<<((k-1)%3)), max=Tools.min(stop-3, frames.length-1); i<=max; i++){ 429 frames[i]=(byte)(frames[i]|frameBit); 430 frameBit<<=1; 431 if(frameBit>4){frameBit=1;} 432 } 433 // assert(false) : Arrays.toString(Arrays.copyOfRange(frames, start, start+20))+"\n"+start; //This is correct 434 } 435 436 /*--------------------------------------------------------------*/ 437 /*---------------- Counting Kmers ----------------*/ 438 /*--------------------------------------------------------------*/ 439 processCDS(ScafData sd, int strand)440 private void processCDS(ScafData sd, int strand){ 441 if(!callCDS){return;} 442 ArrayList<GffLine> glines=sd.cdsLines[strand]; 443 for(GffLine gline : glines){ 444 assert(gline.strand==strand); 445 processGene(gline, sd); 446 statsCDS.addLength(gline.length()); 447 } 448 449 statsCDS.inner.processCDSFrames(sd.bases, sd.frames); 450 BitSet startSet=processEnds(sd.bases, statsCDS.start, sd.starts, 1); 451 BitSet stopSet=processEnds(sd.bases, statsCDS.stop, sd.stops, 1); 452 // outstream.println("Processed "+sd.starts.size+" valid starts and "+sd.stops.size+" stops."); 453 sd.clear(); 454 findStartCodons(sd.bases, sd.starts, startSet); 455 findStopCodons(sd.bases, sd.stops, stopSet); 456 // outstream.println("Found "+sd.starts.size+" invalid starts and "+sd.stops.size+" stops."); 457 processEnds(sd.bases, statsCDS.start, sd.starts, 0); 458 processEnds(sd.bases, statsCDS.stop, sd.stops, 0); 459 } 460 getGlineType(GffLine gline, ScafData sd)461 private static int getGlineType(GffLine gline, ScafData sd){ 462 if(!gline.inbounds(sd.length()) || gline.partial()){return -1;} 463 464 final int length=gline.length(); 465 final int type=gline.prokType(); 466 if(type<0){ 467 return type; 468 }else if(type==CDS){ 469 return type; 470 }else if(type==tRNA && length>=40 && length<=120){ 471 return type; 472 }else if(type==r16S && length>=1440 && length<=1620){ 473 return type; 474 }else if(type==r23S && length>=2720 && length<=3170){ 475 return type; 476 }else if(type==r5S && length>=90 && length<=150){ 477 return type; 478 }else if(type==r18S && length>=1400 && length<=2000){ //TODO: Check length range 479 return type; 480 } 481 return -1; 482 } 483 processRNA(ScafData sd, int strand)484 private void processRNA(ScafData sd, int strand){ 485 sd.clear(); 486 ArrayList<GffLine> lines=sd.rnaLines[strand]; 487 for(GffLine gline : lines){ 488 assert(gline.strand==strand); 489 final int type=getGlineType(gline, sd); 490 if(type>0){ 491 StatsContainer sc=allContainers[type]; 492 sc.addLength(gline.length()); 493 processRnaLine(gline, sd, type); 494 } 495 } 496 processRnaInner(sd); 497 processRnaEnds(sd); 498 } 499 processRnaInner(ScafData sd)500 void processRnaInner(ScafData sd){ 501 byte[] bases=sd.bases; 502 byte[] frames=sd.frames; 503 final int k=kInnerRNA;//TODO: Note! This is linked to a single static variable for all RNAs. 504 final int mask=~((-1)<<(2*k)); 505 int kmer=0; 506 int len=0; 507 508 for(int i=0; i<bases.length; i++){ 509 byte b=bases[i]; 510 int x=AminoAcid.baseToNumber[b]; 511 kmer=((kmer<<2)|x)&mask; 512 if(x>=0){ 513 len++; 514 if(len>=k){ 515 int vf=frames[i]; 516 for(int type=0; type<5; type++){ 517 int valid=vf&1; 518 rnaContainers[type].inner.add(kmer, 0, valid); 519 vf=(vf>>1); 520 } 521 } 522 }else{len=0;} 523 } 524 } 525 processRnaEnds(ScafData sd)526 void processRnaEnds(ScafData sd){ 527 byte[] bases=sd.bases; 528 529 final int k=stats16S.start.k; 530 final int kMax=stats16S.start.kMax; 531 final int mask=stats16S.start.mask; 532 final long[] counts=new long[kMax];//TODO: Slow 533 534 int kmer=0; 535 int len=0; 536 537 for(int i=0; i<bases.length; i++){ 538 byte b=bases[i]; 539 int x=AminoAcid.baseToNumber[b]; 540 kmer=((kmer<<2)|x)&mask; 541 542 if(x>=0){ 543 len++; 544 if(len>=k){ 545 counts[kmer]++; 546 } 547 }else{len=0;} 548 } 549 for(StatsContainer sc : rnaContainers){ 550 FrameStats fs=sc.start; 551 for(long[] array : fs.countsFalse){ 552 Tools.add(array, counts); 553 } 554 fs=sc.stop; 555 for(long[] array : fs.countsFalse){ 556 Tools.add(array, counts); 557 } 558 } 559 } 560 processEnds(byte[] bases, FrameStats stats, IntList list, int valid)561 private static BitSet processEnds(byte[] bases, FrameStats stats, IntList list, int valid){ 562 BitSet points=new BitSet(bases.length); 563 for(int i=0; i<list.size; i++){ 564 int point=list.get(i); 565 stats.processPoint(bases, list.get(i), valid); 566 points.set(point); 567 } 568 return points; 569 } 570 571 /*--------------------------------------------------------------*/ 572 /*---------------- Scoring ----------------*/ 573 /*--------------------------------------------------------------*/ 574 575 // //Assumes bases are in the correct strand 576 // public float calcStartScore(int start, int stop, byte[] bases){ 577 // float f=scorePoint(start, bases, startStats); 578 //// float ss=scoreStart2(start, bases, stop, innerKmerStats); 579 //// if(ss>0){f=(f+0.0005f*ss);} //Does not seem to help; needs more study. 580 // return f; 581 // } 582 // 583 // //Assumes bases are in the correct strand 584 // public float calcStopScore(int stop, byte[] bases){ 585 // float f=scorePoint(stop, bases, stopStats); 586 // return f; 587 // } 588 // 589 // //Assumes bases are in the correct strand 590 // public float calcRnaStartScore(int start, int stop, byte[] bases){ 591 // float f=scorePoint(start, bases, rrnaStartStats); 592 // return f; 593 // } 594 // 595 // //Assumes bases are in the correct strand 596 // public float calcRnaStopScore(int stop, byte[] bases){ 597 // float f=scorePoint(stop, bases, rrnaStopStats); 598 // return f; 599 // } 600 601 // public static float calcKmerScore(int start, int stop, int startFrame, byte[] bases, FrameStats stats){ 602 // 603 // assert(stats.frames==3); 604 // final int k=stats.k; 605 // final int mask=~((-1)<<(2*k)); 606 // 607 // int kmer=0; 608 // int len=0; 609 // float score=0; 610 // int numKmers=0; 611 // 612 // for(int pos=start, currentFrame=startFrame; pos<stop; pos++){ 613 // final byte b=bases[pos]; 614 // final int x=AminoAcid.baseToNumber[b]; 615 // 616 // if(x>=0){ 617 // kmer=((kmer<<2)|x)&mask; 618 // len++; 619 // if(len>=k){ 620 // float prob=stats.probs[currentFrame][kmer]; 621 // float dif=prob-0.99f;//Prob above 1 is more likely than average 622 // score+=dif; 623 // numKmers++; 624 // } 625 // }else{ 626 // len=0; 627 // kmer=0; 628 // } 629 // 630 // currentFrame++; 631 // if(currentFrame>2){currentFrame=0;} 632 // } 633 // return score/Tools.max(1f, numKmers); 634 // } 635 // 636 // /** 637 // * TODO 638 // * Evaluate the relative difference between left and right frequencies. 639 // * The purpose is to find locations where the left side looks noncoding and the right side looks coding. 640 // * Does not currently yield useful results. 641 // */ 642 // public static float scoreStart2(int point, byte[] bases, int stop, FrameStats stats){ 643 // final int k=stats.k; 644 // 645 // int start=point-45; 646 // if(start<0 || stop>bases.length){return 0.5f;} 647 // 648 // float left=calcKmerScore(start, Tools.min(point+k-2, bases.length), 0, bases, stats); 649 // float right=calcKmerScore(point, stop-3, 0, bases, stats); 650 // return right-left; //High numbers are likely to be starts; non-starts should be near 0. 651 // } 652 653 /*--------------------------------------------------------------*/ 654 /*---------------- toString ----------------*/ 655 /*--------------------------------------------------------------*/ 656 657 @Override toString()658 public String toString(){ 659 return appendTo(new ByteBuilder()).toString(); 660 } 661 appendTo(ByteBuilder bb)662 public ByteBuilder appendTo(ByteBuilder bb){ 663 664 // Collections.sort(fnames); 665 taxIds.sort(); 666 667 bb.append("#BBMap "+Shared.BBMAP_VERSION_STRING+" Prokaryotic Gene Model\n"); 668 bb.append("#files"); 669 bb.tab().append(numFiles); 670 // if(fnames.size()>5){ 671 // bb.tab().append(fnames.size()); 672 // }else{ 673 // for(String fname : fnames){ 674 // bb.tab().append(fname); 675 // } 676 // } 677 bb.nl(); 678 bb.append("#taxIDs"); 679 for(int i=0; i<taxIds.size; i++){ 680 bb.tab().append(taxIds.get(i)); 681 } 682 bb.nl(); 683 // bb.append("#k_inner\t").append(innerKmerLength).nl(); 684 // bb.append("#k_end\t").append(endKmerLength).nl(); 685 // bb.append("#start_left_offset\t").append(startLeftOffset).nl(); 686 // bb.append("#start_right_offset\t").append(startRightOffset).nl(); 687 // bb.append("#stop_left_offset\t").append(stopLeftOffset).nl(); 688 // bb.append("#stop_right_offset\t").append(stopRightOffset).nl(); 689 bb.append("#scaffolds\t").append(readsProcessed).nl(); 690 bb.append("#bases\t").append(basesProcessed).nl(); 691 bb.append("#genes\t").append(genesProcessed).nl(); 692 bb.append("#GC\t").append(gc(),2).nl(); 693 bb.append("#ACGTN"); 694 for(long x : baseCounts){ 695 bb.tab().append(x); 696 } 697 bb.nl(); 698 699 for(StatsContainer sc : allContainers){ 700 sc.appendTo(bb); 701 } 702 assert(allContainers.length>5) : allContainers.length; 703 704 return bb; 705 } 706 707 /*--------------------------------------------------------------*/ 708 /*---------------- Stats ----------------*/ 709 /*--------------------------------------------------------------*/ 710 711 public final StatsContainer statsCDS=new StatsContainer(CDS); 712 public final StatsContainer statstRNA=new StatsContainer(tRNA); 713 public final StatsContainer stats16S=new StatsContainer(r16S); 714 public final StatsContainer stats23S=new StatsContainer(r23S); 715 public final StatsContainer stats5S=new StatsContainer(r5S); 716 public final StatsContainer stats18S=new StatsContainer(r18S); 717 718 final StatsContainer[] rnaContainers=new StatsContainer[] {statstRNA, stats16S, stats23S, stats5S, stats18S}; 719 final StatsContainer[] allContainers=new StatsContainer[] {statsCDS, statstRNA, stats16S, stats23S, stats5S, stats18S}; 720 //public static int CDS=0, tRNA=1, r16S=2, r23S=3, r5S=4, r18S=5, r28S=6, RNA=7; 721 722 // public final FrameStats innerKmerStats=new FrameStats("innerKmerStats", innerKmerLength, 3, 0); 723 // public final FrameStats startStats=new FrameStats("startStats", endKmerLength, startFrames, startLeftOffset); 724 // public final FrameStats stopStats=new FrameStats("stopStats", endKmerLength, stopFrames, stopLeftOffset); 725 // 726 // public final FrameStats rrnaStartStats=new FrameStats("rrnaStart", 2, 16, 8); 727 // public final FrameStats rrnaStopStats=new FrameStats("rrnaStop", 2, 16, 8); 728 // 729 // public final FrameStats trnaStats=new FrameStats("tRNA", rnaKmerLength, 1, 0); 730 // public final FrameStats rrna16Sstats=new FrameStats("16S", rnaKmerLength, 1, 0); 731 // public final FrameStats rrna23Sstats=new FrameStats("23S", rnaKmerLength, 1, 0); 732 // public final FrameStats rrna5Sstats=new FrameStats("5S", rnaKmerLength, 1, 0); 733 // public final FrameStats[] rnaKmerStats=new FrameStats[] {trnaStats, rrna16Sstats, rrna23Sstats, rrna5Sstats}; 734 735 /*--------------------------------------------------------------*/ 736 737 // public ArrayList<String> fnames=new ArrayList<String>(); 738 public int numFiles=0; 739 public IntList taxIds=new IntList(); 740 741 742 /*--------------------------------------------------------------*/ 743 /*---------------- Fields ----------------*/ 744 /*--------------------------------------------------------------*/ 745 746 private long maxReads=-1; 747 748 long readsProcessed=0; 749 long basesProcessed=0; 750 long genesProcessed=0; 751 long filesProcessed=0; 752 long[] baseCounts=new long[5]; 753 754 /*--------------------------------------------------------------*/ 755 /*---------------- Static Setters ----------------*/ 756 /*--------------------------------------------------------------*/ 757 setStatics()758 public synchronized void setStatics(){ 759 // assert(!setStatics); 760 kInnerCDS=statsCDS.inner.k; 761 kStartCDS=statsCDS.start.k; 762 kStopCDS=statsCDS.stop.k; 763 764 setStartLeftOffset(statsCDS.start.leftOffset); 765 setStartRightOffset(statsCDS.start.rightOffset()); 766 767 setStopLeftOffset(statsCDS.stop.leftOffset); 768 setStopRightOffset(statsCDS.stop.rightOffset()); 769 770 kInnerRNA=stats16S.inner.k;//TODO: Why is 16S used here? 771 kStartRNA=stats16S.start.k; 772 kStopRNA=stats16S.stop.k; 773 setStatics=true; 774 } 775 setInnerK(int k)776 public static void setInnerK(int k){ 777 kInnerCDS=k; 778 } 779 setStartK(int k)780 public static void setStartK(int k){ 781 kStartCDS=k; 782 } 783 setStopK(int k)784 public static void setStopK(int k){ 785 kStopCDS=k; 786 } 787 setStartLeftOffset(int x)788 public static void setStartLeftOffset(int x){ 789 startLeftOffset=x; 790 startFrames=startLeftOffset+startRightOffset+1; 791 // System.err.println("startLeftOffset="+startLeftOffset+", startRightOffset="+startRightOffset+", frames="+startFrames); 792 } 793 setStartRightOffset(int x)794 public static void setStartRightOffset(int x){ 795 startRightOffset=x; 796 startFrames=startLeftOffset+startRightOffset+1; 797 // System.err.println("startLeftOffset="+startLeftOffset+", startRightOffset="+startRightOffset+", frames="+startFrames); 798 // assert(false) : endLeftOffset+", "+endRightOffset+", "+endFrames; 799 } 800 setStopLeftOffset(int x)801 public static void setStopLeftOffset(int x){ 802 stopLeftOffset=x; 803 stopFrames=stopLeftOffset+stopRightOffset+1; 804 // System.err.println("stopLeftOffset="+stopLeftOffset+", stopRightOffset="+stopRightOffset+", frames="+stopFrames); 805 } 806 setStopRightOffset(int x)807 public static void setStopRightOffset(int x){ 808 stopRightOffset=x; 809 stopFrames=stopLeftOffset+stopRightOffset+1; 810 // System.err.println("stopLeftOffset="+stopLeftOffset+", stopRightOffset="+stopRightOffset+", frames="+stopFrames); 811 // assert(false) : endLeftOffset+", "+endRightOffset+", "+endFrames; 812 } 813 isStartCodon(int code)814 public static final boolean isStartCodon(int code){ 815 return code>=0 && code<=63 && isStartCodon[code]; 816 } isStopCodon(int code)817 public static final boolean isStopCodon(int code){ 818 return code>=0 && code<=63 && isStopCodon[code]; 819 } 820 821 /*--------------------------------------------------------------*/ 822 /*---------------- Class Init ----------------*/ 823 /*--------------------------------------------------------------*/ 824 makeIsCodon(String[] codons)825 private static boolean[] makeIsCodon(String[] codons){ 826 boolean[] array=new boolean[64]; 827 for(String s : codons){ 828 int x=AminoAcid.toNumber(s); 829 array[x]=true; 830 } 831 return array; 832 } 833 834 public static int kInnerCDS=6; 835 public static int kStartCDS=3; 836 public static int kStopCDS=3; 837 startLeftOffset()838 static int startLeftOffset(){return startLeftOffset;} startRightOffset()839 static int startRightOffset(){return startRightOffset;} startFrames()840 static int startFrames(){return startFrames;} 841 842 private static int startLeftOffset=21; //21 works well for k=4 843 private static int startRightOffset=8; //10 works well for k=4 844 private static int startFrames=startLeftOffset+startRightOffset+1; 845 846 private static int stopLeftOffset=9; 847 private static int stopRightOffset=12; 848 private static int stopFrames=stopLeftOffset+stopRightOffset+1; 849 850 private static boolean setStatics=false; 851 852 /*--------------------------------------------------------------*/ 853 /*---------------- More Statics ----------------*/ 854 /*--------------------------------------------------------------*/ 855 856 //E. coli uses 83% AUG (3542/4284), 14% (612) GUG, 3% (103) UUG[7] and one or two others (e.g., an AUU and possibly a CUG).[8][9] 857 public static String[] startCodons=new String[] {"ATG", "GTG", "TTG"}; 858 public static String[] extendedStartCodons=new String[] {"ATG", "GTG", "TTG", "ATT", "CTG", "ATA"}; 859 public static String[] stopCodons=new String[] {"TAG", "TAA", "TGA"}; 860 public static boolean[] isStartCodon=makeIsCodon(startCodons); 861 public static boolean[] isStopCodon=makeIsCodon(stopCodons); 862 863 /*--------------------------------------------------------------*/ 864 865 private static PrintStream outstream=System.err; 866 public static boolean verbose=false; 867 public static boolean errorState=false; 868 869 } 870