1 package prok; 2 3 import java.io.PrintStream; 4 import java.util.ArrayList; 5 import java.util.Arrays; 6 import java.util.Collections; 7 8 import aligner.SingleStateAlignerFlat2; 9 import aligner.SingleStateAlignerFlat3; 10 import aligner.SingleStateAlignerFlatFloat; 11 import dna.AminoAcid; 12 import shared.KillSwitch; 13 import shared.Tools; 14 import stream.Read; 15 import structures.FloatList; 16 import structures.IntList; 17 import structures.LongHashSet; 18 19 20 /** 21 * This class calls genes within a single thread. 22 * @author Brian Bushnell 23 * @date Sep 24, 2018 24 * 25 */ 26 public class GeneCaller extends ProkObject { 27 28 /*--------------------------------------------------------------*/ 29 /*---------------- Init ----------------*/ 30 /*--------------------------------------------------------------*/ 31 GeneCaller(int minLen_, int maxOverlapSameStrand_, int maxOverlapOppositeStrand_, float minStartScore_, float minStopScore_, float minInnerScore_, float minOrfScore_, float minAvgScore_, GeneModel pgm_)32 GeneCaller(int minLen_, int maxOverlapSameStrand_, int maxOverlapOppositeStrand_, 33 float minStartScore_, float minStopScore_, float minInnerScore_, 34 float minOrfScore_, float minAvgScore_, GeneModel pgm_){ 35 minLen=minLen_; 36 maxOverlapSameStrand=maxOverlapSameStrand_; 37 maxOverlapOppositeStrand=maxOverlapOppositeStrand_; 38 pgm=pgm_; 39 40 minStartScore=minStartScore_; 41 minStopScore=minStopScore_; 42 minInnerScore=minInnerScore_; 43 minOrfScore=minOrfScore_; 44 minAvgScore=minAvgScore_; 45 } 46 47 /*--------------------------------------------------------------*/ 48 /*---------------- Outer Methods ----------------*/ 49 /*--------------------------------------------------------------*/ 50 callGenes(Read r)51 public ArrayList<Orf> callGenes(Read r){ 52 return callGenes(r, pgm); 53 } 54 callGenes(Read r, GeneModel pgm_)55 public ArrayList<Orf> callGenes(Read r, GeneModel pgm_){ 56 pgm=pgm_; 57 58 final String name=r.id; 59 final byte[] bases=r.bases; 60 61 //Lists of all longest orfs per frame 62 ArrayList<Orf>[] frameLists=makeOrfs(name, bases, minLen); 63 //Lists of all high-scoring orfs per frame, with potentially multiple orfs sharing stops. 64 ArrayList<Orf>[] brokenLists=breakOrfs(frameLists, bases); 65 66 ArrayList<Orf>[] rnaLists=null; 67 final int rlen=r.length(); 68 if(calltRNA || (call16S && rlen>800) || (call23S && rlen>1500) || call5S || (call18S && rlen>1000)){ 69 rnaLists=makeRnas(name, bases); 70 71 brokenLists[0].addAll(rnaLists[0]); 72 brokenLists[3].addAll(rnaLists[1]); 73 Collections.sort(brokenLists[0]); 74 Collections.sort(brokenLists[3]); 75 } 76 77 boolean printAllOrfs=false; 78 boolean printRnas=false; 79 if(printAllOrfs){ 80 ArrayList<Orf> temp=new ArrayList<Orf>(); 81 for(ArrayList<Orf> broken : brokenLists){ 82 temp.addAll(broken); 83 } 84 Collections.sort(temp); 85 return temp; 86 } 87 88 if(printRnas && rnaLists!=null){ 89 ArrayList<Orf> temp=new ArrayList<Orf>(); 90 for(ArrayList<Orf> list : rnaLists){ 91 temp.addAll(list); 92 } 93 Collections.sort(temp); 94 return temp; 95 } 96 97 stCds2.add(brokenLists); 98 99 //Find the optimal path through Orfs 100 ArrayList<Orf> path=findPath(brokenLists, bases); 101 // geneStartsOut+=path.size(); 102 103 if(callCDS){stCdsPass.add(path);} 104 if(calltRNA){sttRNA.add(path);} 105 if(call16S){st16s.add(path);} 106 if(call23S){st23s.add(path);} 107 if(call5S){st5s.add(path);} 108 if(call18S){st18s.add(path);} 109 110 return path; 111 } 112 113 /*--------------------------------------------------------------*/ 114 /*---------------- Inner Methods ----------------*/ 115 /*--------------------------------------------------------------*/ 116 117 /** 118 * Generates lists of all max-length non-overlapping Orfs per frame. 119 * There IS overlap between frames. 120 * All Orfs come out flipped to + orientation. 121 * */ makeRnas(String name, byte[] bases)122 ArrayList<Orf>[] makeRnas(String name, byte[] bases){ 123 @SuppressWarnings("unchecked") 124 ArrayList<Orf>[] array=new ArrayList[2]; 125 array[0]=new ArrayList<Orf>(); 126 array[1]=new ArrayList<Orf>(); 127 final float[] scores=new float[bases.length]; 128 final int[] kmersSeen=(lsuKmers==null && ssuKmers==null && trnaKmers==null && r5SKmers==null) ? null : new int[bases.length]; 129 for(int strand=0; strand<2; strand++){ 130 for(StatsContainer sc : pgm.rnaContainers){ 131 if(ProkObject.callType(sc.type)){ 132 ArrayList<Orf> list=makeRnasForStrand(name, bases, strand, sc, scores, (sc.kmerSet()==null ? null : kmersSeen), false, -1);//TODO: Make this loop through all RNA types 133 if(strand==1 && list!=null){ 134 for(Orf orf : list){ 135 assert(orf.strand==strand); 136 orf.flip(); 137 } 138 } 139 if(list!=null){array[strand].addAll(list);} 140 } 141 } 142 Collections.sort(array[strand]); 143 AminoAcid.reverseComplementBasesInPlace(bases); 144 } 145 return array; 146 } 147 148 /** Designed for quickly calling a single SSU */ makeRna(String name, byte[] bases, int type)149 public Orf makeRna(String name, byte[] bases, int type){ 150 final float[] scores=new float[bases.length];//TODO: Big and slow; make a FloatList? 151 StatsContainer sc=pgm.allContainers[type]; 152 final int[] kmersSeen=(sc.kmerSet()==null ? null : new int[bases.length]);//TODO: IntList? 153 154 int strand=0; 155 ArrayList<Orf> list=makeRnasForStrand(name, bases, strand, sc, scores, kmersSeen, true, -1); 156 final Orf best1=pickBest(list); 157 assert(best1==null || best1.start>=0 && best1.stop<bases.length) : bases.length+"\n"+best1; 158 if(best1!=null && best1.orfScore>-999){return best1;} 159 160 strand++; 161 AminoAcid.reverseComplementBasesInPlace(bases); 162 list=makeRnasForStrand(name, bases, strand, sc, scores, kmersSeen, true, -1); 163 AminoAcid.reverseComplementBasesInPlace(bases); 164 if(strand==1 && list!=null){ 165 for(Orf orf : list){ 166 assert(orf.strand==strand); 167 orf.flip(); 168 } 169 } 170 final Orf best2=pickBest(list); 171 assert(best2==null || best2.start>=0 && best2.stop<bases.length) : bases.length+"\n"+best2; 172 if(best2!=null && best2.orfScore>-999){return best2;} 173 return best1!=null ? best1 : best2; 174 } 175 pickBest(ArrayList<Orf> list)176 final Orf pickBest(ArrayList<Orf> list){ 177 if(list==null){return null;} 178 Orf best=null; 179 for(Orf orf : list){ 180 if(best==null || orf.orfScore>best.orfScore){ 181 best=orf; 182 } 183 } 184 return best; 185 } 186 187 /** 188 * Generates lists of all max-length non-overlapping Orfs per frame. 189 * There IS overlap between frames. 190 * All Orfs come out flipped to + orientation. 191 * */ makeOrfs(String name, byte[] bases, int minlen)192 static ArrayList<Orf>[] makeOrfs(String name, byte[] bases, int minlen){ 193 @SuppressWarnings("unchecked") 194 ArrayList<Orf>[] array=new ArrayList[6]; 195 for(int strand=0; strand<2; strand++){ 196 for(int frame=0; frame<3; frame++){ 197 ArrayList<Orf> list=makeOrfsForFrame(name, bases, frame, strand, minlen); 198 array[frame+3*strand]=list; 199 if(strand==1 && list!=null){ 200 for(Orf orf : list){ 201 assert(orf.frame==frame); 202 assert(orf.strand==strand); 203 orf.flip(); 204 } 205 } 206 } 207 AminoAcid.reverseComplementBasesInPlace(bases); 208 } 209 return array; 210 } 211 212 /** 213 * Dynamic programming phase. 214 * @param frameLists 215 * @param bases 216 * @return 217 */ findPath(ArrayList<Orf>[] frameLists, byte[] bases)218 private ArrayList<Orf> findPath(ArrayList<Orf>[] frameLists, byte[] bases){ 219 ArrayList<Orf> all=new ArrayList<Orf>(); 220 for(ArrayList<Orf> list : frameLists){all.addAll(list);} 221 if(all.isEmpty()){return all;} 222 Collections.sort(all); 223 224 for(Orf orf : all){ 225 orf.pathScorePlus=-999999; 226 orf.pathScoreMinus=-999999; 227 } 228 229 int[] lastPositionScored=KillSwitch.allocInt1D(6); 230 Arrays.fill(lastPositionScored, -1); 231 232 //Index of highest-scoring ORF in this frame, with prev on the plus strand 233 int[] bestIndexPlus=KillSwitch.allocInt1D(6); 234 //Index of highest-scoring ORF in this frame, with prev on the minus strand 235 int[] bestIndexMinus=KillSwitch.allocInt1D(6); 236 //Highest-scoring ORF in this frame, with prev on the plus strand 237 Orf[] bestOrfPlus=new Orf[6]; 238 //Highest-scoring ORF in this frame, with prev on the minus strand 239 Orf[] bestOrfMinus=new Orf[6]; 240 241 int[][] bestIndex=new int[][] {bestIndexPlus, bestIndexMinus}; 242 Orf[][] bestOrf=new Orf[][] {bestOrfPlus, bestOrfMinus}; 243 244 for(Orf orf : all){ 245 final int myListNum=3*orf.strand+orf.frame; 246 calcPathScore(orf, frameLists, lastPositionScored, bestIndex); 247 if(bestOrfPlus[myListNum]==null || orf.pathScorePlus>=bestOrfPlus[myListNum].pathScorePlus){ 248 bestOrfPlus[myListNum]=orf; 249 bestIndexPlus[myListNum]=lastPositionScored[myListNum]; 250 assert(frameLists[myListNum].get(lastPositionScored[myListNum])==orf); 251 } 252 if(bestOrfMinus[myListNum]==null || orf.pathScoreMinus>=bestOrfMinus[myListNum].pathScoreMinus){ 253 bestOrfMinus[myListNum]=orf; 254 bestIndexMinus[myListNum]=lastPositionScored[myListNum]; 255 assert(frameLists[myListNum].get(lastPositionScored[myListNum])==orf); 256 } 257 } 258 259 Orf best=bestOrf[0][0]; 260 for(Orf[] array : bestOrf){ 261 for(Orf orf : array){ 262 if(best==null || (orf!=null && orf.pathScore()>best.pathScore())){ 263 best=orf; 264 } 265 } 266 } 267 ArrayList<Orf> bestPath=new ArrayList<Orf>(); 268 for(Orf orf=best; orf!=null; orf=orf.prev()){ 269 bestPath.add(orf); 270 if(orf.type==CDS){geneStartsOut++;} 271 else if(orf.type==tRNA){tRNAOut++;} 272 else if(orf.type==r16S){r16SOut++;} 273 else if(orf.type==r23S){r23SOut++;} 274 else if(orf.type==r5S){r5SOut++;} 275 else if(orf.type==r18S){r18SOut++;} 276 } 277 Collections.sort(bestPath); 278 return bestPath; 279 } 280 281 /** 282 * Calculate the best path to this ORF. 283 * @param orf 284 * @param frameLists 285 * @param lastPositionScored 286 * @param bestIndex 287 */ calcPathScore(Orf orf, ArrayList<Orf>[] frameLists, int[] lastPositionScored, int[][] bestIndex)288 private void calcPathScore(Orf orf, ArrayList<Orf>[] frameLists, int[] lastPositionScored, int[][] bestIndex){ 289 final int myListNum=3*orf.strand+orf.frame; 290 291 // System.err.println("* "+orf); 292 // System.err.println("* "+Arrays.toString(lastPositionScored)); 293 // System.err.println(); 294 295 for(int listStrand=0; listStrand<2; listStrand++){ 296 for(int listFrame=0; listFrame<3; listFrame++){ 297 int listNum=listFrame+3*listStrand; 298 ArrayList<Orf> list=frameLists[listNum]; 299 int lastPos=lastPositionScored[listNum]; 300 int bestPos=bestIndex[listStrand][listNum]; 301 if(listStrand==0){ 302 calcPathScorePlus(orf, list, listStrand, lastPos, bestPos); 303 }else{ 304 calcPathScoreMinus(orf, list, listStrand, lastPos, bestPos); 305 } 306 } 307 } 308 309 // System.err.println(myListNum+", "+Arrays.toString(lastPositionScored)+", "+frameLists[myListNum].size()); 310 311 lastPositionScored[myListNum]++; 312 assert(frameLists[myListNum].get(lastPositionScored[myListNum])==orf) : myListNum+"\n"+orf+"\n"+frameLists[myListNum].get(lastPositionScored[myListNum])+"\n" 313 +Arrays.toString(lastPositionScored)+"\n"+frameLists[myListNum].get(lastPositionScored[myListNum]+1); 314 315 //These are sanity checks to make sure that the path did not break in the middle. 316 //Safe to disable. 317 // assert(orf.prevPlus!=null || orf.stop<100000); 318 // assert(orf.prevMinus!=null || orf.stop<100000); 319 // assert(orf.pathScore>-10) : orf.pathScore+"\n"+orf+"\n"+orf.prev+"\n"; 320 } 321 322 /** 323 * Calculate the best path to this ORF from a plus-strand previous ORF. 324 * @param orf 325 * @param list 326 * @param listStrand 327 * @param lastPos 328 * @param bestPos 329 */ calcPathScorePlus(final Orf orf, final ArrayList<Orf> list, final int listStrand, final int lastPos, final int bestPos)330 private void calcPathScorePlus(final Orf orf, final ArrayList<Orf> list, final int listStrand, final int lastPos, final int bestPos){ 331 assert(listStrand==0); 332 if(lastPos<0){ 333 if(orf.prevPlus==null){ 334 orf.pathScorePlus=orf.orfScore; 335 orf.pathLengthPlus=1; 336 } 337 return; 338 } 339 if(list.isEmpty()){return;} 340 341 // System.err.println("\nExamining \t"+orf+"\nlastPos="+lastPos+", bestPos="+bestPos+", sameFrame="+sameFrame); 342 boolean found=false; 343 final boolean sameStrand=(orf.strand==listStrand); 344 final int maxOverlap=(sameStrand ? maxOverlapSameStrand : maxOverlapOppositeStrand); 345 for(int i=lastPos, min=Tools.max(0, bestPos-lookbackPlus); i>=min || (i>0 && !found); i--){ 346 Orf prev=list.get(i); 347 assert(prev!=orf) : prev; 348 // System.err.println("Comparing to \t"+prev); 349 if(orf.isValidPrev(prev, maxOverlap)){ 350 int overlap=Tools.max(0, prev.stop-orf.start+1); 351 float orfScore=overlap==0 ? orf.orfScore : orf.calcOrfScore(overlap); 352 353 final float prevScore=prev.pathScore(); 354 final int prevLength=prev.pathLength(); 355 356 float pathScore; 357 final int pathLength; 358 if(sameStrand){ 359 pathLength=prevLength+1; 360 pathScore=prevScore+orfScore; 361 pathScore+=p0+p1*(Tools.mid(p5*(p2+pathLength), p6*(p3-pathLength), p4)); 362 }else{ 363 pathLength=1; 364 pathScore=prev.pathScore()+orfScore; 365 pathScore+=q1+Tools.mid(q2*prevLength, q3+q4*prevLength, q5); 366 } 367 368 if(overlap<1 && prevScore>0){found=true;} 369 if(pathScore>=orf.pathScorePlus){ 370 orf.pathScorePlus=pathScore; 371 orf.prevPlus=prev; 372 orf.pathLengthPlus=pathLength; 373 // System.err.println("Set as best"); 374 } 375 } 376 if(found && prev.stop<maxOverlap-2000 && orf.prevPlus!=null){ 377 System.err.println("Breaking"); 378 break; 379 } 380 } 381 } 382 383 /** 384 * Calculate the best path to this ORF from a minus-strand previous ORF. 385 * @param orf 386 * @param list 387 * @param listStrand 388 * @param lastPos 389 * @param bestPos 390 */ calcPathScoreMinus(final Orf orf, final ArrayList<Orf> list, final int listStrand, final int lastPos, final int bestPos)391 private void calcPathScoreMinus(final Orf orf, final ArrayList<Orf> list, final int listStrand, final int lastPos, final int bestPos){ 392 assert(listStrand==1); 393 if(lastPos<0){ 394 if(orf.prevMinus==null){ 395 orf.pathScoreMinus=orf.orfScore; 396 orf.pathLengthMinus=1; 397 } 398 return; 399 } 400 if(list.isEmpty()){return;} 401 402 // System.err.println("\nExamining \t"+orf+"\nlastPos="+lastPos+", bestPos="+bestPos+", sameFrame="+sameFrame); 403 boolean found=false; 404 final boolean sameStrand=(orf.strand==listStrand); 405 final int maxOverlap=(sameStrand ? maxOverlapSameStrand : maxOverlapOppositeStrand); 406 for(int i=lastPos, min=Tools.max(0, bestPos-lookbackMinus); i>=min || (i>0 && !found); i--){ 407 Orf prev=list.get(i); 408 assert(prev!=orf) : prev; 409 // System.err.println("Comparing to \t"+prev); 410 if(orf.isValidPrev(prev, maxOverlap)){ 411 int overlap=Tools.max(0, prev.stop-orf.start+1); 412 float orfScore=overlap==0 ? orf.orfScore : orf.calcOrfScore(overlap); 413 414 final float prevScore=prev.pathScore(); 415 final int prevLength=prev.pathLength(); 416 417 float pathScore; 418 final int pathLength; 419 if(sameStrand){ 420 pathLength=prevLength+1; 421 pathScore=prevScore+orfScore; 422 pathScore+=p0+p1*(Tools.mid(p5*(p2+pathLength), p6*(p3-pathLength), p4)); 423 }else{ 424 pathLength=1; 425 pathScore=prev.pathScore()+orfScore; 426 pathScore+=q1+Tools.mid(q2*prevLength, q3+q4*prevLength, q5); 427 } 428 if(overlap<1 && prevScore>0){found=true;} 429 if(pathScore>=orf.pathScoreMinus){ 430 orf.pathScoreMinus=pathScore; 431 orf.prevMinus=prev; 432 orf.pathLengthMinus=pathLength; 433 // System.err.println("Set as best"); 434 } 435 } 436 if(found && prev.stop<maxOverlap-2000 && orf.prevMinus!=null){ 437 System.err.println("Breaking"); 438 break; 439 } 440 } 441 } 442 443 /** 444 * Generates a list of maximal-length Orfs only (non-overlapping). 445 * All Orfs come out in native orientation (unflipped). 446 * */ makeOrfsForFrame(String name, byte[] bases, int startFrame, int strand, int minlen)447 static ArrayList<Orf> makeOrfsForFrame(String name, byte[] bases, int startFrame, int strand, int minlen){ 448 // assert(false) : "TODO"; 449 assert(minlen>=3); 450 if(bases==null || bases.length<minlen){return null;} 451 ArrayList<Orf> orfs=new ArrayList<Orf>(); 452 if(!ProkObject.callCDS){return orfs;} 453 // int mask=63; 454 int code=0; 455 int start=-2; 456 int frame=0; 457 int pos=startFrame; 458 459 460 for(; pos<bases.length; pos++){ 461 byte b=bases[pos]; 462 int x=AminoAcid.baseToNumber[b]; 463 // code=((code<<2)|x)&mask; 464 code=((code<<2)|x); 465 frame++; 466 if(frame==3){ 467 frame=0; 468 if(start>=0){ 469 if(GeneModel.isStopCodon(code) || code<0){//NOTE: This adds a stop codon wherever there are Ns. 470 int len=pos-start+1; 471 if(len>=minlen){ 472 Orf f=new Orf(name, start, pos, strand, startFrame, bases, true, CDS); 473 orfs.add(f); 474 } 475 start=-1; 476 } 477 }else{ 478 if(start==-2 || (start<0 && GeneModel.isStartCodon(code))){ 479 start=pos-2; 480 } 481 } 482 code=0; 483 } 484 } 485 486 //Add a stop codon at the sequence end. 487 if(start>=0){ 488 pos--; 489 while(frame!=3 && frame!=-1){ 490 pos--; 491 frame--; 492 } 493 int len=pos-start+1; 494 if(len>=minlen){ 495 assert(pos<bases.length) : start+", "+pos+", "+bases.length; 496 Orf f=new Orf(name, start, pos, strand, startFrame, bases, true, CDS); 497 orfs.add(f); 498 } 499 } 500 501 return orfs; 502 } 503 504 /** 505 * Generates a list of maximal-length RNAs (non-overlapping). 506 * All RNAs come out in native orientation (unflipped). 507 * */ 508 ArrayList<Orf> makeRnasForStrand(String name, byte[] bases, int strand, StatsContainer sc, float[] scores, int[] kmersSeen, boolean quitEarly, float bias){ 509 final int window=sc.lengthAvg; 510 if(bases==null || bases.length*2<window){return null;} 511 ArrayList<Orf> orfs=new ArrayList<Orf>(sc.type==tRNA ? 32 : 8); 512 513 final FrameStats inner=sc.inner; 514 // final FrameStats start=sc.start; 515 // final FrameStats stop=sc.stop; 516 517 final int k=inner.k; 518 final int mask=inner.mask; 519 // final float invLen=sc.invLengthAvg; 520 final int halfWindow=window/2; 521 final int maxWindow=(int)(window*1.5f); 522 final int maxWindow2=(int)(window*2.5f); 523 // final int slop=Tools.max(50, window/8); 524 int len=0; 525 int kmer=0; 526 float currentScore=0; 527 float currentScoreAbs=0; 528 bias=(bias>-1 ? bias : biases[sc.type]); 529 final float maxBias=biases[sc.type]*1.45f; 530 531 float thresh=cutoff1[sc.type]; 532 float prevScore=0; 533 int lastStart=0; 534 535 float max=0; 536 int maxPos=0; 537 538 for(int pos=0; pos<bases.length; pos++){ 539 final byte b=bases[pos]; 540 assert(b>=0 && b<128) : "Invalid base b="+((int)b)+"; pos="+pos+"\n"+new String(bases)+"\n"; 541 final int x=AminoAcid.baseToNumber[b]; 542 kmer=((kmer<<2)|x)&mask; 543 544 if(x>=0){ 545 len++; 546 if(len>=k){ 547 float prob=inner.probs[0][kmer]; 548 float dif=prob-bias;//Prob above 1 is more likely than average 549 currentScoreAbs+=prob; 550 currentScore=Tools.max(0, currentScore+dif); 551 } 552 553 if(currentScore>0){ 554 if(currentScore>max){ 555 max=currentScore; 556 maxPos=pos; 557 } 558 if(prevScore<=0){ 559 lastStart=pos; 560 } 561 }else{ 562 int rnaLen=maxPos-lastStart; 563 if(max>thresh && rnaLen>=halfWindow){ 564 if(rnaLen>maxWindow){ 565 if(bias<=maxBias){ 566 orfs=null; 567 float biasMult=(rnaLen>8*window ? 1.2f : rnaLen>4*window ? 1.1f : 1.05f); 568 return makeRnasForStrand(name, bases, strand, sc, scores, kmersSeen, quitEarly, bias*biasMult); 569 } 570 } 571 if(rnaLen<=maxWindow2){ 572 Orf orf=new Orf(name, lastStart, maxPos, strand, 0, bases, false, sc.type); 573 orfs.add(orf); 574 orf.orfScore=max; 575 if(verbose){ 576 final int start2=(strand==0 ? lastStart : bases.length-maxPos-1); 577 final int stop2=(strand==0 ? maxPos : bases.length-lastStart-1); 578 System.err.println("Made Orf "+start2+"\t"+stop2+"\t"+max); 579 } 580 } 581 } 582 max=0; 583 lastStart=pos; 584 } 585 // System.err.println("i="+i+"\tscore="+score+"\tmax="+max+"\tmaxPos="+maxPos+"\tprevScore="+prevScore+"\tlastStart="+lastStart); 586 prevScore=currentScore; 587 588 // if(pos>=223000 && pos<232000){ 589 //// System.err.println("i="+i+"\tscore="+score+"\tmax="+max+"\tmaxPos="+maxPos+"\tprevScore="+prevScore+"\tlastStart="+lastStart); 590 // System.out.println(pos+"\t"+currentScore); 591 // } 592 593 }else{ 594 len=0; 595 kmer=0; 596 } 597 scores[pos]=currentScoreAbs; 598 } 599 600 // System.err.println("size="+orfs.size()+", type="+Orf.typeStrings[sc.type]); 601 602 603 { 604 int rnaLen=maxPos-lastStart; 605 if(max>thresh && rnaLen>=halfWindow){ 606 if(rnaLen>maxWindow){ 607 if(bias<=maxBias){ 608 orfs=null; 609 float biasMult=(rnaLen>8*window ? 1.2f : rnaLen>4*window ? 1.1f : 1.05f); 610 return makeRnasForStrand(name, bases, strand, sc, scores, kmersSeen, quitEarly, bias*biasMult); 611 } 612 } 613 if(rnaLen<=maxWindow2){ 614 Orf orf=new Orf(name, lastStart, maxPos, strand, 0, bases, false, sc.type); 615 orfs.add(orf); 616 orf.orfScore=max; 617 if(verbose){ 618 final int start2=(strand==0 ? lastStart : bases.length-maxPos-1); 619 final int stop2=(strand==0 ? maxPos : bases.length-lastStart-1); 620 System.err.println(start2+"\t"+stop2+"\t"+max); 621 } 622 } 623 } 624 } 625 626 if(kmersSeen!=null && orfs.size()>0 && sc.kmerSet()!=null){ fillKmersSeen(bases, kmersSeen, sc.kmerSet(), sc.kLongLen())627 fillKmersSeen(bases, kmersSeen, sc.kmerSet(), sc.kLongLen()); 628 } 629 630 float cutoff=cutoff2[sc.type]; 631 632 for(int i=0; i<orfs.size(); i++){ 633 Orf orf=orfs.get(i); 634 // System.err.println(orf.orfScore); 635 boolean good=refineRna(orf, bases, strand, sc, scores, kmersSeen); 636 if(orf.orfScore<cutoff || !good){ 637 if(verbose){System.err.println("REJECT: "+orf.toStringFlipped());} orfs.set(i, null)638 orfs.set(i, null); 639 }else{ 640 if(verbose){System.err.println("ACCEPT: "+orf.toStringFlipped());} 641 if(quitEarly){ orfs.clear()642 orfs.clear(); 643 orfs.add(orf); 644 return orfs; 645 } 646 } 647 } Tools.condenseStrict(orfs)648 Tools.condenseStrict(orfs); 649 650 // assert(false); 651 652 // for(int pos=0; pos<bases.length; pos++){ 653 // final byte b=bases[pos]; 654 // final int x=AminoAcid.baseToNumber[b]; 655 // kmer=((kmer<<2)|x)&mask; 656 // 657 // if(x>=0){ 658 // len++; 659 // if(len>=k){ 660 // float prob=inner.probs[0][kmer]; 661 // float dif=prob-1.2f;//Prob above 1 is more likely than average 662 // currentScore=Tools.max(0, currentScore+dif); 663 // if(currentScore>0){ 664 // currentStreak++; 665 // }else{ 666 // currentStreak=0; 667 // } 668 // if(currentScore>200 && currentStreak>1500 && currentStreak<1700){ 669 // Orf orf=new Orf(name, pos-currentStreak-1, pos, strand, 0, bases, false); 670 // orfs.add(orf); 671 // orf.orfScore=currentScore; 672 // orf.startScore=start.scorePoint(orf.start, bases); 673 // orf.stopScore=stop.scorePoint(orf.stop, bases); 674 // currentStreak=0; 675 // currentScore=0; 676 // } 677 // } 678 // }else{ 679 // len=0; 680 // kmer=0; 681 // } 682 // } 683 684 return orfs; 685 } 686 fillKmersSeen(byte[] bases, int[] kmersSeen, LongHashSet set, int k)687 void fillKmersSeen(byte[] bases, int[] kmersSeen, LongHashSet set, int k){ 688 final long mask=~((-1L)<<(2*k)); 689 long kmer=0; 690 int len=0; 691 int seen=0; 692 for(int i=0; i<bases.length; i++){ 693 final byte b=bases[i]; 694 final int num=AminoAcid.baseToNumber[b]; 695 if(num>=0){ 696 len++; 697 kmer=((kmer<<2)|num)&mask; 698 if(len>=k && set.contains(kmer)){seen++;} 699 }else{ 700 len=0; 701 } 702 kmersSeen[i]=seen; 703 } 704 } 705 refineRna(Orf orf, byte[] bases, int strand, StatsContainer sc, float[] scores, int[] kmersSeen)706 boolean refineRna(Orf orf, byte[] bases, int strand, StatsContainer sc, float[] scores, int[] kmersSeen){ 707 if(orf==null){return false;} 708 if(verbose){System.err.println("REFINE: "+orf.toStringFlipped());} 709 final int window=sc.lengthAvg; 710 // final int halfWindow=window/2; 711 // final int maxWindow=(int)(window*1.5f); 712 final int slop=Tools.max(60, 10+window/8); 713 final float invWindow=sc.invLengthAvg; 714 final float oldScore=orf.orfScore; 715 IntList starts=new IntList(); 716 IntList stops=new IntList(); 717 FloatList startScores=new FloatList(); 718 FloatList stopScores=new FloatList(); 719 720 final int leftmost=Tools.max(0, orf.start-slop); 721 final int rightmost=Tools.min(bases.length-1, orf.stop+slop); 722 if(kmersSeen!=null){ 723 if(kmersSeen[leftmost]>=kmersSeen[rightmost]){ 724 // System.err.println("Bad: "+oldScore); 725 orf.orfScore=-999; 726 return false; 727 }else{ 728 // System.err.println("Good: "+oldScore); 729 } 730 } 731 if(verbose){System.err.println("REFINE2");} 732 733 {//starts 734 final int left=leftmost; 735 final int right=Tools.min(bases.length-1, orf.stop+slop-window); 736 final float thresh=cutoff3[sc.type]; 737 fillPoints(left, right, bases, sc.start, thresh, starts, startScores); 738 } 739 if(verbose){System.err.println("starts: "+starts.size);} 740 // if((orf.start+"").startsWith("146") || true){System.err.println(starts);} 741 if(verbose){System.err.println(startScores);} 742 743 {//stops 744 final int left=Tools.max(0, orf.start-slop+window); 745 final int right=rightmost; 746 final float thresh=cutoff4[sc.type]; 747 fillPoints(left, right, bases, sc.stop, thresh, stops, stopScores); 748 } 749 if(verbose){System.err.println("stops: "+stops.size);} 750 // if((orf.start+"").startsWith("146") || true){System.err.println(stops);} 751 if(verbose){System.err.println(stopScores);} 752 753 final float innerThresh=cutoff5[sc.type]; 754 755 final int minlen=Tools.max(window/2, window-slop); 756 final int maxlen=window+slop; 757 758 orf.orfScore=0; 759 int lastStop=0; 760 for(int i=0; i<starts.size; i++){ 761 final int start=starts.get(i); 762 final int startSeen=kmersSeen==null ? 0 : kmersSeen[start]; 763 final float startScore=startScores.get(i); 764 // System.err.println("start="+start); 765 for(int j=lastStop; j<stops.size; j++){ 766 final int stop=stops.get(j); 767 final int rnalen=stop-start+1; 768 // System.err.println("stop="+stop); 769 if(rnalen<minlen){ 770 lastStop=j+1; 771 }else if(rnalen>maxlen){ 772 // System.err.println("broke"); 773 break; 774 }else if(kmersSeen!=null && kmersSeen[stop]<=startSeen){//TODO: Test this 775 // //skip 776 }else{ 777 final int len=stop-start+1; 778 final float stopScore=stopScores.get(j); 779 final float innerScore=(scores[stop]-scores[start])/len; 780 // System.err.println("innerScore="+innerScore); 781 assert(rnalen<=maxlen) : "start="+start+", stop="+stop+", rnalen="+rnalen+", minlen="+minlen+", maxlen="+maxlen; 782 if(innerScore>=innerThresh){ 783 final float a=Tools.max(startScore+0.25f, 0.25f); 784 final float b=Tools.max(stopScore+0.25f, 0.25f); 785 final float c=innerScore*innerScore; 786 final float d=(window-(2.4f*Tools.absdif(len, window)))*invWindow; 787 final float score=c*d*(float)Math.sqrt(a*b); 788 if(verbose && score>=0.2f*orf.orfScore){ 789 final int start2=(strand==0 ? start : bases.length-stop-1); 790 final int stop2=(strand==0 ? stop : bases.length-start-1); 791 System.err.println(start2+"-"+stop2+", "+startScore+", "+stopScore+", "+innerScore+", "+(score*scoreMult[sc.type])+", "+oldScore); 792 } 793 if(score>orf.orfScore){ 794 orf.start=start; 795 orf.stop=stop; 796 orf.kmerScore=innerScore*len; 797 // if(verbose){ 798 // final int start2=(strand==0 ? start : bases.length-stop-1); 799 // final int stop2=(strand==0 ? stop : bases.length-start-1); 800 // System.err.println(start2+"-"+stop2+", "+startScore+", "+stopScore+", "+innerScore+", "+(score*scoreMult[sc.type])+", "+oldScore); 801 // } 802 orf.startScore=startScore; 803 orf.stopScore=stopScore; 804 orf.orfScore=score; 805 } 806 }else{ 807 assert(true); 808 } 809 } 810 } 811 } 812 orf.orfScore*=scoreMult[sc.type]; 813 814 if(starts.isEmpty() || stops.isEmpty()){ 815 if(verbose){System.err.println("No starts or stops.");} 816 orf.orfScore=Tools.min(-999, orf.orfScore-9999); 817 return false; 818 } 819 return refineByAlignment(orf, bases, strand, sc);//Returns true if no consensus is present 820 } 821 refineByAlignment(Orf orf, byte[] bases, int strand, StatsContainer sc)822 boolean refineByAlignment(Orf orf, byte[] bases, int strand, StatsContainer sc){ 823 if(verbose){System.err.println("ALIGN");} 824 Read[] consensus=ProkObject.consensusReads(sc.type); 825 if(consensus==null || consensus.length==0){return true;} 826 boolean refined=false; 827 // System.err.println("Initial: "+orf.start+", "+orf.stop); 828 for(Read r : consensus){ 829 // refined=refineByAlignment(orf, bases, strand, sc, r.bases, 15, 15, 2); 830 refined=refineByAlignment(orf, bases, strand, sc, r.bases, sc.startSlop(), sc.stopSlop(), 2); 831 if(refined || sc.type==r18S || true){break;} 832 } 833 if(refined){ 834 if(verbose){System.err.println("Aligned to: "+orf.start+", "+orf.stop);} 835 }else{ 836 if(verbose){System.err.println("Alignment failed.");} 837 orf.orfScore=Tools.min(-999, orf.orfScore-9999); 838 } 839 return refined; 840 } 841 refineByAlignment(Orf orf, byte[] bases, int strand, StatsContainer sc, byte[] consensus, final int startSlop, final int stopSlop, int recurLimit)842 boolean refineByAlignment(Orf orf, byte[] bases, int strand, StatsContainer sc, byte[] consensus, 843 final int startSlop, final int stopSlop, int recurLimit){ 844 final int start0=orf.start; 845 final int stop0=orf.stop; 846 847 assert(start0>=0 && start0<bases.length) : start0+", "+stop0; 848 assert(stop0>=0 && stop0<bases.length) : start0+", "+stop0; 849 850 final float minID=sc.minIdentity(); 851 final int padding=Tools.min(alignmentPadding, 30+sc.lengthAvg/4); 852 final int a=Tools.max(0, orf.start-padding); 853 final int b=Tools.min(bases.length-1, orf.stop+padding); 854 final int reflen=b-a+1; 855 if(reflen>10*sc.lengthAvg && reflen>20000){ 856 System.err.println("Skipped reflen "+reflen+"/"+sc.lengthAvg+" for " 857 + "seqlen="+bases.length+", orf="+orf.toString()); 858 assert(false); 859 //TODO: Possibly change return to -1, 0, 1 ("can't align") 860 //Should be a limit on window size... 861 //Also consider shrinking matrix after jumbo alignments 862 return false; 863 } 864 assert(a>=0 && b<bases.length) : a+", "+b; 865 SingleStateAlignerFlat2 ssa=getSSA(); 866 final int minScore=ssa.minScoreByIdentity(consensus.length, minID); 867 int[] max=ssa.fillUnlimited(consensus, bases, a, b, minScore); 868 if(max==null){return false;} 869 870 final int rows=max[0]; 871 final int maxCol=max[1]; 872 final int maxState=max[2]; 873 // final int maxScore=max[3]; 874 // final int maxStart=max[4]; 875 876 //returns {score, bestRefStart, bestRefStop} 877 //padded: {score, bestRefStart, bestRefStop, padLeft, padRight}; 878 final int[] score=ssa.score(consensus, bases, a, b, rows, maxCol, maxState); 879 final int rstart=Tools.max(score[1], 0); 880 final int rstop=Tools.min(score[2], bases.length-1); 881 final float id=ssa.tracebackIdentity(consensus, bases, a, b, rows, maxCol, maxState, null); 882 883 assert(rstart>=0 && rstart<bases.length) : "bases="+bases.length+ 884 ", rstart="+rstart+", rstop="+rstop+", a="+a+", b="+b+"\n"+"score="+Arrays.toString(score)+", id="+id; 885 assert(rstop>=0 && rstop<bases.length) : rstart+", "+rstop; 886 887 if(score.length>3 && recurLimit>0){ 888 final int padLeft=score[3]; 889 final int padRight=score[4]; 890 //TODO: This takes extra memory. It may be better to cap the width or ignore start0/stop0 here. 891 int rstart2=Tools.max(0, Tools.min(start0, rstart)-10); 892 int rstop2=Tools.min(bases.length-1, Tools.max(stop0, rstop)+10); 893 assert(rstart2>=0 && rstart2<bases.length) : rstart2+", "+rstop2; 894 assert(rstop2>=0 && rstop2<bases.length) : rstart2+", "+rstop2; 895 orf.start=rstart2; 896 orf.stop=rstop2; 897 if(orf.start<a || orf.stop>b){ 898 boolean ret=refineByAlignment(orf, bases, strand, sc, consensus, 0, 0, recurLimit-1); 899 if(ret){return true;} 900 } 901 orf.start=start0; 902 orf.stop=stop0; 903 // assert(false) : "Don't traceback after recur."; 904 } 905 if(verbose){ 906 System.err.println("Identity: "+id); 907 } 908 // assert(score.length==3) : "TODO: Handle padding requests."; 909 910 // System.err.println("Identity: "+String.format("%.2f", 100*id)+"; location: "+rstart+"-"+rstop); 911 if(id<minID){return false;} 912 913 914 if(Tools.absdif(rstart, start0)>startSlop){orf.start=rstart;} 915 if(Tools.absdif(rstop, stop0)>stopSlop){orf.stop=rstop;} 916 return true; 917 } 918 fillPoints(final int left, final int right, final byte[] bases, final FrameStats fs, float thresh, final IntList points, final FloatList scores)919 void fillPoints(final int left, final int right, final byte[] bases, final FrameStats fs, float thresh, final IntList points, final FloatList scores){ 920 points.clear(); 921 scores.clear(); 922 final float minThresh=thresh;//thresh*0.05f; 923 // System.err.println(left+", "+right+", "+thresh+", "+minThresh); 924 while(points.size()<8 && thresh>=minThresh){ 925 // System.err.println("Running with thresh="+thresh); 926 points.clear(); 927 scores.clear(); 928 for(int i=left; i<right; i++){ 929 float score=fs.scorePoint(i, bases); 930 // System.err.println(i+", "+score); 931 if(score>=thresh){ 932 points.add(i); 933 scores.add(score); 934 } 935 } 936 thresh=thresh*0.75f; 937 } 938 } 939 940 /** 941 * Generate all possible genes from each Orf, and return them in a new set of lists. 942 * @param frameLists 943 * @param bases 944 * @return Lists of orfs. 945 */ breakOrfs(ArrayList<Orf>[] frameLists, byte[] bases)946 private ArrayList<Orf>[] breakOrfs(ArrayList<Orf>[] frameLists, byte[] bases){ 947 948 @SuppressWarnings("unchecked") 949 ArrayList<Orf>[] brokenLists=new ArrayList[6]; 950 for(int strand=0; strand<2; strand++){ 951 for(int frame=0; frame<3; frame++){ 952 int fnum=frame+3*strand; 953 ArrayList<Orf> longest=frameLists[fnum]; //Longest Orf per stop 954 ArrayList<Orf> broken=new ArrayList<Orf>(); //All high scoring Orfs, including multiple per stop, with different starts 955 if(longest!=null) { 956 for(Orf orf : longest){ 957 assert(orf.frame==frame); 958 assert(orf.strand==strand); 959 ArrayList<Orf> temp=breakOrf(orf, bases); 960 if(temp!=null){ 961 broken.addAll(temp); 962 } 963 } 964 } 965 Collections.sort(broken); 966 brokenLists[fnum]=broken; 967 } 968 //Reverse-complement bases after processing each strand 969 AminoAcid.reverseComplementBasesInPlace(bases); 970 } 971 return brokenLists; 972 } 973 974 /** 975 * Generate an Orf for each possible start codon. 976 * Retain only the high-scoring ones. 977 * @param longest Longest open reading frame for a given stop. 978 * @param bases Bases, oriented for this Orf. 979 * @return List of Orfs. 980 */ breakOrf(Orf longest, byte[] bases)981 private ArrayList<Orf> breakOrf(Orf longest, byte[] bases){ 982 assert(longest.start<longest.stop); 983 final int flipped=longest.flipped(); 984 if(flipped==1){longest.flip();}//Now the orf is aligned to its native strand 985 986 geneStopsMade++; 987 988 final FrameStats innerStats=pgm.statsCDS.inner; 989 final FrameStats startStats=pgm.statsCDS.start; 990 final FrameStats stopStats=pgm.statsCDS.stop; 991 992 final String name=longest.scafName; 993 final int start=longest.start; 994 final int stop=longest.stop; 995 final int strand=longest.strand; 996 final int max=Tools.min(longest.stop-2, longest.stop-minLen+4); 997 998 assert(pgm!=null) : pgm; 999 assert(pgm.statsCDS!=null) : pgm; 1000 assert(pgm.statsCDS.inner!=null) : pgm.statsCDS; 1001 assert(pgm.statsCDS.inner.k>0) : pgm.statsCDS.inner; 1002 1003 final int k=innerStats.k; 1004 final int mask=~((-1)<<(2*k)); 1005 1006 final float stopScore=stopStats.scorePoint(longest.stop, bases); 1007 stCds.geneStopScoreSum+=stopScore; 1008 stCds.geneStopScoreCount++; 1009 1010 ArrayList<Orf> broken=new ArrayList<Orf>(); 1011 int created=0; 1012 1013 int codon=0; 1014 int kmer=0; 1015 int len=0; 1016 int numKmers=0; 1017 float currentScore=0; 1018 for(int pos=start, currentFrame=0; pos<=stop; pos++){ 1019 final byte b=bases[pos]; 1020 final int x=AminoAcid.baseToNumber[b]; 1021 codon=((codon<<2)|x); 1022 kmer=((kmer<<2)|x)&mask; 1023 1024 if(x>=0){ 1025 len++; 1026 if(len>=k){ 1027 float prob=innerStats.probs[currentFrame][kmer]; 1028 float dif=prob-0.99f;//Prob above 1 is more likely than average 1029 currentScore+=dif; 1030 // outstream.println("pos="+pos+"\tdif="+String.format(Locale.ROOT, "%.4f", dif)+",\tscore="+String.format(Locale.ROOT, "%.4f", currentScore)+ 1031 // "\tasStart="+String.format(Locale.ROOT, "%.4f", pgm.calcStartScore(pos-2, bases))+"\tasStop="+String.format(Locale.ROOT, "%.4f", stopStats.scorePoint(pos, bases))+ 1032 // "\tcodon="+AminoAcid.kmerToString(kmer, 3)+" frame="+(currentFrame)); 1033 }else{ 1034 // outstream.println("pos="+pos+"\tdif="+String.format(Locale.ROOT, "%.4f", 0.0)+",\tscore="+String.format(Locale.ROOT, "%.4f", 0.0)+ 1035 // "\tasStart="+String.format(Locale.ROOT, "%.4f", pgm.calcStartScore(pos-2, bases))+"\tasStop="+String.format(Locale.ROOT, "%.4f", stopStats.scorePoint(pos, bases))+ 1036 // "\tcodon="+AminoAcid.kmerToString(kmer, 3)+" frame="+(currentFrame)); 1037 } 1038 }else{ 1039 len=0; 1040 kmer=0; 1041 } 1042 1043 currentFrame++; 1044 // outstream.println("pos="+pos+", codon="+AminoAcid.kmerToString(kmer, 3)+", frame="+currentFrame+", start="+start+", isStartCodon="+pgm.isStartCodon(codon)); 1045 if(currentFrame>2){ 1046 currentFrame=0; 1047 if(pos<max && created<breakLimit && (pos==start+2 || pgm.isStartCodon(codon))){ 1048 // outstream.println(x); 1049 int glen=stop-pos+3; 1050 assert(glen>=minLen) : "glen="+glen+", minLen="+minLen+", pos="+pos+", max="+max+", start="+start; 1051 1052 int oStart=pos-2; 1053 float startScore=startStats.scorePoint(oStart, bases); 1054 1055 stCds.geneStartScoreSum+=startScore; 1056 stCds.geneStartScoreCount++; 1057 1058 stCds.lengthSum+=(stop-(pos-2)+1); 1059 stCds.lengthCount++; 1060 1061 if((startScore>=minStartScore || pos<6) /* && stopScore>=minStopScore /*|| broken.isEmpty()*/){ 1062 Orf orf=new Orf(name, pos-2, stop, strand, longest.frame, bases, false, longest.type); 1063 1064 geneStartsMade++; 1065 orf.kmerScore=currentScore; 1066 orf.startScore=startScore; 1067 orf.stopScore=stopScore; 1068 1069 assert(orf.frame==longest.frame); 1070 assert(orf.strand==strand); 1071 1072 if(strand==1){orf.flip();} 1073 broken.add(orf); 1074 created++; 1075 } 1076 } 1077 codon=0; 1078 } 1079 } 1080 1081 final int size=broken.size(); 1082 final int sizeCutoff=Tools.max(5, size/2); 1083 if(size<1){return broken;} 1084 Orf best=broken.get(0); 1085 Orf bestStart=broken.get(0); 1086 for(int i=0; i<size; i++){ 1087 Orf orf=broken.get(i); 1088 1089 //This fixes scores because they were generated together, from start to stop, to make this O(N) instead of O(N^2). 1090 orf.kmerScore=currentScore-orf.kmerScore; 1091 orf.orfScore=orf.calcOrfScore(); 1092 if(orf.orfScore>=best.orfScore){best=orf;} 1093 if(orf.startScore>=bestStart.startScore){bestStart=orf;} 1094 1095 stCds.geneInnerScoreSum+=orf.averageKmerScore(); 1096 stCds.geneInnerScoreCount++; 1097 } 1098 1099 //Sort to by score descending to eliminate low-scoring copies. 1100 if(broken.size()>1){Collections.sort(broken, Feature.featureComparatorScore);} 1101 1102 int removed=0; 1103 for(int i=0; i<size; i++){//Fix scores because they were generated together, from start to stop, to make this O(N) instead of O(N^2). 1104 Orf orf=broken.get(i); 1105 if(orf.averageKmerScore()<minInnerScore || orf.orfScore<minOrfScore || 1106 orf.orfScore/orf.length()<minAvgScore || 1107 orf.orfScore<0.5f*best.orfScore-10 || (orf.startScore<bestStart.startScore-0.55f && orf.kmerScore<best.kmerScore*1.1f && orf!=best)){ 1108 broken.set(i, null); 1109 removed++; 1110 }else if(i>sizeCutoff){ 1111 broken.set(i, null); 1112 removed++; 1113 } 1114 } 1115 if(removed>0){ 1116 Tools.condenseStrict(broken); 1117 } 1118 1119 geneStartsRetained+=broken.size(); 1120 geneStopsRetained+=(broken.size()>0 ? 1 : 0); 1121 1122 if(flipped==1){longest.flip();} 1123 return broken; 1124 } 1125 1126 /*--------------------------------------------------------------*/ 1127 /*---------------- Fields ----------------*/ 1128 /*--------------------------------------------------------------*/ 1129 1130 /** 1131 * Current gene model. 1132 * TODO: Dynamically swap this as needed for contigs with varying GC. 1133 */ 1134 GeneModel pgm; 1135 1136 //Gene-calling cutoffs 1137 final int minLen; 1138 final int maxOverlapSameStrand; 1139 final int maxOverlapOppositeStrand; 1140 final float minStartScore; 1141 final float minStopScore; 1142 final float minInnerScore; 1143 final float minOrfScore; 1144 final float minAvgScore; 1145 1146 // static float[] cutoff1=new float[] {0, 40f, 200f, 150f, 45f}; 1147 // static float[] cutoff2=new float[] {0, 44f, 300f, 150f, 60f}; 1148 // static float[] cutoff3=new float[] {0, 1.7f, 1.5f, 1.4f, 1.8f}; 1149 // static float[] cutoff4=new float[] {0, 1.6f, 1.5f, 0.6f, 1.1f}; 1150 // static float[] cutoff5=new float[] {0, 1.0f, 1.0f, 1.0f, 1.55f};//inner score 1151 // static float[] scoreMult=new float[] {1f, 8f, 200f, 175f, 12f}; 1152 // static float[] biases=new float[] {1f, 1.17f, 1.2f, 1.2f, 1.15f}; 1153 1154 // static float[] cutoff1=new float[] {0, 40f, 140f, 150f, 45f}; 1155 // static float[] cutoff2=new float[] {0, 44f, 300f, 150f, 45f}; 1156 // static float[] cutoff3=new float[] {0, 1.7f, 1.1f, 1.3f, 1.8f}; 1157 // static float[] cutoff4=new float[] {0, 1.6f, 1.3f, 0.5f, 1.1f}; 1158 // static float[] cutoff5=new float[] {0, 1.0f, 1.0f, 1.0f, 1.3f};//inner score 1159 // static float[] scoreMult=new float[] {1f, 8f, 200f, 175f, 12f}; 1160 // static float[] biases=new float[] {1f, 1.17f, 1.2f, 1.2f, 1.15f}; 1161 1162 //for k=4,2,2 1163 // static float[] cutoff1=new float[] {0, 40f, 140f, 150f, 40f}; 1164 // static float[] cutoff2=new float[] {0, 44f, 300f, 150f, 45f}; 1165 // static float[] cutoff3=new float[] {0, 1.7f, 1.1f, 1.1f, 1.8f}; 1166 // static float[] cutoff4=new float[] {0, 1.6f, 1.3f, 0.45f, 1.1f}; 1167 // static float[] cutoff5=new float[] {0, 1.0f, 1.0f, 1.0f, 1.3f};//inner score 1168 // static float[] scoreMult=new float[] {1f, 8f, 200f, 175f, 12f}; 1169 // static float[] biases=new float[] {1f, 1.17f, 1.2f, 1.2f, 1.22f}; 1170 1171 // //for k=5,3,3 1172 // static float[] cutoff1=new float[] {0, 40f, 300f, 300f, 135f};//sum score 1173 // static float[] cutoff2=new float[] {0, 44f, 300f, 400f, 100f};//orf score 1174 // static float[] cutoff3=new float[] {0, 1.7f, 1.5f, 1.5f, 3.5f};//start score 1175 // static float[] cutoff4=new float[] {0, 1.6f, 1.5f, 1.4f, 1.8f};//stop score 1176 // static float[] cutoff5=new float[] {0, 1.0f, 1.1f, 1.15f, 1.4f};//inner score 1177 // static float[] scoreMult=new float[] {1f, 8f, 80f, 120f, 5f};//score mult 1178 // static float[] biases=new float[] {1f, 1.17f, 1.2f, 1.2f, 1.22f};//bias for sum score 1179 1180 //for k=6,3,3 - this has low scores for correct 23s stops; may try k=2 or k=4 for that end 1181 //Also, 5S seems to score low very for archaea 1182 // public static int CDS=0, tRNA=1, /*r16S=2,*/ r23S=3, r5S=4, r18S=5, r28S=6, RNA=7; 1183 // static float[] cutoff1=new float[] {0, 100f, 600f, 400f, 300f};//sum score 1184 // static float[] cutoff2=new float[] {0, 48f, 300f, 300f, 32f};//orf score 1185 // static float[] cutoff3=new float[] {0, 3.0f, 1.8f, 1.6f, 4.0f};//start score 1186 // static float[] cutoff4=new float[] {0, 2.8f, 2.0f, 0.90f, 1.9f};//stop score 1187 // static float[] cutoff5=new float[] {0, 2.8f, 1.1f, 1.55f, 1.4f};//inner score 1188 // static float[] scoreMult=new float[] {1f, 1f, 35f, 80f, 1.0f};//score mult 1189 // static float[] biases=new float[] {1f, 1.25f, 1.30f, 1.30f, 1.22f};//16S bias for sum score: 1.25 best for archs, 1.4 best for bacts 1190 1191 //// {"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"}; 1192 // static float[] cutoff1=new float[] {0, 90f, 300f, 400f, 270f, 300f};//sum score 1193 // static float[] cutoff2=new float[] {0, 48f, 300f, 300f, 32f, 300f};//orf score 1194 // static float[] cutoff3=new float[] {0, 2.8f, 1.65f, 1.6f, 3.8f, 1.65f};//start score 1195 // static float[] cutoff4=new float[] {0, 2.6f, 1.70f, 0.90f, 1.8f, 1.70f};//stop score 1196 // static float[] cutoff5=new float[] {0, 2.8f, 1.70f, 1.55f, 1.4f, 1.70f};//inner score 1197 // static float[] scoreMult=new float[] {1f, 1f, 35f, 80f, 1.0f, 35f};//score mult 1198 // static float[] biases=new float[] {1f, 1.50f, 1.30f, 1.30f, 1.30f, 1.30f}; 1199 1200 //// {"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"}; 1201 // static float[] cutoff1=new float[] {0, 90f, 300f, 400f, 90f, 300f};//sum score 1202 // static float[] cutoff2=new float[] {0, 48f, 300f, 300f, 24f, 300f};//orf score 1203 // static float[] cutoff3=new float[] {0, 2.8f, 1.65f, 1.6f, 2.0f, 1.65f};//start score 1204 // static float[] cutoff4=new float[] {0, 2.6f, 1.70f, 0.90f, 1.0f, 1.70f};//stop score 1205 // static float[] cutoff5=new float[] {0, 2.8f, 1.70f, 1.55f, 2.6f, 1.70f};//inner score 1206 // static float[] scoreMult=new float[] {1f, 1f, 35f, 80f, 1.25f, 35f};//score mult 1207 // static float[] biases=new float[] {1f, 1.50f, 1.30f, 1.30f, 1.50f, 1.30f}; 1208 1209 //// {"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"}; 1210 // static float[] cutoff1=new float[] {0, 90f, 300f, 400f, 100f, 300f};//sum score 1211 // static float[] cutoff2=new float[] {0, 48f, 300f, 300f, 32f, 300f};//orf score 1212 // static float[] cutoff3=new float[] {0, 2.8f, 1.65f, 1.6f, 1.8f, 1.65f};//start score 1213 // static float[] cutoff4=new float[] {0, 2.6f, 1.70f, 0.90f, 1.0f, 1.70f};//stop score 1214 // static float[] cutoff5=new float[] {0, 2.8f, 1.70f, 1.55f, 2.6f, 1.70f};//inner score 1215 // static float[] scoreMult=new float[] {1f, 1f, 35f, 80f, 1.25f, 35f};//score mult 1216 // static float[] biases=new float[] {1f, 1.50f, 1.30f, 1.30f, 1.55f, 1.30f}; 1217 1218 // {"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"}; 1219 static float[] cutoff1=new float[] {0, 20f, 300f, 400f, 100f, 400f};//sum score 1220 static float[] cutoff2=new float[] {0, 36f, 300f, 300f, 32f, 300f};//orf score //tRNA is very sensitive here 1221 static float[] cutoff3=new float[] {0, 2.4f, 1.65f, 1.6f, 1.8f, 1.5f};//start score 1222 static float[] cutoff4=new float[] {0, 1.5f, 1.70f, 0.90f, 1.0f, 1.1f};//stop score 1223 static float[] cutoff5=new float[] {0, 2.2f, 1.70f, 1.55f, 2.6f, 1.5f};//inner score 1224 static float[] scoreMult=new float[] {1f, 1.0f, 35f, 80f, 1.25f, 35f};//score mult 1225 static float[] biases=new float[] {1f, 1.45f, 1.30f, 1.30f, 1.55f, 1.50f}; 1226 1227 long geneStopsMade=0; 1228 long geneStartsMade=0; 1229 long geneStartsRetained=0; 1230 long geneStopsRetained=0; 1231 long geneStartsOut=0; 1232 1233 long tRNAOut=0; 1234 long r16SOut=0; 1235 long r23SOut=0; 1236 long r5SOut=0; 1237 long r18SOut=0; 1238 1239 ScoreTracker stCds=new ScoreTracker(CDS); 1240 ScoreTracker stCds2=new ScoreTracker(CDS); 1241 ScoreTracker stCdsPass=new ScoreTracker(CDS); 1242 ScoreTracker sttRNA=new ScoreTracker(tRNA); 1243 ScoreTracker st16s=new ScoreTracker(r16S); 1244 ScoreTracker st23s=new ScoreTracker(r23S); 1245 ScoreTracker st5s=new ScoreTracker(r5S); 1246 ScoreTracker st18s=new ScoreTracker(r18S); 1247 1248 ScoreTracker[] trackers=new ScoreTracker[] {stCdsPass, sttRNA, st16s, st23s, st5s, st18s}; 1249 getSSA()1250 public static SingleStateAlignerFlat2 getSSA(){ 1251 SingleStateAlignerFlat2 ssa=localSSA.get(); 1252 if(ssa==null){ 1253 synchronized(localSSA){ 1254 ssa=localSSA.get(); 1255 if(ssa==null){ 1256 ssa=new SingleStateAlignerFlat2(); 1257 localSSA.set(ssa); 1258 } 1259 } 1260 } 1261 return ssa; 1262 } 1263 getSSA3()1264 public static SingleStateAlignerFlat3 getSSA3(){ 1265 SingleStateAlignerFlat3 ssa=localSSA3.get(); 1266 if(ssa==null){ 1267 synchronized(localSSA3){ 1268 ssa=localSSA3.get(); 1269 if(ssa==null){ 1270 ssa=new SingleStateAlignerFlat3(); 1271 localSSA3.set(ssa); 1272 } 1273 } 1274 } 1275 return ssa; 1276 } 1277 getSSAF()1278 public static SingleStateAlignerFlatFloat getSSAF(){ 1279 SingleStateAlignerFlatFloat ssa=localSSAF.get(); 1280 if(ssa==null){ 1281 synchronized(localSSAF){ 1282 ssa=localSSAF.get(); 1283 if(ssa==null){ 1284 ssa=new SingleStateAlignerFlatFloat(); 1285 localSSAF.set(ssa); 1286 } 1287 } 1288 } 1289 return ssa; 1290 } 1291 1292 private static ThreadLocal<SingleStateAlignerFlat2> localSSA=new ThreadLocal<SingleStateAlignerFlat2>(); 1293 private static ThreadLocal<SingleStateAlignerFlat3> localSSA3=new ThreadLocal<SingleStateAlignerFlat3>(); 1294 private static ThreadLocal<SingleStateAlignerFlatFloat> localSSAF=new ThreadLocal<SingleStateAlignerFlatFloat>(); 1295 // public static int maxAlignmentEndpointDifference=15; 1296 public static int alignmentPadding=300; 1297 1298 public static int breakLimit=12; 1299 public static int lookbackPlus=70; 1300 public static int lookbackMinus=25; 1301 1302 // pathScore+=p0+p1*(Tools.mid(p5*(p2+pathLength), p6*(p3-pathLength), p4)); 1303 1304 //Operon length modifiers for same strand 1305 public static float p0=-30f; 1306 public static float p1=-0.35f; //Sensitive - higher decreases FP and TP 1307 public static float p2=4.0f;//Insensitive - higher positive decreases FP and TP 1308 public static float p3=12f; //Higher decreases FP (substantially) and TP (slightly) 1309 public static float p4=-10f; //Lower decreases FP (weakly) and TP (greatly) 1310 public static float p5=2.0f; //Insensitive - lower increases FP and TP 1311 public static float p6=2f; //Greater increases FP and TP 1312 1313 // pathScore+=q1+Tools.mid(q2*prevLength, q3+q4*prevLength, q5); 1314 1315 //Operon length modifiers for opposite strand 1316 public static float q1=-36f; 1317 public static float q2=-1.6f; //q2 and q4 must have opposite signs 1318 public static float q3=-12f; 1319 public static float q4=3.0f; //(Lower [even negative] decreases FP and TP) 1320 public static float q5=-40f; 1321 1322 private static PrintStream outstream=System.err; 1323 static boolean verbose; 1324 1325 } 1326