1 #ifndef __RLINK_H__ 2 #define __RLINK_H__ 3 #include "GArgs.h" 4 #include "GStr.h" 5 #include "gff.h" 6 #include "GBam.h" 7 #include "GBitVec.h" 8 #include "time.h" 9 #include "tablemaker.h" 10 #include "GIntHash.hh" 11 12 #define MAX_NODE 1000000 13 #define KMER 31 14 15 #define DROP 0.5 16 #define ERROR_PERC 0.1 17 18 #define CHI_WIN 100 19 #define CHI_THR 50 20 21 #define IS_FPKM_FLAG 1 22 #define IS_TPM_FLAG 2 23 #define IS_COV_FLAG 4 24 25 const double epsilon=0.000001; //-E 26 const float trthr=1.0; // transfrag pattern threshold 27 const float MIN_VAL=-100000.0; 28 const int MAX_MAXCOMP=200; // is 200 too much, or should I set it up to 150? 29 30 //const uint largeintron=20000; // don't trust introns longer than this unless there is higher evidence; less than 10% of all human annotated introns are longer than this 31 //const uint longintron=70000; // don't trust introns longer than this unless there is higher evidence; about 98% of all human annotated introns are shorter than this 32 const uint longintron=100000; // don't trust introns longer than this unless there is higher evidence; about 99% of all human annotated introns are shorter than this 33 const uint longintronanchor=25; // I need a higher anchor for long introns -> use a smaller value here? i.e. 20? 34 //const uint verylongintron=100000; // don't trust introns longer than this unless there is higher evidence; about 99% of all human annotated introns are shorter than this 35 //const uint verylongintronanchor=35; // I need a higher anchor for very long introns 36 37 const float mismatchfrac=0.02; 38 const float lowcov=1.5; 39 const float lowisofrac=0.02; 40 41 const int max_trf_number=40000; // maximum number of transfrag accepted so that the memory doesn't blow up 42 43 extern bool mergeMode; 44 extern bool forceBAM; //for stdin alignment data 45 46 extern bool verbose; 47 extern bool debugMode; 48 49 //collect all refguide transcripts for a single genomic sequence 50 struct GRefData { 51 GList<GffObj> rnas; //all transcripts on this genomic seq 52 //GList<GRefLocus> loci; 53 int gseq_id; 54 const char* gseq_name; 55 //GList<GTData> tdata; //transcript data (uptr holder for all rnas loaded here) rnasGRefData56 GRefData(int gid=-1):rnas(false,true,false),gseq_id(gid),gseq_name(NULL) { 57 gseq_id=gid; 58 if (gseq_id>=0) 59 gseq_name=GffObj::names->gseqs.getName(gseq_id); 60 } 61 addGRefData62 void add(GffReader* gffr, GffObj* t) { 63 if (gseq_id>=0) { 64 if (gseq_id!=t->gseq_id) 65 GError("Error: invalid call to GRefData::add() - different genomic sequence!\n"); 66 } 67 else { //adding first transcript, initialize storage 68 gseq_id=t->gseq_id; 69 gseq_name=t->getGSeqName(); 70 if (gffr->gseqtable[gseq_id]==NULL) 71 GError("Error: invalid genomic sequence data (%s)!\n",gseq_name); 72 rnas.setCapacity(gffr->gseqtable[gseq_id]->fcount); 73 } 74 rnas.Add(t); 75 t->isUsed(true); 76 //setLocus(t); //use the GRefLocus::mexons to quickly find an overlap with existing loci, or create a new one 77 } 78 79 bool operator==(GRefData& d){ 80 return gseq_id==d.gseq_id; 81 } 82 bool operator<(GRefData& d){ 83 return (gseq_id<d.gseq_id); 84 } 85 }; 86 87 88 struct CBundlenode:public GSeg { 89 float cov; 90 int bid; // bundle node id in bnode -> to easy retrieve it 91 CBundlenode *nextnode; // next node in the same bundle GSegCBundlenode92 CBundlenode(int rstart=0, int rend=0, float _cov=0, int _bid=-1, CBundlenode *_nextnode=NULL):GSeg(rstart, rend), 93 cov(_cov),bid(_bid),nextnode(_nextnode) {} 94 }; 95 96 // bundle data structure, holds all data needed for 97 // infering transcripts from a bundle 98 enum BundleStatus { 99 BUNDLE_STATUS_CLEAR=0, //available for loading/prepping 100 BUNDLE_STATUS_LOADING, //being prepared by the main thread (there can be only one) 101 BUNDLE_STATUS_READY //ready to be processed, or being processed 102 }; 103 104 struct CBundle { 105 int len; 106 float cov; 107 float multi; 108 int startnode; // id of start node in bundle of same strand 109 int lastnodeid; // id of last node added to bundle 110 CBundle(int _len=0, float _cov=0, float _multi=0, int _start=-1, int _last=-1): lenCBundle111 len(_len),cov(_cov),multi(_multi), startnode(_start),lastnodeid(_last) {} 112 }; 113 114 struct CPath { 115 int node; 116 int contnode; 117 float abundance; nodeCPath118 CPath(int n1=0,int n2=0,float abund=0):node(n1),contnode(n2),abundance(abund){} 119 }; 120 121 struct CTransfrag { 122 GVec<int> nodes; 123 GBitVec pattern; 124 float abundance; 125 float srabund; // keeps abundance associated to srfrag 126 GVec<CPath> path; // stores all the possible paths that leave from a node to reach next node in a transfrag, and distributes the abundance of the transfrag between all possible continuations 127 float usepath; 128 int weak; // number of weak links 129 bool real:1; 130 bool guide:1; 131 uint longstart; // for long reads: min start of all longreads sharing transfrag 132 uint longend; // for long reads: max end of all longreads sharing transfrag nodesCTransfrag133 CTransfrag(GVec<int>& _nodes,GBitVec& bit, float abund=0, bool treal=false, bool tguide=false,float sr=0):nodes(_nodes),pattern(bit),abundance(abund),srabund(sr),path(),usepath(-1),weak(-1),real(treal),guide(tguide),longstart(false),longend(false) {} nodesCTransfrag134 CTransfrag(float abund=0, bool treal=false,bool tguide=false):nodes(),pattern(),abundance(abund),srabund(0),path(),usepath(-1),weak(-1),real(treal),guide(tguide),longstart(false),longend(false) { 135 } 136 }; 137 138 struct CMTransfrag { // this is the super-class for transfrag -> to use in case of merging transcripts 139 CTransfrag *transfrag; 140 GVec<int> read; // all reads' indeces that are connected to this transfrag 141 int nf; 142 int nl; 143 uint len; transfragCMTransfrag144 CMTransfrag(CTransfrag *t=NULL):transfrag(t),read(),nf(0),nl(0),len(0) {} 145 }; 146 147 struct CGuide { 148 CTransfrag *trf; 149 //GffObj* t; // this is what I was using before but I need to store the guide index in guides instead 150 //CGuide(CTransfrag* _trf=NULL, GffObj* _t=NULL):trf(_trf),t(_t) {} 151 int g; // stores guide index in guides instead of the actual pointer trfCGuide152 CGuide(CTransfrag* _trf=NULL, int _g=-1):trf(_trf),g(_g) {} 153 }; 154 155 struct CPartGuide { 156 int idx; 157 int olen; // overlap with node 158 int allolen; // overlap with all nodes 159 int glen; // guide length 160 bool terminal_in; 161 bool terminal_out; 162 float gcount; 163 float cov; // assigned overall coverage so far 164 float ncov; // new coverage in node idxCPartGuide165 CPartGuide(int _idx=0, int _olen=0, int _aolen=0, int _glen=0):idx(_idx),olen(_olen),allolen(_aolen),glen(_glen), 166 terminal_in(false),terminal_out(false),gcount(0),cov(0),ncov(0) {} 167 }; 168 169 struct CTrGuidePat { // remember abundances based on node guide pattern 170 GBitVec pat; 171 float abund; 172 bool terminal; 173 GVec<int> g; CTrGuidePatCTrGuidePat174 CTrGuidePat():pat(),abund(0),terminal(false),g() {} // default constructor CTrGuidePatCTrGuidePat175 CTrGuidePat(GBitVec p, float a, bool _t): pat(p),abund(a),terminal(_t),g() {} 176 }; 177 178 struct CNodeGuide { 179 GVec<CPartGuide> guide; 180 GVec<CTrGuidePat> trcount; 181 float sumtrcount; // sum of all used in guides transfrag abundances -> this is needed in order to see if a guide would explain all of them CNodeGuideCNodeGuide182 CNodeGuide():guide(),trcount(),sumtrcount(0) {} 183 }; 184 185 struct CGroup:public GSeg { 186 int color; 187 int grid; 188 float cov_sum; 189 float multi; 190 float neg_prop; // proportion of negative reads assigned to group out of all positives and negatives 191 CGroup *next_gr; 192 CGroup(int rstart=0, int rend=0, int _color=-1, int _grid=0, float _cov_sum=0, GSegCGroup193 float _multi=0,float _neg_prop=0, CGroup *_next_gr=NULL): GSeg(rstart, rend), color(_color), grid(_grid),cov_sum(_cov_sum), 194 multi(_multi), neg_prop(_neg_prop),next_gr(_next_gr) { } 195 }; 196 197 struct CMerge { 198 GStr name; 199 GVec<int> fidx; // file indices for the transcripts in the merge nameCMerge200 CMerge(const char* rname=NULL):name(rname),fidx() {} 201 }; 202 203 204 struct CExon{ 205 int predno; 206 int exonno; 207 float exoncov; prednoCExon208 CExon(int p=0,int e=0,float c=0):predno(p),exonno(e),exoncov(c) {} 209 }; 210 211 struct TwoFloat{ 212 float start; 213 float end; startTwoFloat214 TwoFloat(float v1=0,float v2=0):start(v1),end(v2) {} 215 }; 216 217 struct CPred{ 218 int predno; 219 float cov; prednoCPred220 CPred(int p=0,float c=0):predno(p),cov(c) {} 221 }; 222 223 struct CLongTrf{ 224 int t; 225 float cov; 226 GVec<int> group; tCLongTrf227 CLongTrf(int tno=0,float c=0):t(tno),cov(c),group() {} 228 }; 229 230 struct CPrediction:public GSeg { 231 int geneno; 232 GffObj* t_eq; //equivalent reference transcript (guide) 233 //char *id; 234 float cov; 235 char strand; 236 //float frag; // counted number of fragments associated with prediction 237 int tlen; 238 bool flag; 239 GVec<GSeg> exons; 240 GVec<float> exoncov; 241 GStr mergename; 242 CPrediction(int _geneno=0, GffObj* guide=NULL, int gstart=0, int gend=0, float _cov=0, char _strand='.', GSegCPrediction243 int _len=0,bool f=true):GSeg(gstart,gend), geneno(_geneno),t_eq(guide),cov(_cov),strand(_strand), 244 //CPrediction(int _geneno=0, char* _id=NULL,int gstart=0, int gend=0, float _cov=0, char _strand='.', float _frag=0, 245 // int _len=0,bool f=true):GSeg(gstart,gend), geneno(_geneno),id(_id),cov(_cov),strand(_strand),frag(_frag), 246 tlen(_len),flag(f),exons(),exoncov(),mergename() {} 247 void init(int _geneno=0, GffObj* guide=NULL, int gstart=0, int gend=0, float _cov=0, char _strand='.', 248 int _len=0) { 249 geneno=_geneno; 250 t_eq=guide; 251 start=gstart; 252 end=gend; 253 cov=_cov; 254 strand=_strand; 255 tlen=_len; 256 flag=true; 257 exons.Clear(); 258 exoncov.Clear(); 259 mergename.clear(); 260 } 261 CPredictionCPrediction262 CPrediction(CPrediction& c):GSeg(c.start, c.end), geneno(c.geneno), 263 // id(Gstrdup(c.id)), cov(c.cov), strand(c.strand), frag(c.frag), tlen(c.tlen), flag(c.flag), 264 t_eq(c.t_eq), cov(c.cov), strand(c.strand), tlen(c.tlen), flag(c.flag), 265 exons(c.exons), exoncov(c.exoncov), mergename(c.mergename) {} ~CPredictionCPrediction266 ~CPrediction() { //GFREE(id); 267 } 268 }; 269 270 struct CMPrediction { 271 CPrediction *p; 272 GVec<int> nodes; 273 GBitVec pat; // pattern of nodes and introns in prediction 274 GBitVec b; // not retained introns pCMPrediction275 CMPrediction(CPrediction* _p=NULL): p(_p),nodes(),pat(),b() {} CMPredictionCMPrediction276 CMPrediction(CPrediction* _p,GVec<int>& _nodes,GBitVec& _pat, GBitVec& _b): p(_p),nodes(_nodes),pat(_pat),b(_b) {} 277 }; 278 279 struct CNodeCapacity { 280 int id; 281 bool left; 282 float perc; idCNodeCapacity283 CNodeCapacity(int nid=0,bool leftnode=false,float p=0): id(nid),left(leftnode),perc(p) {} 284 }; 285 286 // this class keeps the gene predictions (linked bundle nodes initially) 287 struct CGene:public GSeg { // I don't necessarily need to make this a GSeg since I can get the start&end from the exons 288 char strand; 289 char* geneID; 290 char* geneName; 291 float cov; // this is the actual gene coverage 292 float covsum; // this is a sum of transcripts coverages -> this is what we need for FPKM and TPM estimations 293 GVec<GSeg> exons; // all possible exons in gene (those are bnodes in bundle) GSegCGene294 CGene(int gstart=0, int gend=0, char _strand='.',char *gid=NULL, char *gname=NULL):GSeg(gstart,gend), 295 strand(_strand), geneID(gid), geneName(gname), exons() { cov=0; covsum=0;} 296 // getGeneID() and getGeneName() functions of gffobj return pointers to this attributes in gffobj so I don't need to clean them up here 297 }; 298 299 //holding transcript info for --merge mode 300 struct TAlnInfo { 301 GStr name; //transcript name 302 int fileidx; //index of transcript file in the TInputFiles.files array 303 double cov; 304 double fpkm; 305 double tpm; 306 int g; nameTAlnInfo307 TAlnInfo(const char* rname=NULL, int fidx=0):name(rname), fileidx(fidx), 308 cov(-1),fpkm(-1),tpm(-1),g(-1) { } 309 }; 310 311 struct CJunction; 312 313 struct CReadAln:public GSeg { 314 //DEBUG ONLY: 315 // GStr name; 316 char strand; // 1, 0 (unkown), -1 (reverse) 317 short int nh; 318 uint len; 319 float read_count; // keeps count for all reads (including paired and unpaired) 320 bool unitig; 321 GVec<float> pair_count; // keeps count for all paired reads 322 GVec<int> pair_idx; // keeps indeces for all pairs in assembly mode, or all reads that were collapsed in merge mode 323 GVec<GSeg> segs; //"exons" 324 GPVec<CJunction> juncs; 325 union { 326 TAlnInfo* tinfo; 327 bool in_guide; 328 }; 329 330 CReadAln(char _strand=0, short int _nh=0, GSegCReadAln331 int rstart=0, int rend=0, TAlnInfo* tif=NULL): GSeg(rstart, rend), //name(rname), 332 strand(_strand),nh(_nh), len(0), read_count(0), unitig(false),pair_count(),pair_idx(), 333 segs(), juncs(false), tinfo(tif) { } CReadAlnCReadAln334 CReadAln(CReadAln &rd):GSeg(rd.start,rd.end) { // copy contructor 335 strand=rd.strand; 336 nh=rd.nh; 337 len=rd.len; 338 read_count=rd.read_count; 339 unitig=rd.unitig; 340 pair_count=rd.pair_count; 341 pair_idx=rd.pair_idx; 342 juncs=rd.juncs; 343 tinfo=rd.tinfo; 344 } overlapSegLenCReadAln345 int overlapSegLen(CReadAln* r) { 346 347 if (r->start>end || start>r->end) return 0; 348 349 int i=0; 350 int j=0; 351 int len=0; 352 while(i<segs.Count()) { 353 if(segs[i].end<r->segs[j].start) i++; 354 else if(r->segs[j].end<segs[i].start) j++; 355 else { // there is overlap 356 len+=segs[i].overlapLen(r->segs[j].start,r->segs[j].end); 357 if(segs[i].end<r->segs[j].end) i++; 358 else j++; 359 } 360 if(j==r->segs.Count()) break; 361 } 362 return len; 363 } ~CReadAlnCReadAln364 ~CReadAln() { if(mergeMode) {delete tinfo;} } 365 }; 366 367 struct CGraphinfo { 368 int ngraph; 369 int nodeno; ngraphCGraphinfo370 CGraphinfo(int ng=-1,int nnode=-1):ngraph(ng),nodeno(nnode){} 371 }; 372 373 struct CGJunc { 374 int leftnode; 375 int rightnode; 376 double cov; // ngood 377 double goodcov; // ngood_reads leftnodeCGJunc378 CGJunc(int n1=0,int n2=0,double _cov=0,double _goodcov=0):leftnode(n1),rightnode(n2),cov(_cov),goodcov(_goodcov){} 379 }; 380 381 382 struct CGNode { 383 int id; // initial id in graphno 384 bool last; // if this is last node (to be linked to sink later) 385 bool keep; // if I keep it in the final count (true by default) 386 bool merge; // if this node needs to be merged to its adjacent node 387 bool future; idCGNode388 CGNode(int _id=0,bool _last=false,bool _keep=true, bool _merge=false, bool _future=false):id(_id),last(_last),keep(_keep),merge(_merge),future(_future){} 389 }; 390 391 struct CTreePat { 392 int nodeno; 393 int childno; 394 CTransfrag *tr; 395 CTreePat **nextpat; nodenoCTreePat396 CTreePat(int n=0,int cno=0):nodeno(n),childno(cno),tr(NULL),nextpat(NULL){ 397 if(cno) { 398 GCALLOC(nextpat,cno*sizeof(CTreePat *)); 399 for(int i=0;i<cno;i++) nextpat[i]=NULL; 400 } 401 } setchildsCTreePat402 void setchilds(int cno) { 403 if(cno && !nextpat) { 404 GCALLOC(nextpat,cno*sizeof(CTreePat *)); 405 for(int i=0;i<cno;i++) nextpat[i]=NULL; 406 } 407 childno=cno; 408 } settreeCTreePat409 void settree(int i, CTreePat *t) { 410 if(i<childno) nextpat[i]=t; 411 } settreeCTreePat412 CTreePat *settree(int nextpos,int n,int cno) { 413 if(nextpos<childno) { 414 if(!nextpat[nextpos]) nextpat[nextpos]=new CTreePat(n,cno); 415 return(nextpat[nextpos]); 416 } 417 return(NULL); 418 } 419 }; 420 421 struct CTrimPoint { // this can work as a guide keeper too, where pos is the guideidx, abundance is the flow, and start is the included status 422 uint pos; 423 float abundance; 424 bool start; posCTrimPoint425 CTrimPoint(uint _pos=0,float abund=0.0,bool _start=true):pos(_pos),abundance(abund),start(_start) {} 426 }; 427 428 struct CInterval { 429 uint pos; // interval start position 430 float val; // interval value or interval last position depending on use 431 CInterval *next; // next interval; posCInterval432 CInterval(uint _pos=0,float _val=0,CInterval *_next=NULL):pos(_pos),val(_val),next(_next) {} 433 }; 434 435 436 struct CMaxIntv:public GSeg { 437 GVec<CExon> node; 438 CMaxIntv *next; // next interval; GSegCMaxIntv439 CMaxIntv(uint start=0,uint end=0):GSeg(start,end),node(),next(NULL) {} GSegCMaxIntv440 CMaxIntv(GVec<CExon>& _node,uint start,uint end,CMaxIntv *_next=NULL):GSeg(start,end),node(_node),next(_next) {} 441 }; 442 443 struct GInterval { 444 uint start; 445 uint end; 446 GInterval *next; startGInterval447 GInterval(uint _start, uint _end,GInterval *_next=NULL):start(_start),end(_end),next(_next) {} 448 }; 449 450 struct CTrInfo { 451 int trno; 452 float abundance; 453 float penalty; trnoCTrInfo454 CTrInfo(int tr=-1,float _abund=0.0, float _pen=0.0):trno(tr),abundance(_abund),penalty(_pen) {} 455 }; 456 457 struct CNetEdge { 458 int link; 459 float rate; 460 bool fake; linkCNetEdge461 CNetEdge(int lnk=0.0,float r=0.0, bool f=false):link(lnk),rate(r),fake(f){} 462 }; 463 464 struct CComponent { 465 float size; 466 GVec<int> *set; sizeCComponent467 CComponent(float _size=0.0,GVec<int> *_set=NULL):size(_size),set(_set) {} ~CComponentCComponent468 ~CComponent() { if(set) delete set;} 469 }; 470 471 struct GEdge { // guide edge 472 // if val < endval then this is start; otherwise it is end 473 uint val; // value of the boundary 474 uint endval; // value of the other exon boundary shared with val 475 int strand; 476 bool operator<(const GEdge& o) const { 477 return(val<o.val || (val==o.val && strand<o.strand)); 478 } 479 bool operator==(const GEdge& o) const { 480 return(val==o.val && strand==o.strand); 481 } valGEdge482 GEdge(uint _val=0,uint _endval=0,int _strand=0):val(_val),endval(_endval),strand(_strand) {} 483 }; 484 485 struct CGraphnode:public GSeg { 486 int nodeid; 487 float cov; 488 float capacity; // sum of all transcripts abundances exiting and through node 489 float rate; // conversion rate between in and out transfrags of node 490 //float frag; // number of fragments included in node 491 GVec<int> child; 492 GVec<int> parent; 493 GBitVec childpat; 494 GBitVec parentpat; 495 GVec<int> trf; // transfrags that pass the node 496 //CGraphnode(int s=0,int e=0,unsigned int id=MAX_NODE,float nodecov=0,float cap=0,float r=0,float f=0):GSeg(s,e),nodeid(id),cov(nodecov),capacity(cap),rate(r),frag(f),child(),parent(),childpat(),parentpat(),trf(){} GSegCGraphnode497 CGraphnode(int s=0,int e=0,unsigned int id=MAX_NODE,float nodecov=0,float cap=0,float r=0):GSeg(s,e), 498 nodeid(id),cov(nodecov),capacity(cap),rate(r),child(),parent(),childpat(),parentpat(),trf(){} 499 }; 500 501 // # 0: strand; 1: start; 2: end; 3: nreads; 4: nreads_good; 502 struct CJunction:public GSeg { 503 char strand; //-1,0,1 504 char guide_match; //exact match of a ref intron? 505 char consleft; // -1,0,1 -1 is not set up, 0 is non consensus, 1 is consensus 506 char consright; // -1,0,1 -1 is not set up, 0 is non consensus, 1 is consensus 507 double nreads; 508 double nreads_good; 509 double leftsupport; 510 double rightsupport; 511 double nm; // number of reads with a high nm (high mismatch) 512 double mm; // number of reads that support a junction with both anchors bigger than longintronanchor GSegCJunction513 CJunction(int s=0,int e=0, char _strand=0):GSeg(s,e), 514 strand(_strand), guide_match(0), consleft(-1), consright(-1),nreads(0),nreads_good(0), 515 leftsupport(0),rightsupport(0),nm(0),mm(0) {} 516 bool operator<(CJunction& b) { 517 if (start<b.start) return true; 518 if (start>b.start) return false; 519 if (end<b.end) return true; 520 if (end>b.end) return false; 521 if (strand>b.strand) return true; 522 return false; 523 } 524 bool operator==(CJunction& b) { 525 return (start==b.start && end==b.end && strand==b.strand); 526 } 527 }; 528 529 struct GReadAlnData { 530 GBamRecord* brec; 531 char strand; //-1, 0, 1 532 int nh; 533 int hi; 534 GPVec<CJunction> juncs; 535 union { 536 TAlnInfo* tinfo; 537 bool in_guide; 538 }; 539 //GPVec< GVec<RC_ExonOvl> > g_exonovls; //>5bp overlaps with guide exons, for each read "exon" 540 GReadAlnData(GBamRecord* bamrec=NULL, char nstrand=0, int num_hits=0, brecGReadAlnData541 int hit_idx=0, TAlnInfo* tif=NULL):brec(bamrec), strand(nstrand), 542 nh(num_hits), hi(hit_idx), juncs(true), tinfo(tif) { } //, g_exonovls(true) ~GReadAlnDataGReadAlnData543 ~GReadAlnData() { if(mergeMode) delete tinfo; } 544 }; 545 546 547 /* 548 struct CTCov { //covered transcript info 549 int first_cov_exon; 550 int last_cov_exon; 551 int numt; 552 GffObj* guide; 553 bool whole; 554 CTCov(GffObj* t, int fex=-1, int lex=0, int ntr=0):first_cov_exon(fex), last_cov_exon(lex), 555 numt(ntr), guide(t), whole(false) { 556 whole = (first_cov_exon<0); 557 } 558 void print(FILE* f) { 559 if (whole) { //from get_covered() 560 guide->printTranscriptGff(f); 561 } 562 else { //from get_partial_covered() 563 bool partial=true; 564 if (last_cov_exon<0) { 565 if (guide->exons.Count()==1) partial=false; 566 last_cov_exon=first_cov_exon; 567 } else { 568 if(last_cov_exon-first_cov_exon+1==guide->exons.Count()) partial=false; 569 } 570 for(int i=first_cov_exon;i<=last_cov_exon;i++) { 571 if(partial) fprintf(f, "%s\tpartial\texon\t%u\t%u\t.\t%c\t.\ttranscript_id \"%s_part%d\";\n",guide->getGSeqName(), 572 guide->exons[i]->start,guide->exons[i]->end,guide->strand,guide->getID(), numt); 573 else fprintf(f, "%s\tcomplete\texon\t%u\t%u\t.\t%c\t.\ttranscript_id \"%s\";\n",guide->getGSeqName(), 574 guide->exons[i]->start,guide->exons[i]->end,guide->strand,guide->getID()); 575 } 576 } 577 } 578 }; 579 */ 580 581 // bundle data structure, holds all input data parsed from BAM file 582 // - r216 regression 583 struct BundleData { 584 BundleStatus status; 585 //int64_t bamStart; //start of bundle in BAM file 586 int idx; //index in the main bundles array 587 int start; 588 int end; 589 unsigned long numreads; // number of reads in bundles 590 /* 591 float wnumreads; // NEW: weighted numreads; a multi-mapped read mapped in 2 places will contribute only 0.5 592 double sumreads; // sum of all reads' lengths in bundle 593 double sumfrag; // sum of all fragment lengths (this includes the insertion so it is an estimate) 594 float num_reads; // number of all reads in bundle that we considered (weighted) 595 float num_cov; // how many coverages we added (weighted) to obtain sumcov 596 float num_frag; // how many fragments we added to obtain sumfrag 597 double num_fragments3; 598 double sum_fragments3; 599 */ 600 double num_fragments; //aligned read/pairs 601 double frag_len; 602 double sum_cov; // sum of all transcripts coverages --> needed to compute TPMs 603 char covflags; 604 605 GStr refseq; //reference sequence name 606 char* gseq; //actual genomic sequence for the bundle 607 GList<CReadAln> readlist; 608 GVec<float> bpcov[3]; // this needs to be changed to a more inteligent way of storing the data 609 GList<CJunction> junction; 610 GPVec<GffObj> keepguides; 611 //GPVec<CTCov> covguides; 612 GList<CPrediction> pred; 613 RC_BundleData* rc_data; BundleDataBundleData614 BundleData():status(BUNDLE_STATUS_CLEAR), idx(0), start(0), end(0), 615 numreads(0), 616 num_fragments(0), frag_len(0),sum_cov(0),covflags(0), 617 refseq(), gseq(NULL), readlist(false,true), //bpcov(1024), 618 junction(true, true, true), 619 keepguides(false), pred(false), rc_data(NULL) { 620 for(int i=0;i<3;i++) bpcov[i].setCapacity(1024); 621 } 622 getReadyBundleData623 void getReady(int currentstart, int currentend) { 624 //this is only called when the bundle is valid and ready to be processed 625 start=currentstart; 626 end=currentend; 627 //refseq=ref; 628 //tag all these guides 629 for (int i=0;i<this->keepguides.Count();++i) { 630 RC_TData* tdata=(RC_TData*)(keepguides[i]->uptr); 631 tdata->in_bundle=1; 632 } 633 status=BUNDLE_STATUS_READY; 634 } 635 rc_initBundleData636 void rc_init(GffObj* t, GPVec<RC_TData>* rc_tdata, 637 GPVec<RC_Feature>* rc_edata, GPVec<RC_Feature>* rc_idata) { 638 if (rc_data==NULL) { 639 rc_data = new RC_BundleData(t->start, t->end, 640 rc_tdata, rc_edata, rc_idata); 641 } 642 } 643 /* after reference annotation was loaded 644 void rc_finalize_refs() { 645 if (rc_data==NULL) return; 646 //rc_data->setupCov(); 647 } 648 Not needed here, we update the coverage span as each transcript is added 649 */ 650 void keepGuide(GffObj* scaff, GPVec<RC_TData>* rc_tdata=NULL, 651 GPVec<RC_Feature>* rc_edata=NULL, GPVec<RC_Feature>* rc_idata=NULL); 652 653 //bool evalReadAln(GBamRecord& brec, char& strand, int nh); //, int hi); 654 bool evalReadAln(GReadAlnData& alndata, char& strand); 655 ClearBundleData656 void Clear() { 657 keepguides.Clear(); 658 pred.Clear(); 659 pred.setSorted(false); 660 readlist.Clear(); 661 for(int i=0;i<3;i++) { 662 bpcov[i].Clear(); 663 bpcov[i].setCapacity(1024); 664 } 665 junction.Clear(); 666 start=0; 667 end=0; 668 status=BUNDLE_STATUS_CLEAR; 669 numreads=0; 670 num_fragments=0; 671 frag_len=0; 672 sum_cov=0; 673 covflags=0; 674 delete rc_data; 675 GFREE(gseq); 676 rc_data=NULL; 677 } 678 ~BundleDataBundleData679 ~BundleData() { 680 Clear(); 681 } 682 }; 683 684 void processRead(int currentstart, int currentend, BundleData& bdata, 685 GHash<int>& hashread, GReadAlnData& alndata); 686 //GBamRecord& brec, char strand, int nh, int hi); 687 688 void countFragment(BundleData& bdata, GBamRecord& brec, int hi,int nh); 689 690 int printResults(BundleData* bundleData, int geneno, GStr& refname); 691 int printMergeResults(BundleData* bundleData, int geneno, GStr& refname); 692 693 int infer_transcripts(BundleData* bundle); 694 695 // --- utility functions 696 void printGff3Header(FILE* f, GArgs& args); 697 698 void printTime(FILE* f); 699 700 701 #endif 702