1 #ifndef __RLINK_H__
2 #define __RLINK_H__
3 #include "GArgs.h"
4 #include "GStr.h"
5 #include "gff.h"
6 #include "GBam.h"
7 #include "GBitVec.h"
8 #include "time.h"
9 #include "tablemaker.h"
10 #include "GIntHash.hh"
11 
12 #define MAX_NODE 1000000
13 #define KMER 31
14 
15 #define DROP 0.5
16 #define ERROR_PERC 0.1
17 
18 #define CHI_WIN 100
19 #define CHI_THR 50
20 
21 #define IS_FPKM_FLAG 1
22 #define IS_TPM_FLAG 2
23 #define IS_COV_FLAG 4
24 
25 const double epsilon=0.000001; //-E
26 const float trthr=1.0;   // transfrag pattern threshold
27 const float MIN_VAL=-100000.0;
28 const int MAX_MAXCOMP=200; // is 200 too much, or should I set it up to 150?
29 
30 //const uint largeintron=20000; // don't trust introns longer than this unless there is higher evidence; less than 10% of all human annotated introns are longer than this
31 //const uint longintron=70000; // don't trust introns longer than this unless there is higher evidence; about 98% of all human annotated introns are shorter than this
32 const uint longintron=100000; // don't trust introns longer than this unless there is higher evidence; about 99% of all human annotated introns are shorter than this
33 const uint longintronanchor=25; // I need a higher anchor for long introns -> use a smaller value here? i.e. 20?
34 //const uint verylongintron=100000; // don't trust introns longer than this unless there is higher evidence; about 99% of all human annotated introns are shorter than this
35 //const uint verylongintronanchor=35; // I need a higher anchor for very long introns
36 
37 const float mismatchfrac=0.02;
38 const float lowcov=1.5;
39 const float lowisofrac=0.02;
40 
41 const int max_trf_number=40000; // maximum number of transfrag accepted so that the memory doesn't blow up
42 
43 extern bool mergeMode;
44 extern bool forceBAM; //for stdin alignment data
45 
46 extern bool verbose;
47 extern bool debugMode;
48 
49 //collect all refguide transcripts for a single genomic sequence
50 struct GRefData {
51   GList<GffObj> rnas; //all transcripts on this genomic seq
52   //GList<GRefLocus> loci;
53   int gseq_id;
54   const char* gseq_name;
55    //GList<GTData> tdata; //transcript data (uptr holder for all rnas loaded here)
rnasGRefData56   GRefData(int gid=-1):rnas(false,true,false),gseq_id(gid),gseq_name(NULL) {
57     gseq_id=gid;
58     if (gseq_id>=0)
59        gseq_name=GffObj::names->gseqs.getName(gseq_id);
60   }
61 
addGRefData62   void add(GffReader* gffr, GffObj* t) {
63      if (gseq_id>=0) {
64         if (gseq_id!=t->gseq_id)
65            GError("Error: invalid call to GRefData::add() - different genomic sequence!\n");
66      }
67      else { //adding first transcript, initialize storage
68         gseq_id=t->gseq_id;
69         gseq_name=t->getGSeqName();
70         if (gffr->gseqtable[gseq_id]==NULL)
71             GError("Error: invalid genomic sequence data (%s)!\n",gseq_name);
72         rnas.setCapacity(gffr->gseqtable[gseq_id]->fcount);
73      }
74      rnas.Add(t);
75      t->isUsed(true);
76      //setLocus(t); //use the GRefLocus::mexons to quickly find an overlap with existing loci, or create a new one
77   }
78 
79   bool operator==(GRefData& d){
80     return gseq_id==d.gseq_id;
81   }
82   bool operator<(GRefData& d){
83     return (gseq_id<d.gseq_id);
84   }
85 };
86 
87 
88 struct CBundlenode:public GSeg {
89 	float cov;
90 	int bid; // bundle node id in bnode -> to easy retrieve it
91 	CBundlenode *nextnode; // next node in the same bundle
GSegCBundlenode92 	CBundlenode(int rstart=0, int rend=0, float _cov=0, int _bid=-1, CBundlenode *_nextnode=NULL):GSeg(rstart, rend),
93 			cov(_cov),bid(_bid),nextnode(_nextnode) {}
94 };
95 
96 // bundle data structure, holds all data needed for
97 // infering transcripts from a bundle
98 enum BundleStatus {
99 	BUNDLE_STATUS_CLEAR=0, //available for loading/prepping
100 	BUNDLE_STATUS_LOADING, //being prepared by the main thread (there can be only one)
101 	BUNDLE_STATUS_READY //ready to be processed, or being processed
102 };
103 
104 struct CBundle {
105 	int len;
106 	float cov;
107 	float multi;
108 	int startnode;  // id of start node in bundle of same strand
109 	int lastnodeid; // id of last node added to bundle
110 	CBundle(int _len=0, float _cov=0, float _multi=0, int _start=-1, int _last=-1):
lenCBundle111 			len(_len),cov(_cov),multi(_multi), startnode(_start),lastnodeid(_last) {}
112 };
113 
114 struct CPath {
115 	int node;
116 	int contnode;
117 	float abundance;
nodeCPath118 	CPath(int n1=0,int n2=0,float abund=0):node(n1),contnode(n2),abundance(abund){}
119 };
120 
121 struct CTransfrag {
122 	GVec<int> nodes;
123 	GBitVec pattern;
124 	float abundance;
125 	float srabund; // keeps abundance associated to srfrag
126 	GVec<CPath> path; // stores all the possible paths that leave from a node to reach next node in a transfrag, and distributes the abundance of the transfrag between all possible continuations
127 	float usepath;
128 	int weak; // number of weak links
129 	bool real:1;
130 	bool guide:1;
131 	uint longstart; // for long reads: min start of all longreads sharing transfrag
132 	uint longend; // for long reads: max end of all longreads sharing transfrag
nodesCTransfrag133 	CTransfrag(GVec<int>& _nodes,GBitVec& bit, float abund=0, bool treal=false, bool tguide=false,float sr=0):nodes(_nodes),pattern(bit),abundance(abund),srabund(sr),path(),usepath(-1),weak(-1),real(treal),guide(tguide),longstart(false),longend(false) {}
nodesCTransfrag134 	CTransfrag(float abund=0, bool treal=false,bool tguide=false):nodes(),pattern(),abundance(abund),srabund(0),path(),usepath(-1),weak(-1),real(treal),guide(tguide),longstart(false),longend(false) {
135 	}
136 };
137 
138 struct CMTransfrag { // this is the super-class for transfrag -> to use in case of merging transcripts
139 	CTransfrag *transfrag;
140 	GVec<int> read; // all reads' indeces that are connected to this transfrag
141 	int nf;
142 	int nl;
143 	uint len;
transfragCMTransfrag144 	CMTransfrag(CTransfrag *t=NULL):transfrag(t),read(),nf(0),nl(0),len(0) {}
145 };
146 
147 struct CGuide {
148 	CTransfrag *trf;
149 	//GffObj* t; // this is what I was using before but I need to store the guide index in guides instead
150 	//CGuide(CTransfrag* _trf=NULL, GffObj* _t=NULL):trf(_trf),t(_t) {}
151 	int g; // stores guide index in guides instead of the actual pointer
trfCGuide152 	CGuide(CTransfrag* _trf=NULL, int _g=-1):trf(_trf),g(_g) {}
153 };
154 
155 struct CPartGuide {
156 	int idx;
157 	int olen; // overlap with node
158 	int allolen; // overlap with all nodes
159 	int glen; // guide length
160 	bool terminal_in;
161 	bool terminal_out;
162 	float gcount;
163 	float cov; // assigned overall coverage so far
164 	float ncov; // new coverage in node
idxCPartGuide165 	CPartGuide(int _idx=0, int _olen=0, int _aolen=0, int _glen=0):idx(_idx),olen(_olen),allolen(_aolen),glen(_glen),
166 			terminal_in(false),terminal_out(false),gcount(0),cov(0),ncov(0) {}
167 };
168 
169 struct CTrGuidePat { // remember abundances based on node guide pattern
170 	GBitVec pat;
171 	float abund;
172 	bool terminal;
173 	GVec<int> g;
CTrGuidePatCTrGuidePat174 	CTrGuidePat():pat(),abund(0),terminal(false),g() {} // default constructor
CTrGuidePatCTrGuidePat175 	CTrGuidePat(GBitVec p, float a, bool _t): pat(p),abund(a),terminal(_t),g() {}
176 };
177 
178 struct CNodeGuide {
179 	GVec<CPartGuide> guide;
180 	GVec<CTrGuidePat> trcount;
181 	float sumtrcount; // sum of all used in guides transfrag abundances -> this is needed in order to see if a guide would explain all of them
CNodeGuideCNodeGuide182 	CNodeGuide():guide(),trcount(),sumtrcount(0) {}
183 };
184 
185 struct CGroup:public GSeg {
186 	int color;
187 	int grid;
188 	float cov_sum;
189 	float multi;
190 	float neg_prop; // proportion of negative reads assigned to group out of all positives and negatives
191 	CGroup *next_gr;
192 	CGroup(int rstart=0, int rend=0, int _color=-1, int _grid=0, float _cov_sum=0,
GSegCGroup193 			float _multi=0,float _neg_prop=0, CGroup *_next_gr=NULL): GSeg(rstart, rend), color(_color), grid(_grid),cov_sum(_cov_sum),
194 			multi(_multi), neg_prop(_neg_prop),next_gr(_next_gr) { }
195 };
196 
197 struct CMerge {
198 	GStr name;
199 	GVec<int> fidx; // file indices for the transcripts in the merge
nameCMerge200 	CMerge(const char* rname=NULL):name(rname),fidx() {}
201 };
202 
203 
204 struct CExon{
205 	int predno;
206 	int exonno;
207 	float exoncov;
prednoCExon208 	CExon(int p=0,int e=0,float c=0):predno(p),exonno(e),exoncov(c) {}
209 };
210 
211 struct TwoFloat{
212 	float start;
213 	float end;
startTwoFloat214 	TwoFloat(float v1=0,float v2=0):start(v1),end(v2) {}
215 };
216 
217 struct CPred{
218 	int predno;
219 	float cov;
prednoCPred220 	CPred(int p=0,float c=0):predno(p),cov(c) {}
221 };
222 
223 struct CLongTrf{
224 	int t;
225 	float cov;
226 	GVec<int> group;
tCLongTrf227 	CLongTrf(int tno=0,float c=0):t(tno),cov(c),group() {}
228 };
229 
230 struct CPrediction:public GSeg {
231 	int geneno;
232 	GffObj* t_eq; //equivalent reference transcript (guide)
233 	//char *id;
234 	float cov;
235 	char strand;
236 	//float frag; // counted number of fragments associated with prediction
237 	int tlen;
238 	bool flag;
239 	GVec<GSeg> exons;
240 	GVec<float> exoncov;
241 	GStr mergename;
242 	CPrediction(int _geneno=0, GffObj* guide=NULL, int gstart=0, int gend=0, float _cov=0, char _strand='.',
GSegCPrediction243 	int _len=0,bool f=true):GSeg(gstart,gend), geneno(_geneno),t_eq(guide),cov(_cov),strand(_strand),
244 	//CPrediction(int _geneno=0, char* _id=NULL,int gstart=0, int gend=0, float _cov=0, char _strand='.', float _frag=0,
245 	//		int _len=0,bool f=true):GSeg(gstart,gend), geneno(_geneno),id(_id),cov(_cov),strand(_strand),frag(_frag),
246 			tlen(_len),flag(f),exons(),exoncov(),mergename() {}
247 	void init(int _geneno=0, GffObj* guide=NULL, int gstart=0, int gend=0, float _cov=0, char _strand='.',
248 	          int _len=0) {
249 		geneno=_geneno;
250 		t_eq=guide;
251 		start=gstart;
252 		end=gend;
253 		cov=_cov;
254 		strand=_strand;
255 		tlen=_len;
256 		flag=true;
257 		exons.Clear();
258 		exoncov.Clear();
259 		mergename.clear();
260 	}
261 
CPredictionCPrediction262 	CPrediction(CPrediction& c):GSeg(c.start, c.end), geneno(c.geneno),
263 //			id(Gstrdup(c.id)), cov(c.cov), strand(c.strand), frag(c.frag), tlen(c.tlen), flag(c.flag),
264 			t_eq(c.t_eq), cov(c.cov), strand(c.strand), tlen(c.tlen), flag(c.flag),
265 	      exons(c.exons),  exoncov(c.exoncov), mergename(c.mergename) {}
~CPredictionCPrediction266 	~CPrediction() { //GFREE(id);
267 		}
268 };
269 
270 struct CMPrediction {
271 	CPrediction *p;
272 	GVec<int> nodes;
273 	GBitVec pat; // pattern of nodes and introns in prediction
274 	GBitVec b; // not retained introns
pCMPrediction275 	CMPrediction(CPrediction* _p=NULL): p(_p),nodes(),pat(),b() {}
CMPredictionCMPrediction276 	CMPrediction(CPrediction* _p,GVec<int>& _nodes,GBitVec& _pat, GBitVec& _b): p(_p),nodes(_nodes),pat(_pat),b(_b) {}
277 };
278 
279 struct CNodeCapacity {
280 	int id;
281 	bool left;
282 	float perc;
idCNodeCapacity283 	CNodeCapacity(int nid=0,bool leftnode=false,float p=0): id(nid),left(leftnode),perc(p) {}
284 };
285 
286 // this class keeps the gene predictions (linked bundle nodes initially)
287 struct CGene:public GSeg { // I don't necessarily need to make this a GSeg since I can get the start&end from the exons
288 	char strand;
289 	char* geneID;
290 	char* geneName;
291 	float cov;    // this is the actual gene coverage
292 	float covsum; // this is a sum of transcripts coverages -> this is what we need for FPKM and TPM estimations
293 	GVec<GSeg> exons;  // all possible exons in gene (those are bnodes in bundle)
GSegCGene294 	CGene(int gstart=0, int gend=0, char _strand='.',char *gid=NULL, char *gname=NULL):GSeg(gstart,gend),
295 		strand(_strand), geneID(gid), geneName(gname), exons() { cov=0; covsum=0;}
296 	// getGeneID() and getGeneName() functions of gffobj return pointers to this attributes in gffobj so I don't need to clean them up here
297 };
298 
299 //holding transcript info for --merge mode
300 struct TAlnInfo {
301 	GStr name; //transcript name
302 	int fileidx; //index of transcript file in the TInputFiles.files array
303 	double cov;
304 	double fpkm;
305 	double tpm;
306 	int g;
nameTAlnInfo307 	TAlnInfo(const char* rname=NULL, int fidx=0):name(rname), fileidx(fidx),
308 			cov(-1),fpkm(-1),tpm(-1),g(-1) { }
309 };
310 
311 struct CJunction;
312 
313 struct CReadAln:public GSeg {
314 	//DEBUG ONLY:
315 	// GStr name;
316 	char strand; // 1, 0 (unkown), -1 (reverse)
317 	short int nh;
318 	uint len;
319 	float read_count;       // keeps count for all reads (including paired and unpaired)
320 	bool unitig;
321 	GVec<float> pair_count;   // keeps count for all paired reads
322 	GVec<int> pair_idx;     // keeps indeces for all pairs in assembly mode, or all reads that were collapsed in merge mode
323 	GVec<GSeg> segs; //"exons"
324 	GPVec<CJunction> juncs;
325 	union {
326 		TAlnInfo* tinfo;
327 		bool in_guide;
328 	};
329 
330 	CReadAln(char _strand=0, short int _nh=0,
GSegCReadAln331 			int rstart=0, int rend=0, TAlnInfo* tif=NULL): GSeg(rstart, rend), //name(rname),
332 					strand(_strand),nh(_nh), len(0), read_count(0), unitig(false),pair_count(),pair_idx(),
333 					segs(), juncs(false), tinfo(tif) { }
CReadAlnCReadAln334 	CReadAln(CReadAln &rd):GSeg(rd.start,rd.end) { // copy contructor
335 		strand=rd.strand;
336 		nh=rd.nh;
337 		len=rd.len;
338 		read_count=rd.read_count;
339 		unitig=rd.unitig;
340 		pair_count=rd.pair_count;
341 		pair_idx=rd.pair_idx;
342 		juncs=rd.juncs;
343 		tinfo=rd.tinfo;
344 	}
overlapSegLenCReadAln345 	int overlapSegLen(CReadAln* r) {
346 
347 		if (r->start>end || start>r->end) return 0;
348 
349 		int i=0;
350 		int j=0;
351 		int len=0;
352 		while(i<segs.Count()) {
353 			if(segs[i].end<r->segs[j].start) i++;
354 			else if(r->segs[j].end<segs[i].start) j++;
355 			else { // there is overlap
356 				len+=segs[i].overlapLen(r->segs[j].start,r->segs[j].end);
357 				if(segs[i].end<r->segs[j].end) i++;
358 				else j++;
359 			}
360 			if(j==r->segs.Count()) break;
361 		}
362 		return len;
363 	}
~CReadAlnCReadAln364 	~CReadAln() { if(mergeMode) {delete tinfo;} }
365 };
366 
367 struct CGraphinfo {
368 	int ngraph;
369 	int nodeno;
ngraphCGraphinfo370 	CGraphinfo(int ng=-1,int nnode=-1):ngraph(ng),nodeno(nnode){}
371 };
372 
373 struct CGJunc {
374 	int leftnode;
375 	int rightnode;
376 	double cov; // ngood
377 	double goodcov; // ngood_reads
leftnodeCGJunc378 	CGJunc(int n1=0,int n2=0,double _cov=0,double _goodcov=0):leftnode(n1),rightnode(n2),cov(_cov),goodcov(_goodcov){}
379 };
380 
381 
382 struct CGNode {
383 	int id;    // initial id in graphno
384 	bool last; // if this is last node (to be linked to sink later)
385 	bool keep; // if I keep it in the final count (true by default)
386 	bool merge; // if this node needs to be merged to its adjacent node
387 	bool future;
idCGNode388 	CGNode(int _id=0,bool _last=false,bool _keep=true, bool _merge=false, bool _future=false):id(_id),last(_last),keep(_keep),merge(_merge),future(_future){}
389 };
390 
391 struct CTreePat {
392 	int nodeno;
393 	int childno;
394 	CTransfrag *tr;
395 	CTreePat **nextpat;
nodenoCTreePat396 	CTreePat(int n=0,int cno=0):nodeno(n),childno(cno),tr(NULL),nextpat(NULL){
397 		if(cno) {
398 			GCALLOC(nextpat,cno*sizeof(CTreePat *));
399 			for(int i=0;i<cno;i++) nextpat[i]=NULL;
400 		}
401 	}
setchildsCTreePat402 	void setchilds(int cno) {
403 		if(cno && !nextpat) {
404 			GCALLOC(nextpat,cno*sizeof(CTreePat *));
405 			for(int i=0;i<cno;i++) nextpat[i]=NULL;
406 		}
407 		childno=cno;
408 	}
settreeCTreePat409 	void settree(int i, CTreePat *t) {
410 		if(i<childno) nextpat[i]=t;
411 	}
settreeCTreePat412 	CTreePat *settree(int nextpos,int n,int cno) {
413 		if(nextpos<childno) {
414 			if(!nextpat[nextpos]) nextpat[nextpos]=new CTreePat(n,cno);
415 			return(nextpat[nextpos]);
416 		}
417 		return(NULL);
418 	}
419 };
420 
421 struct CTrimPoint { // this can work as a guide keeper too, where pos is the guideidx, abundance is the flow, and start is the included status
422 	uint pos;
423 	float abundance;
424 	bool start;
posCTrimPoint425 	CTrimPoint(uint _pos=0,float abund=0.0,bool _start=true):pos(_pos),abundance(abund),start(_start) {}
426 };
427 
428 struct CInterval {
429 	uint pos; // interval start position
430 	float val; // interval value or interval last position depending on use
431 	CInterval *next; // next interval;
posCInterval432 	CInterval(uint _pos=0,float _val=0,CInterval *_next=NULL):pos(_pos),val(_val),next(_next) {}
433 };
434 
435 
436 struct CMaxIntv:public GSeg {
437 	GVec<CExon> node;
438 	CMaxIntv *next; // next interval;
GSegCMaxIntv439 	CMaxIntv(uint start=0,uint end=0):GSeg(start,end),node(),next(NULL) {}
GSegCMaxIntv440 	CMaxIntv(GVec<CExon>& _node,uint start,uint end,CMaxIntv *_next=NULL):GSeg(start,end),node(_node),next(_next) {}
441 };
442 
443 struct GInterval {
444 	uint start;
445 	uint end;
446 	GInterval *next;
startGInterval447 	GInterval(uint _start, uint _end,GInterval *_next=NULL):start(_start),end(_end),next(_next) {}
448 };
449 
450 struct CTrInfo {
451 	int trno;
452 	float abundance;
453 	float penalty;
trnoCTrInfo454 	CTrInfo(int tr=-1,float _abund=0.0, float _pen=0.0):trno(tr),abundance(_abund),penalty(_pen) {}
455 };
456 
457 struct CNetEdge {
458 	int link;
459 	float rate;
460 	bool fake;
linkCNetEdge461 	CNetEdge(int lnk=0.0,float r=0.0, bool f=false):link(lnk),rate(r),fake(f){}
462 };
463 
464 struct CComponent {
465 	float size;
466 	GVec<int> *set;
sizeCComponent467 	CComponent(float _size=0.0,GVec<int> *_set=NULL):size(_size),set(_set) {}
~CComponentCComponent468 	~CComponent() { if(set) delete set;}
469 };
470 
471 struct GEdge { // guide edge
472 	// if val < endval then this is start; otherwise it is end
473 	uint val;  // value of the boundary
474 	uint endval; // value of the other exon boundary shared with val
475 	int strand;
476 	bool operator<(const GEdge& o) const {
477 		return(val<o.val || (val==o.val && strand<o.strand));
478 	}
479 	bool operator==(const GEdge& o) const {
480 		return(val==o.val && strand==o.strand);
481 	}
valGEdge482 	GEdge(uint _val=0,uint _endval=0,int _strand=0):val(_val),endval(_endval),strand(_strand) {}
483 };
484 
485 struct CGraphnode:public GSeg {
486 	int nodeid;
487 	float cov;
488 	float capacity; // sum of all transcripts abundances exiting and through node
489 	float rate; // conversion rate between in and out transfrags of node
490 	//float frag; // number of fragments included in node
491 	GVec<int> child;
492 	GVec<int> parent;
493 	GBitVec childpat;
494 	GBitVec parentpat;
495 	GVec<int> trf; // transfrags that pass the node
496 	//CGraphnode(int s=0,int e=0,unsigned int id=MAX_NODE,float nodecov=0,float cap=0,float r=0,float f=0):GSeg(s,e),nodeid(id),cov(nodecov),capacity(cap),rate(r),frag(f),child(),parent(),childpat(),parentpat(),trf(){}
GSegCGraphnode497 	CGraphnode(int s=0,int e=0,unsigned int id=MAX_NODE,float nodecov=0,float cap=0,float r=0):GSeg(s,e),
498 			nodeid(id),cov(nodecov),capacity(cap),rate(r),child(),parent(),childpat(),parentpat(),trf(){}
499 };
500 
501 // # 0: strand; 1: start; 2: end; 3: nreads; 4: nreads_good;
502 struct CJunction:public GSeg {
503 	char strand; //-1,0,1
504 	char guide_match; //exact match of a ref intron?
505 	char consleft; // -1,0,1 -1 is not set up, 0 is non consensus, 1 is consensus
506 	char consright; // -1,0,1 -1 is not set up, 0 is non consensus, 1 is consensus
507 	double nreads;
508 	double nreads_good;
509 	double leftsupport;
510 	double rightsupport;
511 	double nm; // number of reads with a high nm (high mismatch)
512 	double mm; // number of reads that support a junction with both anchors bigger than longintronanchor
GSegCJunction513 	CJunction(int s=0,int e=0, char _strand=0):GSeg(s,e),
514 			strand(_strand), guide_match(0), consleft(-1), consright(-1),nreads(0),nreads_good(0),
515 			leftsupport(0),rightsupport(0),nm(0),mm(0) {}
516 	bool operator<(CJunction& b) {
517 		if (start<b.start) return true;
518 		if (start>b.start) return false;
519 		if (end<b.end) return true;
520 		if (end>b.end) return false;
521 		if (strand>b.strand) return true;
522 		return false;
523 	}
524 	bool operator==(CJunction& b) {
525 		return (start==b.start && end==b.end && strand==b.strand);
526 	}
527 };
528 
529 struct GReadAlnData {
530 	GBamRecord* brec;
531 	char strand; //-1, 0, 1
532 	int nh;
533 	int hi;
534 	GPVec<CJunction> juncs;
535 	union {
536 		TAlnInfo* tinfo;
537 		bool in_guide;
538 	};
539 	//GPVec< GVec<RC_ExonOvl> > g_exonovls; //>5bp overlaps with guide exons, for each read "exon"
540 	GReadAlnData(GBamRecord* bamrec=NULL, char nstrand=0, int num_hits=0,
brecGReadAlnData541 			int hit_idx=0, TAlnInfo* tif=NULL):brec(bamrec), strand(nstrand),
542 					nh(num_hits), hi(hit_idx), juncs(true), tinfo(tif) { } //, g_exonovls(true)
~GReadAlnDataGReadAlnData543 	~GReadAlnData() { if(mergeMode) delete tinfo; }
544 };
545 
546 
547 /*
548 struct CTCov { //covered transcript info
549 	int first_cov_exon;
550 	int last_cov_exon;
551 	int numt;
552 	GffObj* guide;
553 	bool whole;
554 	CTCov(GffObj* t, int fex=-1, int lex=0, int ntr=0):first_cov_exon(fex), last_cov_exon(lex),
555 			   numt(ntr), guide(t), whole(false) {
556 		whole = (first_cov_exon<0);
557 	}
558 	void print(FILE* f) {
559 		if (whole) { //from get_covered()
560 			guide->printTranscriptGff(f);
561 		}
562 		else { //from get_partial_covered()
563 			bool partial=true;
564 			if (last_cov_exon<0) {
565 				if (guide->exons.Count()==1) partial=false;
566 				last_cov_exon=first_cov_exon;
567 			} else {
568 			 if(last_cov_exon-first_cov_exon+1==guide->exons.Count()) partial=false;
569 			}
570 			for(int i=first_cov_exon;i<=last_cov_exon;i++) {
571 				if(partial) fprintf(f, "%s\tpartial\texon\t%u\t%u\t.\t%c\t.\ttranscript_id \"%s_part%d\";\n",guide->getGSeqName(),
572 						guide->exons[i]->start,guide->exons[i]->end,guide->strand,guide->getID(), numt);
573 				else fprintf(f, "%s\tcomplete\texon\t%u\t%u\t.\t%c\t.\ttranscript_id \"%s\";\n",guide->getGSeqName(),
574 						guide->exons[i]->start,guide->exons[i]->end,guide->strand,guide->getID());
575 			}
576 		}
577 	}
578 };
579 */
580 
581 // bundle data structure, holds all input data parsed from BAM file
582 // - r216 regression
583 struct BundleData {
584  BundleStatus status;
585  //int64_t bamStart; //start of bundle in BAM file
586  int idx; //index in the main bundles array
587  int start;
588  int end;
589  unsigned long numreads; // number of reads in bundles
590  /*
591  float wnumreads; // NEW: weighted numreads; a multi-mapped read mapped in 2 places will contribute only 0.5
592  double sumreads; // sum of all reads' lengths in bundle
593  double sumfrag; // sum of all fragment lengths (this includes the insertion so it is an estimate)
594  float num_reads; // number of all reads in bundle that we considered (weighted)
595  float num_cov; // how many coverages we added (weighted) to obtain sumcov
596  float num_frag; // how many fragments we added to obtain sumfrag
597  double num_fragments3;
598  double sum_fragments3;
599 */
600  double num_fragments; //aligned read/pairs
601  double frag_len;
602  double sum_cov; // sum of all transcripts coverages --> needed to compute TPMs
603  char covflags;
604 
605  GStr refseq; //reference sequence name
606  char* gseq; //actual genomic sequence for the bundle
607  GList<CReadAln> readlist;
608  GVec<float> bpcov[3];   // this needs to be changed to a more inteligent way of storing the data
609  GList<CJunction> junction;
610  GPVec<GffObj> keepguides;
611  //GPVec<CTCov> covguides;
612  GList<CPrediction> pred;
613  RC_BundleData* rc_data;
BundleDataBundleData614  BundleData():status(BUNDLE_STATUS_CLEAR), idx(0), start(0), end(0),
615 		 numreads(0),
616 		 num_fragments(0), frag_len(0),sum_cov(0),covflags(0),
617 		 refseq(), gseq(NULL), readlist(false,true), //bpcov(1024),
618 		 junction(true, true, true),
619 		 keepguides(false), pred(false), rc_data(NULL) {
620 	 for(int i=0;i<3;i++) 	bpcov[i].setCapacity(1024);
621  }
622 
getReadyBundleData623  void getReady(int currentstart, int currentend) {
624 	 //this is only called when the bundle is valid and ready to be processed
625 	 start=currentstart;
626 	 end=currentend;
627 	 //refseq=ref;
628 	 //tag all these guides
629 	 for (int i=0;i<this->keepguides.Count();++i) {
630 		 RC_TData* tdata=(RC_TData*)(keepguides[i]->uptr);
631 		 tdata->in_bundle=1;
632 	 }
633 	 status=BUNDLE_STATUS_READY;
634  }
635 
rc_initBundleData636  void rc_init(GffObj* t, GPVec<RC_TData>* rc_tdata,
637 		 GPVec<RC_Feature>* rc_edata, GPVec<RC_Feature>* rc_idata) {
638 	  if (rc_data==NULL) {
639 	  	rc_data = new RC_BundleData(t->start, t->end,
640 	  			rc_tdata, rc_edata, rc_idata);
641 	  }
642  }
643  /* after reference annotation was loaded
644  void rc_finalize_refs() {
645      if (rc_data==NULL) return;
646      //rc_data->setupCov();
647 	}
648 	Not needed here, we update the coverage span as each transcript is added
649  */
650  void keepGuide(GffObj* scaff, GPVec<RC_TData>* rc_tdata=NULL,
651 		 GPVec<RC_Feature>* rc_edata=NULL, GPVec<RC_Feature>* rc_idata=NULL);
652 
653  //bool evalReadAln(GBamRecord& brec, char& strand, int nh); //, int hi);
654  bool evalReadAln(GReadAlnData& alndata, char& strand);
655 
ClearBundleData656  void Clear() {
657 	keepguides.Clear();
658 	pred.Clear();
659 	pred.setSorted(false);
660 	readlist.Clear();
661 	for(int i=0;i<3;i++) {
662 		bpcov[i].Clear();
663 		bpcov[i].setCapacity(1024);
664 	}
665 	junction.Clear();
666 	start=0;
667 	end=0;
668 	status=BUNDLE_STATUS_CLEAR;
669 	numreads=0;
670 	num_fragments=0;
671 	frag_len=0;
672 	sum_cov=0;
673 	covflags=0;
674 	delete rc_data;
675 	GFREE(gseq);
676 	rc_data=NULL;
677  }
678 
~BundleDataBundleData679  ~BundleData() {
680 	Clear();
681  }
682 };
683 
684 void processRead(int currentstart, int currentend, BundleData& bdata,
685 		 GHash<int>& hashread, GReadAlnData& alndata);
686 		 //GBamRecord& brec, char strand, int nh, int hi);
687 
688 void countFragment(BundleData& bdata, GBamRecord& brec, int hi,int nh);
689 
690 int printResults(BundleData* bundleData, int geneno, GStr& refname);
691 int printMergeResults(BundleData* bundleData, int geneno, GStr& refname);
692 
693 int infer_transcripts(BundleData* bundle);
694 
695 // --- utility functions
696 void printGff3Header(FILE* f, GArgs& args);
697 
698 void printTime(FILE* f);
699 
700 
701 #endif
702