1 #ifndef GTF_TRACKING_H
2 #define GTF_TRACKING_H
3 /*
4  *  gtf_tracking.h
5  *
6  */
7 
8 #ifdef HEAPROFILE
9 #include "gperftools/heap-profiler.h"
10 #endif
11 
12 #include "gff.h"
13 #include "GFaSeqGet.h"
14 #include "GFastaIndex.h"
15 #include "GStr.h"
16 
17 #define MAX_QFILES 500
18 
19 extern bool gtf_tracking_verbose;
20 
21 extern bool gtf_tracking_largeScale;
22 //many input files, no accuracy stats are generated, no *.tmap
23 // and exon attributes are discarded
24 
25 int cmpByPtr(const pointer p1, const pointer p2);
26 
27 bool t_contains(GffObj& a, GffObj& b);
28 //returns true only IF b has fewer exons than a AND a "contains" b
29 
30 char* getGSeqName(int gseq_id);
31 
32 //genomic fasta sequence handling
33 class GFastaHandler {
34  public:
35   char* fastaPath;
36   GFastaIndex* faIdx;
getFastaFile(int gseq_id)37   char* getFastaFile(int gseq_id) {
38      if (fastaPath==NULL) return NULL;
39      GStr s(fastaPath);
40      s.trimR('/');
41      s.appendfmt("/%s",getGSeqName(gseq_id));
42      GStr sbase(s);
43      if (!fileExists(s.chars())) s.append(".fa");
44      if (!fileExists(s.chars())) s.append("sta");
45      if (fileExists(s.chars())) return Gstrdup(s.chars());
46          else {
47              GMessage("Warning: cannot find genomic sequence file %s{.fa,.fasta}\n",sbase.chars());
48              return NULL;
49              }
50      }
51 
52    GFastaHandler(const char* fpath=NULL) {
53      fastaPath=NULL;
54      faIdx=NULL;
55      if (fpath!=NULL && fpath[0]!=0) init(fpath);
56      }
57 
init(const char * fpath)58    void init(const char* fpath) {
59      if (fpath==NULL || fpath[0]==0) return;
60      if (!fileExists(fpath))
61        GError("Error: file/directory %s does not exist!\n",fpath);
62      fastaPath=Gstrdup(fpath);
63      if (fastaPath!=NULL) {
64          if (fileExists(fastaPath)>1) { //exists and it's not a directory
65             GStr fainame(fastaPath);
66             //the .fai name might have been given directly
67             if (fainame.rindex(".fai")==fainame.length()-4) {
68                //.fai index file given directly
69                fastaPath[fainame.length()-4]=0;
70                if (!fileExists(fastaPath))
71                   GError("Error: cannot find fasta file for index %s !\n", fastaPath);
72                }
73               else fainame.append(".fai");
74             //fainame.append(".fai");
75             faIdx=new GFastaIndex(fastaPath,fainame.chars());
76             GStr fainamecwd(fainame);
77             int ip=-1;
78             if ((ip=fainamecwd.rindex('/'))>=0)
79                fainamecwd.cut(0,ip+1);
80             if (!faIdx->hasIndex()) { //could not load index
81                //try current directory
82                   if (fainame!=fainamecwd) {
83                     if (fileExists(fainamecwd.chars())>1) {
84                        faIdx->loadIndex(fainamecwd.chars());
85                        }
86                     }
87                   } //tried to load index
88             if (!faIdx->hasIndex()) {
89                  GMessage("No fasta index found for %s. Rebuilding, please wait..\n",fastaPath);
90                  faIdx->buildIndex();
91                  if (faIdx->getCount()==0) GError("Error: no fasta records found!\n");
92                  GMessage("Fasta index rebuilt.\n");
93                  FILE* fcreate=fopen(fainame.chars(), "w");
94                  if (fcreate==NULL) {
95                    GMessage("Warning: cannot create fasta index %s! (permissions?)\n", fainame.chars());
96                    if (fainame!=fainamecwd) fcreate=fopen(fainamecwd.chars(), "w");
97                    if (fcreate==NULL)
98                       GError("Error: cannot create fasta index %s!\n", fainamecwd.chars());
99                    }
100                  if (faIdx->storeIndex(fcreate)<faIdx->getCount())
101                      GMessage("Warning: error writing the index file!\n");
102                  } //index created and attempted to store it
103             } //multi-fasta
104          } //genomic sequence given
105      }
106    GFaSeqGet* fetch(int gseq_id, bool checkFasta=false) {
107      if (fastaPath==NULL) return NULL;
108      //genomic sequence given
109      GFaSeqGet* faseq=NULL;
110      if (faIdx!=NULL) { //fastaPath was the multi-fasta file name
111         char* gseqname=getGSeqName(gseq_id);
112         GFastaRec* farec=faIdx->getRecord(gseqname);
113         if (farec!=NULL) {
114              faseq=new GFaSeqGet(fastaPath,farec->seqlen, farec->fpos,
115                                farec->line_len, farec->line_blen);
116              faseq->loadall(); //just cache the whole sequence, it's faster
117              }
118         else {
119           GMessage("Warning: couldn't find fasta record for '%s'!\n",gseqname);
120           return NULL;
121           }
122         }
123      else //if (fileExists(fastaPath)==1)
124         {
125          char* sfile=getFastaFile(gseq_id);
126          if (sfile!=NULL) {
127             //if (gtf_tracking_verbose)
128             //   GMessage("Processing sequence from fasta file '%s'\n",sfile);
129             faseq=new GFaSeqGet(sfile,checkFasta);
130             faseq->loadall();
131             GFREE(sfile);
132             }
133          } //one fasta file per contig
134        return faseq;
135      }
136 
~GFastaHandler()137    ~GFastaHandler() {
138      GFREE(fastaPath);
139      delete faIdx;
140      }
141 };
142 
143 
144 
145 bool betterRef(GffObj* a, GffObj* b); //for better CovLink reference ranking
146 
147 class GLocus;
148 
149 class COvLink {
150 public:
coderank(char c)151 	static int coderank(char c) {
152 		switch (c) {
153 			case '=': return 0; //ichain match
154 			case 'c': return 2; //containment (ichain fragment)
155 			case 'j': return 4; // overlap with at least a junction match
156 			case 'e': return 6; // single exon transfrag overlapping an intron of reference (possible pre-mRNA)
157 			case 'o': return 8; // generic exon overlap
158 			case 's': return 16; //"shadow" - an intron overlaps with a ref intron on the opposite strand
159 			case 'x': return 18; // exon overlap on opposite strand (usually wrong strand mapping)
160 			case 'i': return 20; // intra-intron
161 			case 'p': return 90; //polymerase run
162 			case 'r': return 92; //repeats
163 			case 'u': return 94; //intergenic
164 			case  0 : return 100;
165 			default: return 96;
166 			}
167 	}
168     char code;
169     int rank;
170     GffObj* mrna;
171     int ovlen;
172     COvLink(char c=0,GffObj* m=NULL, int ovl=0) {
173 		code=c;
174 		mrna=m;
175 		ovlen=ovl;
176 		rank=coderank(c);
177 	}
178     bool operator<(COvLink& b) {
179 		if (rank==b.rank)
180 			return (ovlen==b.ovlen)? betterRef(mrna, b.mrna) : (ovlen>b.ovlen);
181 		else return rank<b.rank;
182 	}
183     bool operator==(COvLink& b) {
184 		return (rank==b.rank && mrna==b.mrna);
185 	}
186 };
187 
188 class GISeg: public GSeg {
189  public:
190    GffObj* t; //pointer to the largest transcript with a segment this exact exon coordinates
GSeg(s,e)191    GISeg(uint s=0,uint e=0, GffObj* ot=NULL):GSeg(s,e) { t=ot; }
192 };
193 
194 class GIArray:public GArray<GISeg> {
195   public:
196    GIArray(bool uniq=true):GArray<GISeg>(true,uniq) { }
IAdd(GISeg * item)197    int IAdd(GISeg* item) {
198      if (item==NULL) return -1;
199      int result=-1;
200      if (Found(*item, result)) {
201          if (fUnique) {
202            //cannot add a duplicate, return index of existing item
203            if (item->t!=NULL && fArray[result].t!=NULL &&
204                   item->t->covlen>fArray[result].t->covlen)
205                fArray[result].t=item->t;
206            return result;
207            }
208          }
209      //Found sets result to the position where the item should be
210      idxInsert(result, *item);
211      return result;
212      }
213 
214 };
215 
216 class CEqList: public GList<GffObj> {
217   public:
218     GffObj* head;
CEqList()219     CEqList():GList<GffObj>((GCompareProc*)cmpByPtr, (GFreeProc*)NULL, true) {
220       head=NULL;
221       }
222 };
223 
224 class CTData { //transcript associated data
225 public:
226 	GffObj* mrna; //owner transcript
227 	GLocus* locus;
228 	GList<COvLink> ovls; //overlaps with other transcripts (ref vs query)
229 	//-- just for ichain match tracking:
230 	GffObj* eqref; //ref transcript having an ichain match
231 	int qset; //qry set index (qfidx), -1 means reference dataset
232 	//GffObj* eqnext; //next GffObj in the linked list of matching transfrags
233 	bool eqhead;
234 	CEqList* eqlist; //keep track of matching transfrags
235 	//int eqdata; // flags for EQ list (is it a list head?)
236 	// Cufflinks specific data:
237 	double FPKM;
238 	double conf_hi;
239 	double conf_lo;
240 	double cov;
241 	char classcode; //the best/final classcode
ovls(true,true,true)242 	CTData(GffObj* m=NULL, GLocus* l=NULL):ovls(true,true,true) {
243 		mrna=m;
244 		if (mrna!=NULL) mrna->uptr=this;
245 		locus=l;
246 		classcode=0;
247 		eqref=NULL;
248 		eqhead=false;
249 		eqlist=NULL;
250 		qset=-2;
251 		FPKM=0;
252 		conf_lo=0;
253 		conf_hi=0;
254 		cov=0;
255 	}
256 
~CTData()257 	~CTData() {
258 		ovls.Clear();
259 		//if ((eqdata & EQHEAD_TAG)!=0) delete eqlist;
260 		//if (isEqHead()) delete eqlist;
261 		if (eqhead) delete eqlist;
262 	}
263 
264   //inline bool eqHead() { return ((eqdata & EQHEAD_TAG)!=0); }
265  /*  bool isEqHead() {
266       if (eqlist==NULL) return false;
267       return (eqlist->head==this->mrna);
268       }
269   */
joinEqList(GffObj * m)270   void joinEqList(GffObj* m) { //add list from m
271    //list head is set to the transfrag with the lower qset#
272   CTData* md=(CTData*)(m->uptr);
273   //ASSERT(md);
274   if (eqlist==NULL) { //no eqlist yet for this node
275      if (md->eqlist!=NULL) { //m in an eqlist already
276           eqlist=md->eqlist;
277           eqlist->Add(this->mrna);
278           CTData* md_head_d=(CTData*)(md->eqlist->head->uptr);
279           if (this->qset < md_head_d->qset) {
280                eqlist->head=this->mrna;
281                eqhead=true;
282                md_head_d->eqhead=false;
283                }
284         }
285         else { //m was not in an EQ list either
286           eqlist=new CEqList();
287           eqlist->Add(this->mrna);
288           eqlist->Add(m);
289           md->eqlist=eqlist;
290           if (qset<md->qset) {
291         	eqlist->head=this->mrna;
292         	eqhead=true;
293           }
294           else  {
295         	eqlist->head=m;
296         	md->eqhead=true;
297           }
298         }
299       }//no eqlist before
300      else { //merge two eqlists
301       if (eqlist==md->eqlist) //already in the same eqlist, nothing to do
302          return;
303       if (md->eqlist!=NULL) {
304         //copy the smaller list into the larger one
305         CEqList* srclst, *destlst;
306         if (md->eqlist->Count()<eqlist->Count()) {
307            srclst=md->eqlist;
308            destlst=eqlist;
309            }
310          else {
311            srclst=eqlist;
312            destlst=md->eqlist;
313            }
314          for (int i=0;i<srclst->Count();i++) {
315            destlst->Add(srclst->Get(i));
316            CTData* od=(CTData*)((*srclst)[i]->uptr);
317            od->eqlist=destlst;
318            }
319         this->eqlist=destlst;
320         CTData* s_head_d=(CTData*)(srclst->head->uptr);
321         CTData* d_head_d=(CTData*)(destlst->head->uptr);
322         if (s_head_d->qset < d_head_d->qset ) {
323              this->eqlist->head=srclst->head;
324              s_head_d->eqhead=true;
325              d_head_d->eqhead=false;
326         }
327         else {
328           s_head_d->eqhead=false;
329           d_head_d->eqhead=true;
330         }
331         delete srclst;
332       }
333       else { //md->eqlist==NULL
334         eqlist->Add(m);
335         md->eqlist=eqlist;
336         CTData* head_d=(CTData*)(eqlist->head->uptr);
337         if (md->qset<head_d->qset) {
338           eqlist->head=m;
339           md->eqhead=true;
340         }
341       }
342     }
343   }
344 
345 	void addOvl(char code,GffObj* target=NULL, int ovlen=0) {
346 		ovls.AddIfNew(new COvLink(code, target, ovlen));
347 	}
getBestCode()348 	char getBestCode() {
349 		return (ovls.Count()>0) ? ovls[0]->code : 0 ;
350     }
351 	bool operator<(CTData& b) { return (mrna < b.mrna); }
352 	bool operator==(CTData& b) { return (mrna==b.mrna); }
353 };
354 
355 class GSuperLocus;
356 class GTrackLocus;
357 class GXLocus;
358 
359 class GXSeg : public GSeg {
360 public:
361 	int flags;
GSeg(s,e)362 	GXSeg(uint s=0, uint e=0, int f=0):GSeg(s,e),flags(f) { }
363 };
364 
365 //Data structure holding a query locus data (overlapping mRNAs on the same strand)
366 // and also the accuracy data of all mRNAs of a query locus
367 // (against all reference loci overlapping the same region)
368 class GLocus:public GSeg {
369 public:
370     int gseq_id; //id of underlying genomic sequence
371     int qfidx; // for locus tracking
372     GTrackLocus* t_ptr; //for locus tracking cluster
373     GffObj* mrna_maxcov;  //transcript with maximum coverage (for main "ref" transcript)
374     GffObj* mrna_maxscore; //transcript with maximum gscore (for major isoform)
375     GList<GffObj> mrnas; //list of transcripts (isoforms) for this locus
376 	GArray<GXSeg> uexons; //list of unique exons (covered segments) in this region
377 	GArray<GSeg> mexons; //list of merged exons in this region
378 	GIArray introns;
379 	GList<GLocus> cmpovl; //temp list of overlapping qry/ref loci to compare to (while forming superloci)
380 
381 	//only for reference loci --> keep track of all superloci found for each qry dataset
382 	//                           which contain this reference locus
383 	GList<GSuperLocus>* superlst;
384 	GXLocus* xlocus; //superlocus formed by exon overlaps across all qry datasets
385 	// -- if genomic sequence was given:
386 	int spl_major; // number of GT-AG splice site consensi
387 	int spl_rare; // number of GC-AG, AT-AC and other rare splice site consensi
388 	int spl_wrong; //number of "wrong" (unrecognized) splice site consensi
389 	int ichains; //number of multi-exon mrnas
390 	int ichainTP; //number of intron chains fully matching reference introns
391 	//int ichainATP;
392 	int mrnaTP;
393 	//int mrnaATP;
394 	int v; //user flag/data
mrnas(true,false,false)395 	GLocus(GffObj* mrna=NULL, int qidx=-1):mrnas(true,false,false),uexons(true,true),mexons(true,true),
396 	  introns(), cmpovl(true,false,true) {
397 		//this will NOT free mrnas!
398 		ichains=0;
399 		gseq_id=-1;
400 		qfidx=qidx;
401 		t_ptr=NULL;
402 		creset();
403 		xlocus=NULL;
404 		mrna_maxcov=NULL;
405 		mrna_maxscore=NULL;
406 		superlst=new GList<GSuperLocus>(true,false,false);
407 		if (mrna!=NULL) {
408 			start=mrna->exons.First()->start;
409 			end=mrna->exons.Last()->end;;
410 			gseq_id=mrna->gseq_id;
411 			GISeg seg;
412 			for (int i=0;i<mrna->exons.Count();i++) {
413 				seg.start=mrna->exons[i]->start;
414 				seg.end=mrna->exons[i]->end;
415 				int flags=0; //terminal exon flags (1=left end, 2=right end)
416 				if (i==0) flags|=1;
417 				if (i==mrna->exons.Count()-1) flags|=2;
418 				GXSeg xseg(seg.start, seg.end, flags);
419 				uexons.Add(xseg);
420 				mexons.Add(seg);
421 				if (i>0) {
422 					seg.start=mrna->exons[i-1]->end+1;
423 					seg.end=mrna->exons[i]->start-1;
424 					seg.t=mrna;
425 					introns.Add(seg);
426 				}
427 			}
428 			mrnas.Add(mrna);
429 			if (mrna->exons.Count()>1) ichains++;
430 			((CTData*)(mrna->uptr))->locus=this;
431 			mrna_maxscore=mrna;
432 			mrna_maxcov=mrna;
433 		}
434 	}
~GLocus()435 	~GLocus() {
436 		delete superlst;
437 	}
creset()438 	void creset() {
439 		spl_major=0;spl_rare=0;spl_wrong=0;
440 		v=0; //visited/other data
441 		ichainTP=0;
442 		//ichainATP=0;
443 		mrnaTP=0;
444 		//mrnaATP=0;
445 		cmpovl.Clear();
446 	}
447 
addMerge(GLocus & locus,GffObj * lnkmrna)448 	void addMerge(GLocus& locus, GffObj* lnkmrna) {
449 		//add all the elements of the other locus (merging)
450 		//-- merge mexons
451 		GArray<int> ovlexons(true,true); //list of locus.mexons indexes overlapping existing mexons
452 		int i=0; //index of first mexons with a merge
453 		int j=0; //index current mrna exon
454 		while (i<mexons.Count() && j<locus.mexons.Count()) {
455 			uint istart=mexons[i].start;
456 			uint iend=mexons[i].end;
457 			uint jstart=locus.mexons[j].start;
458 			uint jend=locus.mexons[j].end;
459 			if (iend<jstart) { i++; continue; }
460 			if (jend<istart) { j++; continue; }
461 			//if (mexons[i].overlap(jstart, jend)) {
462 			//exon overlap was found :
463 			ovlexons.Add(j);
464 			//extend mexons[i] as needed
465 			if (jstart<istart) mexons[i].start=jstart;
466 			if (jend>iend) { //mexons[i] end extend
467 				mexons[i].end=jend;
468 				//now this could overlap the next mexon(s), so we have to merge them all
469 				while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
470 					uint nextend=mexons[i+1].end;
471 					mexons.Delete(i+1);
472 					if (nextend>mexons[i].end) {
473 						mexons[i].end=nextend;
474 						break; //no need to check next mexons
475 					}
476                 } //while next mexons merge
477             } // mexons[i] end extend
478 			//  } //exon overlap
479 			j++; //check the next locus.mexon
480 		}
481 		//-- add the rest of the non-overlapping mexons:
482 		GSeg seg;
483 		for (int i=0;i<locus.mexons.Count();i++) {
484 			seg.start=locus.mexons[i].start;
485 			seg.end=locus.mexons[i].end;
486 			if (!ovlexons.Exists(i)) mexons.Add(seg);
487 		}
488         // -- merge uexons
489         //add to uexons:
490 		for (int i=0;i<locus.uexons.Count();i++) {
491 			uexons.Add(locus.uexons[i]);
492 		}
493 		for (int i=0;i<locus.introns.Count();i++) {
494 			introns.IAdd(&(locus.introns[i]));
495             }
496 
497 		// -- add locus.mrnas
498 		for (int i=0;i<locus.mrnas.Count();i++) {
499 			((CTData*)(locus.mrnas[i]->uptr))->locus=this;
500 			if (locus.mrnas[i]!=lnkmrna) {
501 				mrnas.Add(locus.mrnas[i]);
502 				if (locus.mrnas[i]->exons.Count()>1) ichains++;
503             }
504 		  }
505 		// -- adjust start/end as needed
506 		if (start>locus.start) start=locus.start;
507 		if (end<locus.end) end=locus.end;
508 		if (mrna_maxcov->covlen<locus.mrna_maxcov->covlen)
509 			mrna_maxcov=locus.mrna_maxcov;
510 		if (mrna_maxscore->gscore<locus.mrna_maxscore->gscore)
511 			mrna_maxscore=locus.mrna_maxscore;
512      }
513 
514 
exonOverlap(GLocus & loc)515 	bool exonOverlap(GLocus& loc) {
516 		//check if any mexons overlap!
517 		int i=0;
518 		int j=0;
519 		while (i<mexons.Count() && j<loc.mexons.Count()) {
520 			uint istart=mexons[i].start;
521 			uint iend=mexons[i].end;
522 			uint jstart=loc.mexons[j].start;
523 			uint jend=loc.mexons[j].end;
524 			if (iend<jstart) { i++; continue; }
525 			if (jend<istart) { j++; continue; }
526 			//exon overlap found if we're here:
527 			return true;
528 		}
529 		return false;
530     }
531 
add_mRNA(GffObj * mrna)532 	bool add_mRNA(GffObj* mrna) {
533 		if (mrnas.Count()>0 && mrna->gseq_id!=gseq_id) return false; //mrna must be on the same genomic seq
534 		//check for exon overlap with existing mexons
535 		//also update uexons and mexons accordingly, if mrna is added
536 		uint mrna_start=mrna->exons.First()->start;
537 		uint mrna_end=mrna->exons.Last()->end;
538 		if (mrna_start>end || start>mrna_end) return false;
539 		bool hasovl=false;
540 		int i=0; //index of first mexons with a merge
541 		int j=0; //index current mrna exon
542 		GArray<int> ovlexons(true,true); //list of mrna exon indexes overlapping mexons
543 		while (i<mexons.Count() && j<mrna->exons.Count()) {
544 			uint istart=mexons[i].start;
545 			uint iend=mexons[i].end;
546 			uint jstart=mrna->exons[j]->start;
547 			uint jend=mrna->exons[j]->end;
548 			if (iend<jstart) { i++; continue; }
549 			if (jend<istart) { j++; continue; }
550 			//exon overlap found if we're here:
551 			ovlexons.Add(j);
552 			hasovl=true;
553 			//extend mexons[i] as needed
554 			if (jstart<istart) mexons[i].start=jstart;
555 			if (jend>iend) { //mexon stretch up
556 				mexons[i].end=jend;
557 				//now this could overlap the next mexon(s), so we have to merge them all
558 				while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
559 					uint nextend=mexons[i+1].end;
560 					mexons.Delete(i+1);
561 					if (nextend>mexons[i].end) {
562 						mexons[i].end=nextend;
563 						break; //no need to check next mexons
564 					}
565 				} //while next mexons merge
566 			} //possible mexons merge
567 
568 			j++; //check the next mrna exon
569 		}//all vs all exon check loop
570 		if (hasovl) {
571 			GSeg seg;
572 	         //add the rest of the non-overlapping exons,
573 			 // and also to uexons etc.
574 			for (int i=0;i<mrna->exons.Count();i++) {
575 				seg.start=mrna->exons[i]->start;
576 				seg.end=mrna->exons[i]->end;
577 				if (!ovlexons.Exists(i)) mexons.Add(seg);
578 				int xterm=0;
579 				if (i==0) xterm|=1;
580 				if (i==mrna->exons.Count()-1) xterm|=2;
581 				GXSeg xseg(seg.start, seg.end, xterm);
582 				uexons.Add(xseg);
583 				GISeg iseg;
584 				if (i>0) {
585 					iseg.start=mrna->exons[i-1]->end+1;
586 					iseg.end=mrna->exons[i]->start-1;
587 					iseg.t=mrna;
588 					introns.Add(iseg);
589 				}
590 			}
591 
592 			mrnas_add(mrna);
593 			// add to mrnas
594 			((CTData*)mrna->uptr)->locus=this;
595 			gseq_id=mrna->gseq_id;
596 			if (mrna->exons.Count()>1) ichains++;
597 		}
598 		return hasovl;
599 	}
600 
601 	//simpler,basic adding of a mrna
mrnas_add(GffObj * mrna)602 	void mrnas_add(GffObj* mrna) {
603 		mrnas.Add(mrna);
604 		// adjust start/end
605 		if (start>mrna->start) start=mrna->start;
606 		if (end<mrna->end) end=mrna->end;
607 		if (mrna_maxcov->covlen<mrna->covlen) mrna_maxcov=mrna;
608 		if (mrna_maxscore->gscore<mrna->gscore) mrna_maxscore=mrna;
609     }
610 };
611 
612 class GSuperLocus;
613 class GTrackLocus;
614 
615 class GSuperLocus : public GSeg {
616 public:
617     int qfidx; //index of query dataset/file for which this superlocus was created
618     GList<GLocus> qloci;
619     GList<GLocus> rloci;
620     GList<GffObj> qmrnas; //list of transcripts (isoforms) for this locus
621     GArray<GSeg> qmexons; //list of merged exons in this region
622     GArray<GXSeg> quexons; //list of unique exons (covered segments) in this region
623     GIArray qintrons; //list of unique introns in this region
624     //same lists for reference:
625     GList<GffObj> rmrnas; //list of ref transcripts (isoforms) for this locus
626     GArray<GSeg> rmexons; //list of ref merged exons in this region
627     GArray<GXSeg> ruexons; //list of ref unique exons (covered segments) in this region
628     GArray<GISeg> rintrons; //list of unique introns in this region
629     // store problematic introns for printing:
630     GIArray i_missed; //missed reference introns (not overlapped by any qry intron)
631     GIArray i_notp;  //wrong ref introns (one or both ends not matching any qry intron)
632     //
633     GIArray i_qwrong; //totally wrong qry introns (not overlapped by any ref intron)
634     GIArray i_qnotp;  //imperfect qry introns (may overlap but has no "perfect" match)
635 
636 
637     int qbases_all;
638     int rbases_all; //in fact, it's all ref bases overlapping any query loci
639     int in_rmrnas; //count of ALL ref mrnas and loci given for this region
640     int in_rloci; //not just those overlapping qry data
641     // this will keep track of total qry loci, mrnas and exons in an area
642     int total_superloci;
643     int total_qloci;
644     int total_qloci_alt; //total qloci with multiple transcripts
645 
646     int total_qmrnas;
647     int total_qexons; //unique exons
648     int total_qmexons;
649     int total_qintrons; //unique introns
650     int total_qichains; //total multi-exon transfrags predicted (incl. duplicates if -G)
651 
652     // NOTE: these ref totals are only limited to data from loci overlapping any qry loci
653     int total_rmexons;
654     int total_rloci;
655     int total_rmrnas;
656     int total_richains; //total multi-exon reference transcripts
657     int total_rexons;
658     int total_rintrons; //unique introns
659 
660     //--- accuracy data after compared to ref loci:
661   int locusQTP;
662   int locusTP; // +1 if ichainTP+mrnaTP > 0
663   //int locusAQTP;
664 	//int locusATP; // 1 if ichainATP + mrnaATP > 0
665 	int locusFP;
666 	//int locusAFP;
667 	//int locusAFN;
668 	int locusFN;
669 	//---transcript level accuracy -- all exon coordinates should match (most stringent)
670 	int mrnaTP; // number of qry mRNAs with perfect match with ref transcripts
671 	//int mrnaATP;
672 	//---intron level accuracy (comparing the ordered set of splice sites):
673 	int ichainTP; // number of fully matched ref intron chains (# correctly predicted ichains)
674 
675 	//int ichainFP; // number of qry intron chains not matching a reference intron chain
676 	//int ichainFN; // number of ref intron chains in this region not being covered by a reference intron chain
677 	/*
678 	// same as above, but Approximate -- allowing a 5bp distance around splice site coordinates
679 	int ichainATP; //as opposed to ichainTP, this also includes ref intron chains which are
680                    //sub-chains of qry intron chains (rare cases)
681      */
682 	//---projected features ---
683 	//---exon level accuracy:
684 	int exonTP;  //number of matched reference exons (true positives)
685 	int exonQTP; //number of query exons matching reference exons
686 	//int exonFP; //number of exons of query with no perfect match with a reference exon
687 	//int exonFN; //number of exons of reference with no perfect match with a query exon
688 	// same as the above but with acceptable approximation (10bp error window):
689 	/*int exonATP;
690 	int exonAFP;
691 	int exonAFN;*/
692 
693 	int intronTP;  //number of perfectly overlapping introns (true positives)
694 	int intronFP; //number of introns of query with no perfect match with a reference intron
695 	int intronFN; //number of introns of reference with no perfect match with a query intron
696 	/*
697 	// same as the above but with acceptable approximation (10bp error window):
698 	int intronATP;
699 	int intronAFP;
700 	int intronAFN;
701 	*/
702 	//-- EGASP added these too:
703 	int m_exons; //number of exons totally missed (not overlapped *at all* by any query exon)
704 	int w_exons; //numer of totally wrong exons (query exons not overlapping *at all* any reference exon)
705 	int m_introns; //number of introns totally missed (not overlapped *at all* by any query intron)
706 	int w_introns; //numer of totally wrong introns (query introns not overlapping *at all* any reference intron)
707 	int m_loci; //missed loci
708 	int w_loci; //novel/wrong loci
709 	//---base level accuracy
710 	int baseTP; //number of overlapping bases
711 	int baseFP; //number of qry bases not overlapping reference
712 	int baseFN; //number of ref bases not overlapping qry
713 	//            sorted,free,unique       sorted,unique
qloci(true,false,false)714     GSuperLocus(uint lstart=0,uint lend=0):qloci(true,false,false),rloci(true,false,false),
715 	qmrnas(true,false,false), qmexons(true,false), quexons(true,false), qintrons(false),
716 	rmrnas(true,false,false), rmexons(true,false), ruexons(true,false), rintrons(false),
717 	i_missed(false),i_notp(false), i_qwrong(false), i_qnotp(false){
718 		qfidx=-1;
719 		start=lstart;
720 		end=lend;
721 		qbases_all=0;
722 		rbases_all=0;
723 		baseTP=0;baseFP=0;baseFN=0;
724 		locusTP=0;locusQTP=0; //locusAQTP=0; locusATP=0;
725 		locusFP=0;// locusAFP=0;locusAFN=0;
726 		locusFN=0;
727 		in_rmrnas=0;
728 		in_rloci=0;
729 		w_loci=0;
730 		m_loci=0;
731 		total_superloci=0;
732 		mrnaTP=0;//mrnaFP=0;mrnaFN=0;
733 		ichainTP=0;//ichainFP=0;ichainFN=0;
734 		exonTP=0;exonQTP=0;
735 		//exonFP=0;exonFN=0;
736 		intronTP=0;intronFP=0;intronFN=0;
737 		/* mrnaATP=0;//mrnaAFP=0;mrnaAFN=0;
738 		ichainATP=0;//ichainAFP=0;ichainAFN=0;
739 		exonATP=0;exonAFP=0;exonAFN=0;
740 		intronATP=0;intronAFP=0;intronAFN=0; */
741 		total_rmexons=0;
742 		total_qmexons=0;
743 		total_qexons=0;total_qloci=0;total_qmrnas=0;
744 		total_qloci_alt=0;
745 		total_qintrons=0;total_qichains=0;
746 		total_rexons=0;total_rloci=0;total_rmrnas=0;
747 		total_rintrons=0;total_richains=0;
748 		w_exons=0;
749 		m_exons=0;
750 		w_introns=0;
751 		m_introns=0;
752 	}
addQlocus(GLocus & loc)753     void addQlocus(GLocus& loc) {
754 		if (start==0 || start>loc.start) start=loc.start;
755 		if (end<loc.end) end=loc.end;
756 		qloci.Add(&loc);
757 		total_qloci++;
758 		if (loc.ichains>0 && loc.mrnas.Count()>1)
759 		    total_qloci_alt++;
760 		qmrnas.Add(loc.mrnas);
761 		total_qmrnas+=loc.mrnas.Count();
762 		total_qichains+=loc.ichains;
763 		qmexons.Add(loc.mexons);
764 		total_qmexons+=loc.mexons.Count();
765 		quexons.Add(loc.uexons);
766 		total_qexons+=loc.uexons.Count();
767 		qintrons.Add(loc.introns);
768 		total_qintrons+=loc.introns.Count();
769 	}
addRlocus(GLocus & loc)770     void addRlocus(GLocus& loc) {
771 		if (start==0 || start>loc.start) start=loc.start;
772 		if (end<loc.end) end=loc.end;
773 		rloci.Add(&loc);
774 		total_rloci++;
775 		rmrnas.Add(loc.mrnas);
776 		total_rmrnas+=loc.mrnas.Count();
777 		total_richains+=loc.ichains;
778 		rmexons.Add(loc.mexons);
779 		total_rmexons+=loc.mexons.Count();
780 		ruexons.Add(loc.uexons);
781 		total_rexons+=loc.uexons.Count();
782 		rintrons.Add(loc.introns);
783 		total_rintrons+=loc.introns.Count();
784 	}
785 
calcF()786     void calcF() {
787 		// base level
788 		baseFP=qbases_all-baseTP;
789 		baseFN=rbases_all-baseTP;
790 		//exon level:
791 		//exonFP=total_qexons-exonTP;
792 		//exonFN=total_rexons-exonTP;
793 		//intron stats
794 		intronFP=total_qintrons-intronTP;
795 		intronFN=total_rintrons-intronTP;
796 		/* intronAFN=total_rintrons-intronATP;
797 		intronAFP=total_qintrons-intronATP;
798 		exonAFP=total_qexons-exonATP;
799 		exonAFN=total_rexons-exonATP; */
800 
801 		// ichain and transcript levels:
802 		//ichainAFP=total_qichains-ichainATP;
803 		//ichainFP=total_qichains-ichainTP;
804 		//ichainAFN=total_richains-ichainATP;
805 		//ichainFN=total_richains-ichainTP;
806 		//mrnaFP=total_qmrnas-mrnaTP;
807 		//mrnaFN=total_rmrnas-mrnaTP;
808 		//mrnaAFP=total_qmrnas-mrnaATP;
809 		//mrnaAFN=total_rmrnas-mrnaATP;
810 		// locus/gene level:
811 		locusFP=total_qloci-locusQTP;
812 		/*locusAFN=total_rloci-locusATP;
813 		locusAFP=total_qloci-locusAQTP;*/
814 		locusFN=total_rloci-locusTP;
815 	}
816 
addStats(GSuperLocus & s)817     void addStats(GSuperLocus& s) {
818 		in_rmrnas+=s.in_rmrnas;
819 		in_rloci+=s.in_rloci;
820 		baseTP+=s.baseTP;
821 		exonTP+=s.exonTP;
822 		exonQTP+=s.exonQTP;
823 		intronTP+=s.intronTP;
824 		ichainTP+=s.ichainTP;
825 		mrnaTP+=s.mrnaTP;
826 		locusTP+=s.locusTP;
827 		locusQTP+=s.locusQTP;
828 		/*
829 		mrnaATP+=s.mrnaATP;
830 		exonATP+=s.exonATP;
831 		intronATP+=s.intronATP;
832 		ichainATP+=s.ichainATP;
833 		locusATP+=s.locusATP;
834 		locusAQTP+=s.locusAQTP;
835 		*/
836 		m_exons+=s.m_exons;
837 		w_exons+=s.w_exons;
838 		m_introns+=s.m_introns;
839 		w_introns+=s.w_introns;
840 		if (s.total_superloci==0 && s.qloci.Count()>0) s.total_superloci=1;
841 		total_superloci+=s.total_superloci;
842 		qbases_all+=s.qbases_all;
843 		rbases_all+=s.rbases_all;
844 		m_loci+=s.m_loci;
845 		w_loci+=s.w_loci;
846 		total_qexons+=s.total_qexons;
847 		total_qintrons+=s.total_qintrons;
848 		total_qmexons+=s.total_qmexons;
849 		total_rexons+=s.total_rexons;
850 		total_rintrons+=s.total_rintrons;
851 		total_rmexons+=s.total_rmexons;
852 		total_qmrnas+=s.total_qmrnas;
853 		total_qichains+=s.total_qichains;
854 		total_rmrnas+=s.total_rmrnas;
855 		total_richains+=s.total_richains;
856 		total_qloci+=s.total_qloci;
857 		total_qloci_alt+=s.total_qloci_alt;
858 		total_rloci+=s.total_rloci;
859     }
860 };
861 
862 class GSeqData {
863 	int gseq_id;
864 public:
865     const char* gseq_name;
866     GList<GffObj> refs_f; //forward strand mRNAs
867     GList<GffObj> refs_r; //reverse strand mRNAs
868 	GList<GffObj> mrnas_f; //forward strand mRNAs
869 	GList<GffObj> mrnas_r; //reverse strand mRNAs
870 	GList<GLocus> loci_f; //forward strand loci
871 	GList<GLocus> loci_r; //reverse strand loci
872 	//--> the fields below are not used by reference data --
873 	GList<GSuperLocus> gstats_f; //stats for forward strand superloci
874 	GList<GSuperLocus> gstats_r; //stats for reverse strand superloci
875 	GList<GLocus> nloci_f; //"novel" loci on forward strand
876 	GList<GLocus> nloci_r; //"novel" loci on reverse strand
877 	GList<GffObj> umrnas; //unknown orientation mrnas
878 	GList<GLocus> nloci_u; //"novel" loci with no orientation found
879 
880 	GList<CTData> tdata; //transcript data (uptr holder for all mrnas here)
881 
get_gseqid()882 	int get_gseqid() { return gseq_id; }
883 
884 	//--<
mrnas_f(true,true,false)885 	GSeqData(int gid=-1):mrnas_f(true,true,false),mrnas_r(true,true,false),
886 	loci_f(true,true,true),loci_r(true,true,true),
887 	gstats_f(true,true,false),gstats_r(true,true,false),
888 	nloci_f(true,false,true), nloci_r(true,false,true),
889 	umrnas(true,true,false), nloci_u(true,true,true), tdata(false,true,false) {
890 		gseq_id=gid;
891 		if (gseq_id>=0)
892 		  gseq_name=GffObj::names->gseqs.getName(gseq_id);
893 	}
894 	bool operator==(GSeqData& d){
895 		return (gseq_id==d.gseq_id);
896 	}
897 	bool operator>(GSeqData& d){
898 		return (gseq_id>d.gseq_id);
899 	}
900 	bool operator<(GSeqData& d){
901 		return (gseq_id<d.gseq_id);
902 	}
903 };
904 
905 
906 // a group of qry loci and a transcript cluster for a single qry dataset
907 class GQCluster : public GList<GffObj> {
908  public:
909    GffObj* mrna_maxcov;  //transcript with maximum coverage (for largest transcript)
910    GffObj* mrna_maxscore; //transcript with maximum gscore ( = major isoform for Cufflinks)
911    uint start;
912    uint end;
913    GList<GLocus> qloci;
914    //GCluster cl; //just a more compact way of keeping all transcripts in these loci
915    GQCluster(GList<GLocus>* loci=NULL):GList<GffObj>(true,false,false),
916                                     qloci(true,false,false) {
917      mrna_maxcov=NULL;
918      mrna_maxscore=NULL;
919      start=0;
920      end=0;
921      if (loci!=NULL)  {
922           qloci.Add(*loci);
923           for (int i=0;i<loci->Count();i++) {
924              addLocus(loci->Get(i),false);
925              }
926           }
927       }
928    void addLocus(GLocus* loc, bool toLoci=true) {
929      //check so we don't add locus duplicates
930      if (toLoci) {
931         for (int i=0;i<qloci.Count();i++) {
932            if (loc==qloci[i]) return;
933            }
934         qloci.Add(loc);
935         }
936      for (int m=0;m<loc->mrnas.Count();m++) {
937         GffObj* mrna=loc->mrnas[m];
938         Add(mrna);
939         if (start==0 || start>mrna->start) start=mrna->start;
940         if (end<mrna->end) end=mrna->end;
941         if (mrna_maxcov==NULL || mrna_maxcov->covlen<mrna->covlen) mrna_maxcov=mrna;
942         if (mrna_maxscore==NULL || mrna_maxscore->gscore<mrna->gscore) mrna_maxscore=mrna;
943         }
944      }
945 };
946 
947 //track a set of clustered qloci across multiple qry datasets
948 // the qloci in qcls[] overlap but not necessarily at exon level
949 // (so there can be multiple genes here in fact)
950 class GTrackLocus:public GSeg {
951   public:
952     char strand;
953     bool hasQloci;
954     //GLocus* rloc; //corresponding reference locus, if available
955     GList<GLocus> rloci; //ref loci found overlapping this region
956     GQCluster* qcls[MAX_QFILES]; //all qloci for this superlocus, grouped by dataset
957     GTrackLocus(GLocus* qloc=NULL, int q=-1):GSeg(0,0),rloci(true,false,true) {
958       strand='.';
959       for (int i=0;i<MAX_QFILES;i++) qcls[i]=NULL;
960       if (qloc!=NULL) addQLocus(qloc,q);
961       }
962 
addRLocus(GLocus * rl)963     void addRLocus(GLocus* rl) {
964       if (rl==NULL) return;
965       if (rl->qfidx>=0)
966           GError("Error: GTrackLocus::addRLocus called with a query locus (set# %d)\n",
967                         rl->qfidx+1);
968       if (strand=='.') strand=rl->mrna_maxcov->strand;
969       if (start==0 || start>rl->start) start=rl->start;
970       if (end==0 || end<rl->end) end=rl->end;
971       rl->t_ptr=this;
972       rloci.Add(rl);
973       }
974 
975     void addQLocus(GLocus* loc, int q=-1) { //adding qry locus
976       if (loc==NULL) return;
977       if (strand=='.' && loc->mrna_maxcov->strand!='.')
978            strand=loc->mrna_maxcov->strand;
979       if (loc->qfidx<0 && q<0)
980          GError("Error at GTrackLocus::addQLocus(): locus.qfidx not set and index not given!\n");
981       if (q>=0) loc->qfidx=q;
982            else q=loc->qfidx;
983       if (start==0 || start>loc->start) start=loc->start;
984       if (end==0 || end<loc->end) end=loc->end;
985       if (qcls[q]==NULL) qcls[q]=new GQCluster();
986       hasQloci=true;
987       loc->t_ptr = this;
988       qcls[q]->addLocus(loc);
989       }
990 
add_Locus(GLocus * loc)991     bool add_Locus(GLocus* loc) {
992       if (start==0 || overlap(*loc)) { //simple range overlap, not exon overlap
993          if (loc->qfidx<0) addRLocus(loc);
994                       else addQLocus(loc);
995          return true;
996          }
997       return false;
998       }
999 
1000 
addQCl(int q,GQCluster * qcl,GLocus * lnkloc)1001    void addQCl(int q, GQCluster* qcl, GLocus* lnkloc) {
1002       for (int i=0;i<qcl->qloci.Count();i++) {
1003          GLocus* loc=qcl->qloci[i];
1004          if (loc==lnkloc) continue; // or if loc->t_ptr==this ?
1005          hasQloci=true;
1006          loc->t_ptr=this;
1007          qcls[q]->addLocus(loc);
1008          }
1009      }
1010 
addMerge(GTrackLocus * loctrack,int qcount,GLocus * lnkloc)1011    void addMerge(GTrackLocus* loctrack, int qcount, GLocus* lnkloc) {
1012       if (loctrack==NULL) return;
1013       //merge qloci
1014       for (int q=0; q < qcount; q++) {
1015           if (qcls[q]==NULL) {
1016              if (loctrack->qcls[q]!=NULL) {
1017                  qcls[q]=loctrack->qcls[q];
1018                  loctrack->qcls[q]=NULL; //just move pointer here
1019                  //set all t_ptr pointers for moved loci
1020                  for (int ql = 0; ql < qcls[q]->qloci.Count(); ql++) {
1021                     qcls[q]->qloci[ql]->t_ptr=this;
1022                     }
1023                  hasQloci=true;
1024                  }
1025              }
1026            else //existing qloci at q
1027              if (loctrack->qcls[q]!=NULL) { //merge elements
1028                 addQCl(q, loctrack->qcls[q], lnkloc);
1029               }
1030           }//for each qset
1031       //merge rloci, if any
1032       if (loctrack->rloci.Count()>0) {
1033          for (int l=0;l<loctrack->rloci.Count();l++) {
1034            if (loctrack->rloci[l]!=lnkloc && loctrack->rloci[l]->t_ptr!=this) {
1035               rloci.Add(loctrack->rloci[l]);
1036               loctrack->rloci[l]->t_ptr=this;
1037               }
1038            }
1039          }
1040       if (loctrack->start<start) start=loctrack->start;
1041       if (loctrack->end>end) end=loctrack->end;
1042       if (strand=='.' && loctrack->strand!='.') strand=loctrack->strand;
1043       }
1044 
1045     /*
1046     void add_QLoci(GList<GLocus>* loci, int q, GLocus& r) {
1047       // only add loci overlapping given refloc
1048       //rloc=&r;
1049       if (loci==NULL) return;
1050       for (int i=0;i<loci->Count();i++) {
1051          GLocus* loc=loci->Get(i);
1052          // if (!loc->exonOverlap(r)) continue;  //do we really needed exon overlap?
1053          if (!loc->overlap(r)) continue;
1054          if (start==0 || start>loc->start) start=loc->start;
1055          if (end==0 || end<loc->end) end=loc->end;
1056          loc->t_ptr=this;
1057          loc->qfidx=q;
1058          if (qcls[q]==NULL) qcls[q]=new GQCluster();
1059          qcls[q]->addLocus(loc);
1060          }
1061       strand=r.mrnas[0]->strand;
1062       }
1063      */
~GTrackLocus()1064     ~GTrackLocus() {
1065       for (int q=0;q<MAX_QFILES;q++)
1066            if (qcls[q]!=NULL) { delete qcls[q]; qcls[q]=NULL; }
1067       }
1068 
1069     GQCluster* operator[](int q) {
1070       if (q<0 || q>=MAX_QFILES)
1071           GError("Error: qfidx index out of bounds (%d) for GTrackLocus!\n",q);
1072       return qcls[q];
1073       }
1074 };
1075 
1076 class GXConsensus:public GSeg {
1077  public:
1078    static int count;
1079    int id; //XConsensus ID
1080    int tss_id; //group id for those xconsensi with shared first exon
1081    int p_id; //group id for those xconsensi with "similar" protein
1082    GffObj* tcons; //longest transcript to represent the combined "consensus" structure
1083    GffObj* ref; //overlapping reference transcript
1084    char refcode; // the code for ref relationship (like in the tracking file)
1085    char* aa;
1086    int aalen;
1087    GXConsensus* contained; //if contained into another GXConsensus
1088    //list of ichain-matching query (cufflinks) transcripts that contributed to this consensus
1089    GList<GffObj> qchain;
1090    GXConsensus(GffObj* c, CEqList* qlst, GffObj* r=NULL, char rcode=0)
qchain(false,false,false)1091                    :qchain(false,false,false) {
1092       ref=r;
1093       refcode=rcode;
1094       tcons=c;
1095       if (qlst!=NULL) qchain.Add(*((GList<GffObj>*)qlst));
1096                  else qchain.Add(c);
1097       count++;
1098       tss_id=0;
1099       p_id=0;
1100       aalen=0;
1101       id=count;
1102       aa=NULL;
1103       start=tcons->start;
1104       end=tcons->end;
1105       contained=NULL;
1106       }
~GXConsensus()1107    ~GXConsensus() {
1108      if (aa!=NULL) GFREE(aa);
1109      }
1110 };
1111 
1112 class GXLocus:public GSeg {
1113  public:
1114     int id;
1115     int num_mtcons; //number of multi-exon "consensus" transcripts in this locus
1116     char strand;
1117     GList<GLocus> rloci; //list of ref loci overlapping any of the mexons
1118     GList<GLocus> qloci; //loci from all qry datasets that have overlapping exons with this region
1119     GArray<GSeg> mexons; //list of merged exonic regions for this locus
1120     GList<GXConsensus> tcons;
1121     GXLocus(GLocus* lfirst=NULL):GSeg(0,0),
1122         rloci((GCompareProc*)cmpByPtr, (GFreeProc*)NULL, true),
1123         qloci((GCompareProc*)cmpByPtr, (GFreeProc*)NULL, true),
1124         mexons(true,true), tcons(true,true,false) {
1125       strand='.';
1126       num_mtcons=0;
1127       if (lfirst!=NULL) {
1128          add_Locus(lfirst);
1129          }
1130       id=0;
1131       }
1132 
add_Locus(GLocus * loc)1133     bool add_Locus(GLocus* loc) {
1134       if (mexons.Count()>0 && (end<loc->start || start > loc->end))
1135               return false; //no chance for overlapping exons
1136       if (mexons.Count()==0) {
1137           mexons.Add(loc->mexons);
1138           start=loc->start;
1139           end=loc->end;
1140           if (loc->qfidx<0) rloci.Add(loc);
1141                       else  qloci.Add(loc);
1142           strand=loc->mrna_maxcov->strand;
1143           loc->xlocus=this;
1144           return true;
1145           }
1146       int f=0;
1147       if (loc->qfidx<0) {
1148         if (rloci.Found(loc,f)) return false;
1149         }
1150       else if (qloci.Found(loc,f)) return false;
1151 
1152       // -- merge mexons
1153       GArray<int> ovlexons(true,true); //list of locus.mexons indexes overlapping existing mexons
1154       int i=0; //index of first mexons with a merge
1155       int j=0; //index current mrna exon
1156       while (i<mexons.Count() && j<loc->mexons.Count()) {
1157           uint istart=mexons[i].start;
1158           uint iend=mexons[i].end;
1159           uint jstart=loc->mexons[j].start;
1160           uint jend=loc->mexons[j].end;
1161           if (iend<jstart) { i++; continue; }
1162           if (jend<istart) { j++; continue; }
1163           //if (mexons[i].overlap(jstart, jend)) {
1164           //exon overlap was found :
1165           ovlexons.Add(j);
1166           //extend mexons[i] as needed
1167           if (jstart<istart) mexons[i].start=jstart;
1168           if (jend>iend) { //mexons[i] end extend
1169               mexons[i].end=jend;
1170               //now this could overlap the next mexon(s), so we have to merge them all
1171               while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
1172                   uint nextend=mexons[i+1].end;
1173                   mexons.Delete(i+1);
1174                   if (nextend>mexons[i].end) {
1175                       mexons[i].end=nextend;
1176                       break; //no need to check next mexons
1177                   }
1178               } //while next mexons merge
1179           } // mexons[i] end extend
1180           //  } //exon overlap
1181           j++; //check the next locus.mexon
1182         }//while mexons
1183       if (ovlexons.Count()==0) return false;
1184       if (strand=='.' && loc->mrna_maxcov->strand!='.')
1185              strand=loc->mrna_maxcov->strand;
1186       //have exon overlap:
1187       //-- add the rest of the non-overlapping mexons:
1188        GSeg seg;
1189        for (int i=0;i<loc->mexons.Count();i++) {
1190             seg.start=loc->mexons[i].start;
1191             seg.end=loc->mexons[i].end;
1192             if (!ovlexons.Exists(i)) mexons.Add(seg);
1193             }
1194       // -- adjust start/end as needed
1195       if (start>loc->start) start=loc->start;
1196       if (end<loc->end) end=loc->end;
1197       loc->xlocus=this;
1198       if (loc->qfidx<0) rloci.Add(loc);
1199                   else  qloci.Add(loc);
1200       return true;
1201       }
1202 
addMerge(GXLocus & oxloc)1203   void addMerge(GXLocus& oxloc) {
1204     GArray<int> ovlexons(true,true); //list of oxloc.mexons indexes overlapping existing mexons
1205     int i=0; //index of first mexons with a merge
1206     int j=0; //index current mrna exon
1207     while (i<mexons.Count() && j<oxloc.mexons.Count()) {
1208         uint istart=mexons[i].start;
1209         uint iend=mexons[i].end;
1210         uint jstart=oxloc.mexons[j].start;
1211         uint jend=oxloc.mexons[j].end;
1212         if (iend<jstart) { i++; continue; }
1213         if (jend<istart) { j++; continue; }
1214         //if (mexons[i].overlap(jstart, jend)) {
1215         //exon overlap was found :
1216         ovlexons.Add(j);
1217         //extend mexons[i] as needed
1218         if (jstart<istart) mexons[i].start=jstart;
1219         if (jend>iend) { //mexons[i] end extend
1220             mexons[i].end=jend;
1221             //now this could overlap the next mexon(s), so we have to merge them all
1222             while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
1223                 uint nextend=mexons[i+1].end;
1224                 mexons.Delete(i+1);
1225                 if (nextend>mexons[i].end) {
1226                     mexons[i].end=nextend;
1227                     break; //no need to check next mexons
1228                 }
1229             } //while next mexons merge
1230         } // mexons[i] end extend
1231         //  } //exon overlap
1232         j++; //check the next oxloc.mexon
1233     }
1234     if (ovlexons.Count()==0) {
1235       GError("Error: attempt to merge GXLoci with non-overlapping exons!\n");
1236       }
1237     //-- add the rest of the non-overlapping mexons:
1238     GSeg seg;
1239     for (int i=0;i<oxloc.mexons.Count();i++) {
1240         seg.start=oxloc.mexons[i].start;
1241         seg.end=oxloc.mexons[i].end;
1242         if (!ovlexons.Exists(i)) mexons.Add(seg);
1243         }
1244    if (start>oxloc.start) start=oxloc.start;
1245    if (end<oxloc.end) end=oxloc.end;
1246    if (strand=='.') strand=oxloc.strand;
1247     //-- steal all qloci and rloci
1248     for (int i=0;i<oxloc.qloci.Count();i++) {
1249          if (oxloc.qloci[i]->xlocus==this) continue;
1250          qloci.Add(oxloc.qloci[i]);
1251          oxloc.qloci[i]->xlocus=this;
1252          }
1253     for (int i=0;i<oxloc.rloci.Count();i++) {
1254          if (oxloc.rloci[i]->xlocus==this) continue;
1255          rloci.Add(oxloc.rloci[i]);
1256          oxloc.rloci[i]->xlocus=this;
1257          }
1258   } //::addMerge()
1259 
1260 
checkContainment()1261  void checkContainment() {
1262    //checking containment
1263   for (int j=0;j<tcons.Count()-1;j++) {
1264     GXConsensus* t=tcons[j];
1265     for (int i=j+1;i<tcons.Count();i++) {
1266        if (tcons[i]->contained!=NULL && t->tcons->exons.Count()>1) continue; //will check the container later anyway
1267        int c_status=checkXConsContain(t->tcons, tcons[i]->tcons);
1268        if (c_status==0) continue; //no containment relationship between t and tcons[i]
1269        if (c_status>0) { //t is a container for tcons[i]
1270             tcons[i]->contained=t;
1271             }
1272          else { //contained into exising XCons
1273             t->contained=tcons[i];
1274             break;
1275             }
1276        }
1277    }
1278   }
1279 
checkXConsContain(GffObj * a,GffObj * b)1280  int checkXConsContain(GffObj* a, GffObj* b) {
1281   // returns  1 if a is the container of b
1282   //         -1 if a is contained in b
1283   //          0 if no
1284   if (a->end<b->start || b->end<a->start) return 0;
1285   if (a->exons.Count()==b->exons.Count()) {
1286      if (a->exons.Count()>1) return 0; //same number of exons - no containment possible
1287                                        //because equivalence was already tested
1288            else { //single exon containment testing
1289              //this is fuzzy and messy (end result may vary depending on the testing order)
1290              int ovlen=a->exons[0]->overlapLen(b->exons[0]);
1291              int minlen=GMIN(a->covlen, b->covlen);
1292              if (ovlen>=minlen*0.8) { //if at least 80% of the shorter one is covered, it is contained
1293                 return ((a->covlen>b->covlen) ? 1 : -1);
1294                 }
1295               else return 0;
1296              //if (a->start<=b->start+10 && a->end+10>=b->end) return 1;
1297              //  else { if (b->start<=a->start+10 && b->end+10>=a->end) return -1;
1298              //          else return 0;
1299              //}
1300              }
1301      }
1302    //different number of exons:
1303    if (a->exons.Count()>b->exons.Count()) return t_contains(*a, *b) ? 1:0;
1304                      else return t_contains(*b, *a) ? -1 : 0;
1305   }
1306 
addXCons(GXConsensus * t)1307  void addXCons(GXConsensus* t) {
1308   tcons.Add(t);
1309   }
1310 
1311 }; //GXLocus
1312 
1313 
1314 
1315 int parse_mRNAs(GfList& mrnas,
1316 				 GList<GSeqData>& glstdata,
1317 				 bool is_ref_set=true,
1318 				 int check_for_dups=0,
1319 				 int qfidx=-1, bool only_multiexon=false);
1320 
1321 //reading a mRNAs from a gff file and grouping them into loci
1322 void read_mRNAs(FILE* f, GList<GSeqData>& seqdata, GList<GSeqData>* ref_data=NULL,
1323               int check_for_dups=0, int qfidx=-1, const char* fname=NULL,
1324               bool only_multiexon=false);
1325 
1326 void read_transcripts(FILE* f, GList<GSeqData>& seqdata,
1327 #ifdef CUFFLINKS
1328   boost::crc_32_type& crc_result,
1329 #endif
1330   bool keepAttrs=true);
1331 
1332 void sort_GSeqs_byName(GList<GSeqData>& seqdata);
1333 
1334 bool singleExonTMatch(GffObj& m, GffObj& r, int& ovlen);
1335 
1336 //strict intron chain match, or single-exon match
1337 bool tMatch(GffObj& a, GffObj& b, int& ovlen, bool fuzzunspl=false,
1338            bool contain_only=false);
1339 
1340 //use qsearch to "position" a given coordinate x within a list of transcripts sorted
1341 //by their start (lowest) coordinate; the returned int is the list index of the
1342 //closest GffObj starting just *ABOVE* coordinate x
1343 //Convention: returns -1 if there is no such GffObj (i.e. last GffObj start <= x)
1344 int qsearch_mrnas(uint x, GList<GffObj>& mrnas);
1345 int qsearch_loci(uint x, GList<GLocus>& segs); // same as above, but for GSeg lists
1346 
1347 GSeqData* getRefData(int gid, GList<GSeqData>& ref_data); //returns reference GSeqData for a specific genomic sequence
1348 
1349 #endif
1350