1 #ifndef GTF_TRACKING_H 2 #define GTF_TRACKING_H 3 /* 4 * gtf_tracking.h 5 * 6 */ 7 8 #ifdef HEAPROFILE 9 #include "gperftools/heap-profiler.h" 10 #endif 11 12 #include "gff.h" 13 #include "GFaSeqGet.h" 14 #include "GFastaIndex.h" 15 #include "GStr.h" 16 17 #define MAX_QFILES 500 18 19 extern bool gtf_tracking_verbose; 20 21 extern bool gtf_tracking_largeScale; 22 //many input files, no accuracy stats are generated, no *.tmap 23 // and exon attributes are discarded 24 25 int cmpByPtr(const pointer p1, const pointer p2); 26 27 bool t_contains(GffObj& a, GffObj& b); 28 //returns true only IF b has fewer exons than a AND a "contains" b 29 30 char* getGSeqName(int gseq_id); 31 32 //genomic fasta sequence handling 33 class GFastaHandler { 34 public: 35 char* fastaPath; 36 GFastaIndex* faIdx; getFastaFile(int gseq_id)37 char* getFastaFile(int gseq_id) { 38 if (fastaPath==NULL) return NULL; 39 GStr s(fastaPath); 40 s.trimR('/'); 41 s.appendfmt("/%s",getGSeqName(gseq_id)); 42 GStr sbase(s); 43 if (!fileExists(s.chars())) s.append(".fa"); 44 if (!fileExists(s.chars())) s.append("sta"); 45 if (fileExists(s.chars())) return Gstrdup(s.chars()); 46 else { 47 GMessage("Warning: cannot find genomic sequence file %s{.fa,.fasta}\n",sbase.chars()); 48 return NULL; 49 } 50 } 51 52 GFastaHandler(const char* fpath=NULL) { 53 fastaPath=NULL; 54 faIdx=NULL; 55 if (fpath!=NULL && fpath[0]!=0) init(fpath); 56 } 57 init(const char * fpath)58 void init(const char* fpath) { 59 if (fpath==NULL || fpath[0]==0) return; 60 if (!fileExists(fpath)) 61 GError("Error: file/directory %s does not exist!\n",fpath); 62 fastaPath=Gstrdup(fpath); 63 if (fastaPath!=NULL) { 64 if (fileExists(fastaPath)>1) { //exists and it's not a directory 65 GStr fainame(fastaPath); 66 //the .fai name might have been given directly 67 if (fainame.rindex(".fai")==fainame.length()-4) { 68 //.fai index file given directly 69 fastaPath[fainame.length()-4]=0; 70 if (!fileExists(fastaPath)) 71 GError("Error: cannot find fasta file for index %s !\n", fastaPath); 72 } 73 else fainame.append(".fai"); 74 //fainame.append(".fai"); 75 faIdx=new GFastaIndex(fastaPath,fainame.chars()); 76 GStr fainamecwd(fainame); 77 int ip=-1; 78 if ((ip=fainamecwd.rindex('/'))>=0) 79 fainamecwd.cut(0,ip+1); 80 if (!faIdx->hasIndex()) { //could not load index 81 //try current directory 82 if (fainame!=fainamecwd) { 83 if (fileExists(fainamecwd.chars())>1) { 84 faIdx->loadIndex(fainamecwd.chars()); 85 } 86 } 87 } //tried to load index 88 if (!faIdx->hasIndex()) { 89 GMessage("No fasta index found for %s. Rebuilding, please wait..\n",fastaPath); 90 faIdx->buildIndex(); 91 if (faIdx->getCount()==0) GError("Error: no fasta records found!\n"); 92 GMessage("Fasta index rebuilt.\n"); 93 FILE* fcreate=fopen(fainame.chars(), "w"); 94 if (fcreate==NULL) { 95 GMessage("Warning: cannot create fasta index %s! (permissions?)\n", fainame.chars()); 96 if (fainame!=fainamecwd) fcreate=fopen(fainamecwd.chars(), "w"); 97 if (fcreate==NULL) 98 GError("Error: cannot create fasta index %s!\n", fainamecwd.chars()); 99 } 100 if (faIdx->storeIndex(fcreate)<faIdx->getCount()) 101 GMessage("Warning: error writing the index file!\n"); 102 } //index created and attempted to store it 103 } //multi-fasta 104 } //genomic sequence given 105 } 106 GFaSeqGet* fetch(int gseq_id, bool checkFasta=false) { 107 if (fastaPath==NULL) return NULL; 108 //genomic sequence given 109 GFaSeqGet* faseq=NULL; 110 if (faIdx!=NULL) { //fastaPath was the multi-fasta file name 111 char* gseqname=getGSeqName(gseq_id); 112 GFastaRec* farec=faIdx->getRecord(gseqname); 113 if (farec!=NULL) { 114 faseq=new GFaSeqGet(fastaPath,farec->seqlen, farec->fpos, 115 farec->line_len, farec->line_blen); 116 faseq->loadall(); //just cache the whole sequence, it's faster 117 } 118 else { 119 GMessage("Warning: couldn't find fasta record for '%s'!\n",gseqname); 120 return NULL; 121 } 122 } 123 else //if (fileExists(fastaPath)==1) 124 { 125 char* sfile=getFastaFile(gseq_id); 126 if (sfile!=NULL) { 127 //if (gtf_tracking_verbose) 128 // GMessage("Processing sequence from fasta file '%s'\n",sfile); 129 faseq=new GFaSeqGet(sfile,checkFasta); 130 faseq->loadall(); 131 GFREE(sfile); 132 } 133 } //one fasta file per contig 134 return faseq; 135 } 136 ~GFastaHandler()137 ~GFastaHandler() { 138 GFREE(fastaPath); 139 delete faIdx; 140 } 141 }; 142 143 144 145 bool betterRef(GffObj* a, GffObj* b); //for better CovLink reference ranking 146 147 class GLocus; 148 149 class COvLink { 150 public: coderank(char c)151 static int coderank(char c) { 152 switch (c) { 153 case '=': return 0; //ichain match 154 case 'c': return 2; //containment (ichain fragment) 155 case 'j': return 4; // overlap with at least a junction match 156 case 'e': return 6; // single exon transfrag overlapping an intron of reference (possible pre-mRNA) 157 case 'o': return 8; // generic exon overlap 158 case 's': return 16; //"shadow" - an intron overlaps with a ref intron on the opposite strand 159 case 'x': return 18; // exon overlap on opposite strand (usually wrong strand mapping) 160 case 'i': return 20; // intra-intron 161 case 'p': return 90; //polymerase run 162 case 'r': return 92; //repeats 163 case 'u': return 94; //intergenic 164 case 0 : return 100; 165 default: return 96; 166 } 167 } 168 char code; 169 int rank; 170 GffObj* mrna; 171 int ovlen; 172 COvLink(char c=0,GffObj* m=NULL, int ovl=0) { 173 code=c; 174 mrna=m; 175 ovlen=ovl; 176 rank=coderank(c); 177 } 178 bool operator<(COvLink& b) { 179 if (rank==b.rank) 180 return (ovlen==b.ovlen)? betterRef(mrna, b.mrna) : (ovlen>b.ovlen); 181 else return rank<b.rank; 182 } 183 bool operator==(COvLink& b) { 184 return (rank==b.rank && mrna==b.mrna); 185 } 186 }; 187 188 class GISeg: public GSeg { 189 public: 190 GffObj* t; //pointer to the largest transcript with a segment this exact exon coordinates GSeg(s,e)191 GISeg(uint s=0,uint e=0, GffObj* ot=NULL):GSeg(s,e) { t=ot; } 192 }; 193 194 class GIArray:public GArray<GISeg> { 195 public: 196 GIArray(bool uniq=true):GArray<GISeg>(true,uniq) { } IAdd(GISeg * item)197 int IAdd(GISeg* item) { 198 if (item==NULL) return -1; 199 int result=-1; 200 if (Found(*item, result)) { 201 if (fUnique) { 202 //cannot add a duplicate, return index of existing item 203 if (item->t!=NULL && fArray[result].t!=NULL && 204 item->t->covlen>fArray[result].t->covlen) 205 fArray[result].t=item->t; 206 return result; 207 } 208 } 209 //Found sets result to the position where the item should be 210 idxInsert(result, *item); 211 return result; 212 } 213 214 }; 215 216 class CEqList: public GList<GffObj> { 217 public: 218 GffObj* head; CEqList()219 CEqList():GList<GffObj>((GCompareProc*)cmpByPtr, (GFreeProc*)NULL, true) { 220 head=NULL; 221 } 222 }; 223 224 class CTData { //transcript associated data 225 public: 226 GffObj* mrna; //owner transcript 227 GLocus* locus; 228 GList<COvLink> ovls; //overlaps with other transcripts (ref vs query) 229 //-- just for ichain match tracking: 230 GffObj* eqref; //ref transcript having an ichain match 231 int qset; //qry set index (qfidx), -1 means reference dataset 232 //GffObj* eqnext; //next GffObj in the linked list of matching transfrags 233 bool eqhead; 234 CEqList* eqlist; //keep track of matching transfrags 235 //int eqdata; // flags for EQ list (is it a list head?) 236 // Cufflinks specific data: 237 double FPKM; 238 double conf_hi; 239 double conf_lo; 240 double cov; 241 char classcode; //the best/final classcode ovls(true,true,true)242 CTData(GffObj* m=NULL, GLocus* l=NULL):ovls(true,true,true) { 243 mrna=m; 244 if (mrna!=NULL) mrna->uptr=this; 245 locus=l; 246 classcode=0; 247 eqref=NULL; 248 eqhead=false; 249 eqlist=NULL; 250 qset=-2; 251 FPKM=0; 252 conf_lo=0; 253 conf_hi=0; 254 cov=0; 255 } 256 ~CTData()257 ~CTData() { 258 ovls.Clear(); 259 //if ((eqdata & EQHEAD_TAG)!=0) delete eqlist; 260 //if (isEqHead()) delete eqlist; 261 if (eqhead) delete eqlist; 262 } 263 264 //inline bool eqHead() { return ((eqdata & EQHEAD_TAG)!=0); } 265 /* bool isEqHead() { 266 if (eqlist==NULL) return false; 267 return (eqlist->head==this->mrna); 268 } 269 */ joinEqList(GffObj * m)270 void joinEqList(GffObj* m) { //add list from m 271 //list head is set to the transfrag with the lower qset# 272 CTData* md=(CTData*)(m->uptr); 273 //ASSERT(md); 274 if (eqlist==NULL) { //no eqlist yet for this node 275 if (md->eqlist!=NULL) { //m in an eqlist already 276 eqlist=md->eqlist; 277 eqlist->Add(this->mrna); 278 CTData* md_head_d=(CTData*)(md->eqlist->head->uptr); 279 if (this->qset < md_head_d->qset) { 280 eqlist->head=this->mrna; 281 eqhead=true; 282 md_head_d->eqhead=false; 283 } 284 } 285 else { //m was not in an EQ list either 286 eqlist=new CEqList(); 287 eqlist->Add(this->mrna); 288 eqlist->Add(m); 289 md->eqlist=eqlist; 290 if (qset<md->qset) { 291 eqlist->head=this->mrna; 292 eqhead=true; 293 } 294 else { 295 eqlist->head=m; 296 md->eqhead=true; 297 } 298 } 299 }//no eqlist before 300 else { //merge two eqlists 301 if (eqlist==md->eqlist) //already in the same eqlist, nothing to do 302 return; 303 if (md->eqlist!=NULL) { 304 //copy the smaller list into the larger one 305 CEqList* srclst, *destlst; 306 if (md->eqlist->Count()<eqlist->Count()) { 307 srclst=md->eqlist; 308 destlst=eqlist; 309 } 310 else { 311 srclst=eqlist; 312 destlst=md->eqlist; 313 } 314 for (int i=0;i<srclst->Count();i++) { 315 destlst->Add(srclst->Get(i)); 316 CTData* od=(CTData*)((*srclst)[i]->uptr); 317 od->eqlist=destlst; 318 } 319 this->eqlist=destlst; 320 CTData* s_head_d=(CTData*)(srclst->head->uptr); 321 CTData* d_head_d=(CTData*)(destlst->head->uptr); 322 if (s_head_d->qset < d_head_d->qset ) { 323 this->eqlist->head=srclst->head; 324 s_head_d->eqhead=true; 325 d_head_d->eqhead=false; 326 } 327 else { 328 s_head_d->eqhead=false; 329 d_head_d->eqhead=true; 330 } 331 delete srclst; 332 } 333 else { //md->eqlist==NULL 334 eqlist->Add(m); 335 md->eqlist=eqlist; 336 CTData* head_d=(CTData*)(eqlist->head->uptr); 337 if (md->qset<head_d->qset) { 338 eqlist->head=m; 339 md->eqhead=true; 340 } 341 } 342 } 343 } 344 345 void addOvl(char code,GffObj* target=NULL, int ovlen=0) { 346 ovls.AddIfNew(new COvLink(code, target, ovlen)); 347 } getBestCode()348 char getBestCode() { 349 return (ovls.Count()>0) ? ovls[0]->code : 0 ; 350 } 351 bool operator<(CTData& b) { return (mrna < b.mrna); } 352 bool operator==(CTData& b) { return (mrna==b.mrna); } 353 }; 354 355 class GSuperLocus; 356 class GTrackLocus; 357 class GXLocus; 358 359 class GXSeg : public GSeg { 360 public: 361 int flags; GSeg(s,e)362 GXSeg(uint s=0, uint e=0, int f=0):GSeg(s,e),flags(f) { } 363 }; 364 365 //Data structure holding a query locus data (overlapping mRNAs on the same strand) 366 // and also the accuracy data of all mRNAs of a query locus 367 // (against all reference loci overlapping the same region) 368 class GLocus:public GSeg { 369 public: 370 int gseq_id; //id of underlying genomic sequence 371 int qfidx; // for locus tracking 372 GTrackLocus* t_ptr; //for locus tracking cluster 373 GffObj* mrna_maxcov; //transcript with maximum coverage (for main "ref" transcript) 374 GffObj* mrna_maxscore; //transcript with maximum gscore (for major isoform) 375 GList<GffObj> mrnas; //list of transcripts (isoforms) for this locus 376 GArray<GXSeg> uexons; //list of unique exons (covered segments) in this region 377 GArray<GSeg> mexons; //list of merged exons in this region 378 GIArray introns; 379 GList<GLocus> cmpovl; //temp list of overlapping qry/ref loci to compare to (while forming superloci) 380 381 //only for reference loci --> keep track of all superloci found for each qry dataset 382 // which contain this reference locus 383 GList<GSuperLocus>* superlst; 384 GXLocus* xlocus; //superlocus formed by exon overlaps across all qry datasets 385 // -- if genomic sequence was given: 386 int spl_major; // number of GT-AG splice site consensi 387 int spl_rare; // number of GC-AG, AT-AC and other rare splice site consensi 388 int spl_wrong; //number of "wrong" (unrecognized) splice site consensi 389 int ichains; //number of multi-exon mrnas 390 int ichainTP; //number of intron chains fully matching reference introns 391 //int ichainATP; 392 int mrnaTP; 393 //int mrnaATP; 394 int v; //user flag/data mrnas(true,false,false)395 GLocus(GffObj* mrna=NULL, int qidx=-1):mrnas(true,false,false),uexons(true,true),mexons(true,true), 396 introns(), cmpovl(true,false,true) { 397 //this will NOT free mrnas! 398 ichains=0; 399 gseq_id=-1; 400 qfidx=qidx; 401 t_ptr=NULL; 402 creset(); 403 xlocus=NULL; 404 mrna_maxcov=NULL; 405 mrna_maxscore=NULL; 406 superlst=new GList<GSuperLocus>(true,false,false); 407 if (mrna!=NULL) { 408 start=mrna->exons.First()->start; 409 end=mrna->exons.Last()->end;; 410 gseq_id=mrna->gseq_id; 411 GISeg seg; 412 for (int i=0;i<mrna->exons.Count();i++) { 413 seg.start=mrna->exons[i]->start; 414 seg.end=mrna->exons[i]->end; 415 int flags=0; //terminal exon flags (1=left end, 2=right end) 416 if (i==0) flags|=1; 417 if (i==mrna->exons.Count()-1) flags|=2; 418 GXSeg xseg(seg.start, seg.end, flags); 419 uexons.Add(xseg); 420 mexons.Add(seg); 421 if (i>0) { 422 seg.start=mrna->exons[i-1]->end+1; 423 seg.end=mrna->exons[i]->start-1; 424 seg.t=mrna; 425 introns.Add(seg); 426 } 427 } 428 mrnas.Add(mrna); 429 if (mrna->exons.Count()>1) ichains++; 430 ((CTData*)(mrna->uptr))->locus=this; 431 mrna_maxscore=mrna; 432 mrna_maxcov=mrna; 433 } 434 } ~GLocus()435 ~GLocus() { 436 delete superlst; 437 } creset()438 void creset() { 439 spl_major=0;spl_rare=0;spl_wrong=0; 440 v=0; //visited/other data 441 ichainTP=0; 442 //ichainATP=0; 443 mrnaTP=0; 444 //mrnaATP=0; 445 cmpovl.Clear(); 446 } 447 addMerge(GLocus & locus,GffObj * lnkmrna)448 void addMerge(GLocus& locus, GffObj* lnkmrna) { 449 //add all the elements of the other locus (merging) 450 //-- merge mexons 451 GArray<int> ovlexons(true,true); //list of locus.mexons indexes overlapping existing mexons 452 int i=0; //index of first mexons with a merge 453 int j=0; //index current mrna exon 454 while (i<mexons.Count() && j<locus.mexons.Count()) { 455 uint istart=mexons[i].start; 456 uint iend=mexons[i].end; 457 uint jstart=locus.mexons[j].start; 458 uint jend=locus.mexons[j].end; 459 if (iend<jstart) { i++; continue; } 460 if (jend<istart) { j++; continue; } 461 //if (mexons[i].overlap(jstart, jend)) { 462 //exon overlap was found : 463 ovlexons.Add(j); 464 //extend mexons[i] as needed 465 if (jstart<istart) mexons[i].start=jstart; 466 if (jend>iend) { //mexons[i] end extend 467 mexons[i].end=jend; 468 //now this could overlap the next mexon(s), so we have to merge them all 469 while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) { 470 uint nextend=mexons[i+1].end; 471 mexons.Delete(i+1); 472 if (nextend>mexons[i].end) { 473 mexons[i].end=nextend; 474 break; //no need to check next mexons 475 } 476 } //while next mexons merge 477 } // mexons[i] end extend 478 // } //exon overlap 479 j++; //check the next locus.mexon 480 } 481 //-- add the rest of the non-overlapping mexons: 482 GSeg seg; 483 for (int i=0;i<locus.mexons.Count();i++) { 484 seg.start=locus.mexons[i].start; 485 seg.end=locus.mexons[i].end; 486 if (!ovlexons.Exists(i)) mexons.Add(seg); 487 } 488 // -- merge uexons 489 //add to uexons: 490 for (int i=0;i<locus.uexons.Count();i++) { 491 uexons.Add(locus.uexons[i]); 492 } 493 for (int i=0;i<locus.introns.Count();i++) { 494 introns.IAdd(&(locus.introns[i])); 495 } 496 497 // -- add locus.mrnas 498 for (int i=0;i<locus.mrnas.Count();i++) { 499 ((CTData*)(locus.mrnas[i]->uptr))->locus=this; 500 if (locus.mrnas[i]!=lnkmrna) { 501 mrnas.Add(locus.mrnas[i]); 502 if (locus.mrnas[i]->exons.Count()>1) ichains++; 503 } 504 } 505 // -- adjust start/end as needed 506 if (start>locus.start) start=locus.start; 507 if (end<locus.end) end=locus.end; 508 if (mrna_maxcov->covlen<locus.mrna_maxcov->covlen) 509 mrna_maxcov=locus.mrna_maxcov; 510 if (mrna_maxscore->gscore<locus.mrna_maxscore->gscore) 511 mrna_maxscore=locus.mrna_maxscore; 512 } 513 514 exonOverlap(GLocus & loc)515 bool exonOverlap(GLocus& loc) { 516 //check if any mexons overlap! 517 int i=0; 518 int j=0; 519 while (i<mexons.Count() && j<loc.mexons.Count()) { 520 uint istart=mexons[i].start; 521 uint iend=mexons[i].end; 522 uint jstart=loc.mexons[j].start; 523 uint jend=loc.mexons[j].end; 524 if (iend<jstart) { i++; continue; } 525 if (jend<istart) { j++; continue; } 526 //exon overlap found if we're here: 527 return true; 528 } 529 return false; 530 } 531 add_mRNA(GffObj * mrna)532 bool add_mRNA(GffObj* mrna) { 533 if (mrnas.Count()>0 && mrna->gseq_id!=gseq_id) return false; //mrna must be on the same genomic seq 534 //check for exon overlap with existing mexons 535 //also update uexons and mexons accordingly, if mrna is added 536 uint mrna_start=mrna->exons.First()->start; 537 uint mrna_end=mrna->exons.Last()->end; 538 if (mrna_start>end || start>mrna_end) return false; 539 bool hasovl=false; 540 int i=0; //index of first mexons with a merge 541 int j=0; //index current mrna exon 542 GArray<int> ovlexons(true,true); //list of mrna exon indexes overlapping mexons 543 while (i<mexons.Count() && j<mrna->exons.Count()) { 544 uint istart=mexons[i].start; 545 uint iend=mexons[i].end; 546 uint jstart=mrna->exons[j]->start; 547 uint jend=mrna->exons[j]->end; 548 if (iend<jstart) { i++; continue; } 549 if (jend<istart) { j++; continue; } 550 //exon overlap found if we're here: 551 ovlexons.Add(j); 552 hasovl=true; 553 //extend mexons[i] as needed 554 if (jstart<istart) mexons[i].start=jstart; 555 if (jend>iend) { //mexon stretch up 556 mexons[i].end=jend; 557 //now this could overlap the next mexon(s), so we have to merge them all 558 while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) { 559 uint nextend=mexons[i+1].end; 560 mexons.Delete(i+1); 561 if (nextend>mexons[i].end) { 562 mexons[i].end=nextend; 563 break; //no need to check next mexons 564 } 565 } //while next mexons merge 566 } //possible mexons merge 567 568 j++; //check the next mrna exon 569 }//all vs all exon check loop 570 if (hasovl) { 571 GSeg seg; 572 //add the rest of the non-overlapping exons, 573 // and also to uexons etc. 574 for (int i=0;i<mrna->exons.Count();i++) { 575 seg.start=mrna->exons[i]->start; 576 seg.end=mrna->exons[i]->end; 577 if (!ovlexons.Exists(i)) mexons.Add(seg); 578 int xterm=0; 579 if (i==0) xterm|=1; 580 if (i==mrna->exons.Count()-1) xterm|=2; 581 GXSeg xseg(seg.start, seg.end, xterm); 582 uexons.Add(xseg); 583 GISeg iseg; 584 if (i>0) { 585 iseg.start=mrna->exons[i-1]->end+1; 586 iseg.end=mrna->exons[i]->start-1; 587 iseg.t=mrna; 588 introns.Add(iseg); 589 } 590 } 591 592 mrnas_add(mrna); 593 // add to mrnas 594 ((CTData*)mrna->uptr)->locus=this; 595 gseq_id=mrna->gseq_id; 596 if (mrna->exons.Count()>1) ichains++; 597 } 598 return hasovl; 599 } 600 601 //simpler,basic adding of a mrna mrnas_add(GffObj * mrna)602 void mrnas_add(GffObj* mrna) { 603 mrnas.Add(mrna); 604 // adjust start/end 605 if (start>mrna->start) start=mrna->start; 606 if (end<mrna->end) end=mrna->end; 607 if (mrna_maxcov->covlen<mrna->covlen) mrna_maxcov=mrna; 608 if (mrna_maxscore->gscore<mrna->gscore) mrna_maxscore=mrna; 609 } 610 }; 611 612 class GSuperLocus; 613 class GTrackLocus; 614 615 class GSuperLocus : public GSeg { 616 public: 617 int qfidx; //index of query dataset/file for which this superlocus was created 618 GList<GLocus> qloci; 619 GList<GLocus> rloci; 620 GList<GffObj> qmrnas; //list of transcripts (isoforms) for this locus 621 GArray<GSeg> qmexons; //list of merged exons in this region 622 GArray<GXSeg> quexons; //list of unique exons (covered segments) in this region 623 GIArray qintrons; //list of unique introns in this region 624 //same lists for reference: 625 GList<GffObj> rmrnas; //list of ref transcripts (isoforms) for this locus 626 GArray<GSeg> rmexons; //list of ref merged exons in this region 627 GArray<GXSeg> ruexons; //list of ref unique exons (covered segments) in this region 628 GArray<GISeg> rintrons; //list of unique introns in this region 629 // store problematic introns for printing: 630 GIArray i_missed; //missed reference introns (not overlapped by any qry intron) 631 GIArray i_notp; //wrong ref introns (one or both ends not matching any qry intron) 632 // 633 GIArray i_qwrong; //totally wrong qry introns (not overlapped by any ref intron) 634 GIArray i_qnotp; //imperfect qry introns (may overlap but has no "perfect" match) 635 636 637 int qbases_all; 638 int rbases_all; //in fact, it's all ref bases overlapping any query loci 639 int in_rmrnas; //count of ALL ref mrnas and loci given for this region 640 int in_rloci; //not just those overlapping qry data 641 // this will keep track of total qry loci, mrnas and exons in an area 642 int total_superloci; 643 int total_qloci; 644 int total_qloci_alt; //total qloci with multiple transcripts 645 646 int total_qmrnas; 647 int total_qexons; //unique exons 648 int total_qmexons; 649 int total_qintrons; //unique introns 650 int total_qichains; //total multi-exon transfrags predicted (incl. duplicates if -G) 651 652 // NOTE: these ref totals are only limited to data from loci overlapping any qry loci 653 int total_rmexons; 654 int total_rloci; 655 int total_rmrnas; 656 int total_richains; //total multi-exon reference transcripts 657 int total_rexons; 658 int total_rintrons; //unique introns 659 660 //--- accuracy data after compared to ref loci: 661 int locusQTP; 662 int locusTP; // +1 if ichainTP+mrnaTP > 0 663 //int locusAQTP; 664 //int locusATP; // 1 if ichainATP + mrnaATP > 0 665 int locusFP; 666 //int locusAFP; 667 //int locusAFN; 668 int locusFN; 669 //---transcript level accuracy -- all exon coordinates should match (most stringent) 670 int mrnaTP; // number of qry mRNAs with perfect match with ref transcripts 671 //int mrnaATP; 672 //---intron level accuracy (comparing the ordered set of splice sites): 673 int ichainTP; // number of fully matched ref intron chains (# correctly predicted ichains) 674 675 //int ichainFP; // number of qry intron chains not matching a reference intron chain 676 //int ichainFN; // number of ref intron chains in this region not being covered by a reference intron chain 677 /* 678 // same as above, but Approximate -- allowing a 5bp distance around splice site coordinates 679 int ichainATP; //as opposed to ichainTP, this also includes ref intron chains which are 680 //sub-chains of qry intron chains (rare cases) 681 */ 682 //---projected features --- 683 //---exon level accuracy: 684 int exonTP; //number of matched reference exons (true positives) 685 int exonQTP; //number of query exons matching reference exons 686 //int exonFP; //number of exons of query with no perfect match with a reference exon 687 //int exonFN; //number of exons of reference with no perfect match with a query exon 688 // same as the above but with acceptable approximation (10bp error window): 689 /*int exonATP; 690 int exonAFP; 691 int exonAFN;*/ 692 693 int intronTP; //number of perfectly overlapping introns (true positives) 694 int intronFP; //number of introns of query with no perfect match with a reference intron 695 int intronFN; //number of introns of reference with no perfect match with a query intron 696 /* 697 // same as the above but with acceptable approximation (10bp error window): 698 int intronATP; 699 int intronAFP; 700 int intronAFN; 701 */ 702 //-- EGASP added these too: 703 int m_exons; //number of exons totally missed (not overlapped *at all* by any query exon) 704 int w_exons; //numer of totally wrong exons (query exons not overlapping *at all* any reference exon) 705 int m_introns; //number of introns totally missed (not overlapped *at all* by any query intron) 706 int w_introns; //numer of totally wrong introns (query introns not overlapping *at all* any reference intron) 707 int m_loci; //missed loci 708 int w_loci; //novel/wrong loci 709 //---base level accuracy 710 int baseTP; //number of overlapping bases 711 int baseFP; //number of qry bases not overlapping reference 712 int baseFN; //number of ref bases not overlapping qry 713 // sorted,free,unique sorted,unique qloci(true,false,false)714 GSuperLocus(uint lstart=0,uint lend=0):qloci(true,false,false),rloci(true,false,false), 715 qmrnas(true,false,false), qmexons(true,false), quexons(true,false), qintrons(false), 716 rmrnas(true,false,false), rmexons(true,false), ruexons(true,false), rintrons(false), 717 i_missed(false),i_notp(false), i_qwrong(false), i_qnotp(false){ 718 qfidx=-1; 719 start=lstart; 720 end=lend; 721 qbases_all=0; 722 rbases_all=0; 723 baseTP=0;baseFP=0;baseFN=0; 724 locusTP=0;locusQTP=0; //locusAQTP=0; locusATP=0; 725 locusFP=0;// locusAFP=0;locusAFN=0; 726 locusFN=0; 727 in_rmrnas=0; 728 in_rloci=0; 729 w_loci=0; 730 m_loci=0; 731 total_superloci=0; 732 mrnaTP=0;//mrnaFP=0;mrnaFN=0; 733 ichainTP=0;//ichainFP=0;ichainFN=0; 734 exonTP=0;exonQTP=0; 735 //exonFP=0;exonFN=0; 736 intronTP=0;intronFP=0;intronFN=0; 737 /* mrnaATP=0;//mrnaAFP=0;mrnaAFN=0; 738 ichainATP=0;//ichainAFP=0;ichainAFN=0; 739 exonATP=0;exonAFP=0;exonAFN=0; 740 intronATP=0;intronAFP=0;intronAFN=0; */ 741 total_rmexons=0; 742 total_qmexons=0; 743 total_qexons=0;total_qloci=0;total_qmrnas=0; 744 total_qloci_alt=0; 745 total_qintrons=0;total_qichains=0; 746 total_rexons=0;total_rloci=0;total_rmrnas=0; 747 total_rintrons=0;total_richains=0; 748 w_exons=0; 749 m_exons=0; 750 w_introns=0; 751 m_introns=0; 752 } addQlocus(GLocus & loc)753 void addQlocus(GLocus& loc) { 754 if (start==0 || start>loc.start) start=loc.start; 755 if (end<loc.end) end=loc.end; 756 qloci.Add(&loc); 757 total_qloci++; 758 if (loc.ichains>0 && loc.mrnas.Count()>1) 759 total_qloci_alt++; 760 qmrnas.Add(loc.mrnas); 761 total_qmrnas+=loc.mrnas.Count(); 762 total_qichains+=loc.ichains; 763 qmexons.Add(loc.mexons); 764 total_qmexons+=loc.mexons.Count(); 765 quexons.Add(loc.uexons); 766 total_qexons+=loc.uexons.Count(); 767 qintrons.Add(loc.introns); 768 total_qintrons+=loc.introns.Count(); 769 } addRlocus(GLocus & loc)770 void addRlocus(GLocus& loc) { 771 if (start==0 || start>loc.start) start=loc.start; 772 if (end<loc.end) end=loc.end; 773 rloci.Add(&loc); 774 total_rloci++; 775 rmrnas.Add(loc.mrnas); 776 total_rmrnas+=loc.mrnas.Count(); 777 total_richains+=loc.ichains; 778 rmexons.Add(loc.mexons); 779 total_rmexons+=loc.mexons.Count(); 780 ruexons.Add(loc.uexons); 781 total_rexons+=loc.uexons.Count(); 782 rintrons.Add(loc.introns); 783 total_rintrons+=loc.introns.Count(); 784 } 785 calcF()786 void calcF() { 787 // base level 788 baseFP=qbases_all-baseTP; 789 baseFN=rbases_all-baseTP; 790 //exon level: 791 //exonFP=total_qexons-exonTP; 792 //exonFN=total_rexons-exonTP; 793 //intron stats 794 intronFP=total_qintrons-intronTP; 795 intronFN=total_rintrons-intronTP; 796 /* intronAFN=total_rintrons-intronATP; 797 intronAFP=total_qintrons-intronATP; 798 exonAFP=total_qexons-exonATP; 799 exonAFN=total_rexons-exonATP; */ 800 801 // ichain and transcript levels: 802 //ichainAFP=total_qichains-ichainATP; 803 //ichainFP=total_qichains-ichainTP; 804 //ichainAFN=total_richains-ichainATP; 805 //ichainFN=total_richains-ichainTP; 806 //mrnaFP=total_qmrnas-mrnaTP; 807 //mrnaFN=total_rmrnas-mrnaTP; 808 //mrnaAFP=total_qmrnas-mrnaATP; 809 //mrnaAFN=total_rmrnas-mrnaATP; 810 // locus/gene level: 811 locusFP=total_qloci-locusQTP; 812 /*locusAFN=total_rloci-locusATP; 813 locusAFP=total_qloci-locusAQTP;*/ 814 locusFN=total_rloci-locusTP; 815 } 816 addStats(GSuperLocus & s)817 void addStats(GSuperLocus& s) { 818 in_rmrnas+=s.in_rmrnas; 819 in_rloci+=s.in_rloci; 820 baseTP+=s.baseTP; 821 exonTP+=s.exonTP; 822 exonQTP+=s.exonQTP; 823 intronTP+=s.intronTP; 824 ichainTP+=s.ichainTP; 825 mrnaTP+=s.mrnaTP; 826 locusTP+=s.locusTP; 827 locusQTP+=s.locusQTP; 828 /* 829 mrnaATP+=s.mrnaATP; 830 exonATP+=s.exonATP; 831 intronATP+=s.intronATP; 832 ichainATP+=s.ichainATP; 833 locusATP+=s.locusATP; 834 locusAQTP+=s.locusAQTP; 835 */ 836 m_exons+=s.m_exons; 837 w_exons+=s.w_exons; 838 m_introns+=s.m_introns; 839 w_introns+=s.w_introns; 840 if (s.total_superloci==0 && s.qloci.Count()>0) s.total_superloci=1; 841 total_superloci+=s.total_superloci; 842 qbases_all+=s.qbases_all; 843 rbases_all+=s.rbases_all; 844 m_loci+=s.m_loci; 845 w_loci+=s.w_loci; 846 total_qexons+=s.total_qexons; 847 total_qintrons+=s.total_qintrons; 848 total_qmexons+=s.total_qmexons; 849 total_rexons+=s.total_rexons; 850 total_rintrons+=s.total_rintrons; 851 total_rmexons+=s.total_rmexons; 852 total_qmrnas+=s.total_qmrnas; 853 total_qichains+=s.total_qichains; 854 total_rmrnas+=s.total_rmrnas; 855 total_richains+=s.total_richains; 856 total_qloci+=s.total_qloci; 857 total_qloci_alt+=s.total_qloci_alt; 858 total_rloci+=s.total_rloci; 859 } 860 }; 861 862 class GSeqData { 863 int gseq_id; 864 public: 865 const char* gseq_name; 866 GList<GffObj> refs_f; //forward strand mRNAs 867 GList<GffObj> refs_r; //reverse strand mRNAs 868 GList<GffObj> mrnas_f; //forward strand mRNAs 869 GList<GffObj> mrnas_r; //reverse strand mRNAs 870 GList<GLocus> loci_f; //forward strand loci 871 GList<GLocus> loci_r; //reverse strand loci 872 //--> the fields below are not used by reference data -- 873 GList<GSuperLocus> gstats_f; //stats for forward strand superloci 874 GList<GSuperLocus> gstats_r; //stats for reverse strand superloci 875 GList<GLocus> nloci_f; //"novel" loci on forward strand 876 GList<GLocus> nloci_r; //"novel" loci on reverse strand 877 GList<GffObj> umrnas; //unknown orientation mrnas 878 GList<GLocus> nloci_u; //"novel" loci with no orientation found 879 880 GList<CTData> tdata; //transcript data (uptr holder for all mrnas here) 881 get_gseqid()882 int get_gseqid() { return gseq_id; } 883 884 //--< mrnas_f(true,true,false)885 GSeqData(int gid=-1):mrnas_f(true,true,false),mrnas_r(true,true,false), 886 loci_f(true,true,true),loci_r(true,true,true), 887 gstats_f(true,true,false),gstats_r(true,true,false), 888 nloci_f(true,false,true), nloci_r(true,false,true), 889 umrnas(true,true,false), nloci_u(true,true,true), tdata(false,true,false) { 890 gseq_id=gid; 891 if (gseq_id>=0) 892 gseq_name=GffObj::names->gseqs.getName(gseq_id); 893 } 894 bool operator==(GSeqData& d){ 895 return (gseq_id==d.gseq_id); 896 } 897 bool operator>(GSeqData& d){ 898 return (gseq_id>d.gseq_id); 899 } 900 bool operator<(GSeqData& d){ 901 return (gseq_id<d.gseq_id); 902 } 903 }; 904 905 906 // a group of qry loci and a transcript cluster for a single qry dataset 907 class GQCluster : public GList<GffObj> { 908 public: 909 GffObj* mrna_maxcov; //transcript with maximum coverage (for largest transcript) 910 GffObj* mrna_maxscore; //transcript with maximum gscore ( = major isoform for Cufflinks) 911 uint start; 912 uint end; 913 GList<GLocus> qloci; 914 //GCluster cl; //just a more compact way of keeping all transcripts in these loci 915 GQCluster(GList<GLocus>* loci=NULL):GList<GffObj>(true,false,false), 916 qloci(true,false,false) { 917 mrna_maxcov=NULL; 918 mrna_maxscore=NULL; 919 start=0; 920 end=0; 921 if (loci!=NULL) { 922 qloci.Add(*loci); 923 for (int i=0;i<loci->Count();i++) { 924 addLocus(loci->Get(i),false); 925 } 926 } 927 } 928 void addLocus(GLocus* loc, bool toLoci=true) { 929 //check so we don't add locus duplicates 930 if (toLoci) { 931 for (int i=0;i<qloci.Count();i++) { 932 if (loc==qloci[i]) return; 933 } 934 qloci.Add(loc); 935 } 936 for (int m=0;m<loc->mrnas.Count();m++) { 937 GffObj* mrna=loc->mrnas[m]; 938 Add(mrna); 939 if (start==0 || start>mrna->start) start=mrna->start; 940 if (end<mrna->end) end=mrna->end; 941 if (mrna_maxcov==NULL || mrna_maxcov->covlen<mrna->covlen) mrna_maxcov=mrna; 942 if (mrna_maxscore==NULL || mrna_maxscore->gscore<mrna->gscore) mrna_maxscore=mrna; 943 } 944 } 945 }; 946 947 //track a set of clustered qloci across multiple qry datasets 948 // the qloci in qcls[] overlap but not necessarily at exon level 949 // (so there can be multiple genes here in fact) 950 class GTrackLocus:public GSeg { 951 public: 952 char strand; 953 bool hasQloci; 954 //GLocus* rloc; //corresponding reference locus, if available 955 GList<GLocus> rloci; //ref loci found overlapping this region 956 GQCluster* qcls[MAX_QFILES]; //all qloci for this superlocus, grouped by dataset 957 GTrackLocus(GLocus* qloc=NULL, int q=-1):GSeg(0,0),rloci(true,false,true) { 958 strand='.'; 959 for (int i=0;i<MAX_QFILES;i++) qcls[i]=NULL; 960 if (qloc!=NULL) addQLocus(qloc,q); 961 } 962 addRLocus(GLocus * rl)963 void addRLocus(GLocus* rl) { 964 if (rl==NULL) return; 965 if (rl->qfidx>=0) 966 GError("Error: GTrackLocus::addRLocus called with a query locus (set# %d)\n", 967 rl->qfidx+1); 968 if (strand=='.') strand=rl->mrna_maxcov->strand; 969 if (start==0 || start>rl->start) start=rl->start; 970 if (end==0 || end<rl->end) end=rl->end; 971 rl->t_ptr=this; 972 rloci.Add(rl); 973 } 974 975 void addQLocus(GLocus* loc, int q=-1) { //adding qry locus 976 if (loc==NULL) return; 977 if (strand=='.' && loc->mrna_maxcov->strand!='.') 978 strand=loc->mrna_maxcov->strand; 979 if (loc->qfidx<0 && q<0) 980 GError("Error at GTrackLocus::addQLocus(): locus.qfidx not set and index not given!\n"); 981 if (q>=0) loc->qfidx=q; 982 else q=loc->qfidx; 983 if (start==0 || start>loc->start) start=loc->start; 984 if (end==0 || end<loc->end) end=loc->end; 985 if (qcls[q]==NULL) qcls[q]=new GQCluster(); 986 hasQloci=true; 987 loc->t_ptr = this; 988 qcls[q]->addLocus(loc); 989 } 990 add_Locus(GLocus * loc)991 bool add_Locus(GLocus* loc) { 992 if (start==0 || overlap(*loc)) { //simple range overlap, not exon overlap 993 if (loc->qfidx<0) addRLocus(loc); 994 else addQLocus(loc); 995 return true; 996 } 997 return false; 998 } 999 1000 addQCl(int q,GQCluster * qcl,GLocus * lnkloc)1001 void addQCl(int q, GQCluster* qcl, GLocus* lnkloc) { 1002 for (int i=0;i<qcl->qloci.Count();i++) { 1003 GLocus* loc=qcl->qloci[i]; 1004 if (loc==lnkloc) continue; // or if loc->t_ptr==this ? 1005 hasQloci=true; 1006 loc->t_ptr=this; 1007 qcls[q]->addLocus(loc); 1008 } 1009 } 1010 addMerge(GTrackLocus * loctrack,int qcount,GLocus * lnkloc)1011 void addMerge(GTrackLocus* loctrack, int qcount, GLocus* lnkloc) { 1012 if (loctrack==NULL) return; 1013 //merge qloci 1014 for (int q=0; q < qcount; q++) { 1015 if (qcls[q]==NULL) { 1016 if (loctrack->qcls[q]!=NULL) { 1017 qcls[q]=loctrack->qcls[q]; 1018 loctrack->qcls[q]=NULL; //just move pointer here 1019 //set all t_ptr pointers for moved loci 1020 for (int ql = 0; ql < qcls[q]->qloci.Count(); ql++) { 1021 qcls[q]->qloci[ql]->t_ptr=this; 1022 } 1023 hasQloci=true; 1024 } 1025 } 1026 else //existing qloci at q 1027 if (loctrack->qcls[q]!=NULL) { //merge elements 1028 addQCl(q, loctrack->qcls[q], lnkloc); 1029 } 1030 }//for each qset 1031 //merge rloci, if any 1032 if (loctrack->rloci.Count()>0) { 1033 for (int l=0;l<loctrack->rloci.Count();l++) { 1034 if (loctrack->rloci[l]!=lnkloc && loctrack->rloci[l]->t_ptr!=this) { 1035 rloci.Add(loctrack->rloci[l]); 1036 loctrack->rloci[l]->t_ptr=this; 1037 } 1038 } 1039 } 1040 if (loctrack->start<start) start=loctrack->start; 1041 if (loctrack->end>end) end=loctrack->end; 1042 if (strand=='.' && loctrack->strand!='.') strand=loctrack->strand; 1043 } 1044 1045 /* 1046 void add_QLoci(GList<GLocus>* loci, int q, GLocus& r) { 1047 // only add loci overlapping given refloc 1048 //rloc=&r; 1049 if (loci==NULL) return; 1050 for (int i=0;i<loci->Count();i++) { 1051 GLocus* loc=loci->Get(i); 1052 // if (!loc->exonOverlap(r)) continue; //do we really needed exon overlap? 1053 if (!loc->overlap(r)) continue; 1054 if (start==0 || start>loc->start) start=loc->start; 1055 if (end==0 || end<loc->end) end=loc->end; 1056 loc->t_ptr=this; 1057 loc->qfidx=q; 1058 if (qcls[q]==NULL) qcls[q]=new GQCluster(); 1059 qcls[q]->addLocus(loc); 1060 } 1061 strand=r.mrnas[0]->strand; 1062 } 1063 */ ~GTrackLocus()1064 ~GTrackLocus() { 1065 for (int q=0;q<MAX_QFILES;q++) 1066 if (qcls[q]!=NULL) { delete qcls[q]; qcls[q]=NULL; } 1067 } 1068 1069 GQCluster* operator[](int q) { 1070 if (q<0 || q>=MAX_QFILES) 1071 GError("Error: qfidx index out of bounds (%d) for GTrackLocus!\n",q); 1072 return qcls[q]; 1073 } 1074 }; 1075 1076 class GXConsensus:public GSeg { 1077 public: 1078 static int count; 1079 int id; //XConsensus ID 1080 int tss_id; //group id for those xconsensi with shared first exon 1081 int p_id; //group id for those xconsensi with "similar" protein 1082 GffObj* tcons; //longest transcript to represent the combined "consensus" structure 1083 GffObj* ref; //overlapping reference transcript 1084 char refcode; // the code for ref relationship (like in the tracking file) 1085 char* aa; 1086 int aalen; 1087 GXConsensus* contained; //if contained into another GXConsensus 1088 //list of ichain-matching query (cufflinks) transcripts that contributed to this consensus 1089 GList<GffObj> qchain; 1090 GXConsensus(GffObj* c, CEqList* qlst, GffObj* r=NULL, char rcode=0) qchain(false,false,false)1091 :qchain(false,false,false) { 1092 ref=r; 1093 refcode=rcode; 1094 tcons=c; 1095 if (qlst!=NULL) qchain.Add(*((GList<GffObj>*)qlst)); 1096 else qchain.Add(c); 1097 count++; 1098 tss_id=0; 1099 p_id=0; 1100 aalen=0; 1101 id=count; 1102 aa=NULL; 1103 start=tcons->start; 1104 end=tcons->end; 1105 contained=NULL; 1106 } ~GXConsensus()1107 ~GXConsensus() { 1108 if (aa!=NULL) GFREE(aa); 1109 } 1110 }; 1111 1112 class GXLocus:public GSeg { 1113 public: 1114 int id; 1115 int num_mtcons; //number of multi-exon "consensus" transcripts in this locus 1116 char strand; 1117 GList<GLocus> rloci; //list of ref loci overlapping any of the mexons 1118 GList<GLocus> qloci; //loci from all qry datasets that have overlapping exons with this region 1119 GArray<GSeg> mexons; //list of merged exonic regions for this locus 1120 GList<GXConsensus> tcons; 1121 GXLocus(GLocus* lfirst=NULL):GSeg(0,0), 1122 rloci((GCompareProc*)cmpByPtr, (GFreeProc*)NULL, true), 1123 qloci((GCompareProc*)cmpByPtr, (GFreeProc*)NULL, true), 1124 mexons(true,true), tcons(true,true,false) { 1125 strand='.'; 1126 num_mtcons=0; 1127 if (lfirst!=NULL) { 1128 add_Locus(lfirst); 1129 } 1130 id=0; 1131 } 1132 add_Locus(GLocus * loc)1133 bool add_Locus(GLocus* loc) { 1134 if (mexons.Count()>0 && (end<loc->start || start > loc->end)) 1135 return false; //no chance for overlapping exons 1136 if (mexons.Count()==0) { 1137 mexons.Add(loc->mexons); 1138 start=loc->start; 1139 end=loc->end; 1140 if (loc->qfidx<0) rloci.Add(loc); 1141 else qloci.Add(loc); 1142 strand=loc->mrna_maxcov->strand; 1143 loc->xlocus=this; 1144 return true; 1145 } 1146 int f=0; 1147 if (loc->qfidx<0) { 1148 if (rloci.Found(loc,f)) return false; 1149 } 1150 else if (qloci.Found(loc,f)) return false; 1151 1152 // -- merge mexons 1153 GArray<int> ovlexons(true,true); //list of locus.mexons indexes overlapping existing mexons 1154 int i=0; //index of first mexons with a merge 1155 int j=0; //index current mrna exon 1156 while (i<mexons.Count() && j<loc->mexons.Count()) { 1157 uint istart=mexons[i].start; 1158 uint iend=mexons[i].end; 1159 uint jstart=loc->mexons[j].start; 1160 uint jend=loc->mexons[j].end; 1161 if (iend<jstart) { i++; continue; } 1162 if (jend<istart) { j++; continue; } 1163 //if (mexons[i].overlap(jstart, jend)) { 1164 //exon overlap was found : 1165 ovlexons.Add(j); 1166 //extend mexons[i] as needed 1167 if (jstart<istart) mexons[i].start=jstart; 1168 if (jend>iend) { //mexons[i] end extend 1169 mexons[i].end=jend; 1170 //now this could overlap the next mexon(s), so we have to merge them all 1171 while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) { 1172 uint nextend=mexons[i+1].end; 1173 mexons.Delete(i+1); 1174 if (nextend>mexons[i].end) { 1175 mexons[i].end=nextend; 1176 break; //no need to check next mexons 1177 } 1178 } //while next mexons merge 1179 } // mexons[i] end extend 1180 // } //exon overlap 1181 j++; //check the next locus.mexon 1182 }//while mexons 1183 if (ovlexons.Count()==0) return false; 1184 if (strand=='.' && loc->mrna_maxcov->strand!='.') 1185 strand=loc->mrna_maxcov->strand; 1186 //have exon overlap: 1187 //-- add the rest of the non-overlapping mexons: 1188 GSeg seg; 1189 for (int i=0;i<loc->mexons.Count();i++) { 1190 seg.start=loc->mexons[i].start; 1191 seg.end=loc->mexons[i].end; 1192 if (!ovlexons.Exists(i)) mexons.Add(seg); 1193 } 1194 // -- adjust start/end as needed 1195 if (start>loc->start) start=loc->start; 1196 if (end<loc->end) end=loc->end; 1197 loc->xlocus=this; 1198 if (loc->qfidx<0) rloci.Add(loc); 1199 else qloci.Add(loc); 1200 return true; 1201 } 1202 addMerge(GXLocus & oxloc)1203 void addMerge(GXLocus& oxloc) { 1204 GArray<int> ovlexons(true,true); //list of oxloc.mexons indexes overlapping existing mexons 1205 int i=0; //index of first mexons with a merge 1206 int j=0; //index current mrna exon 1207 while (i<mexons.Count() && j<oxloc.mexons.Count()) { 1208 uint istart=mexons[i].start; 1209 uint iend=mexons[i].end; 1210 uint jstart=oxloc.mexons[j].start; 1211 uint jend=oxloc.mexons[j].end; 1212 if (iend<jstart) { i++; continue; } 1213 if (jend<istart) { j++; continue; } 1214 //if (mexons[i].overlap(jstart, jend)) { 1215 //exon overlap was found : 1216 ovlexons.Add(j); 1217 //extend mexons[i] as needed 1218 if (jstart<istart) mexons[i].start=jstart; 1219 if (jend>iend) { //mexons[i] end extend 1220 mexons[i].end=jend; 1221 //now this could overlap the next mexon(s), so we have to merge them all 1222 while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) { 1223 uint nextend=mexons[i+1].end; 1224 mexons.Delete(i+1); 1225 if (nextend>mexons[i].end) { 1226 mexons[i].end=nextend; 1227 break; //no need to check next mexons 1228 } 1229 } //while next mexons merge 1230 } // mexons[i] end extend 1231 // } //exon overlap 1232 j++; //check the next oxloc.mexon 1233 } 1234 if (ovlexons.Count()==0) { 1235 GError("Error: attempt to merge GXLoci with non-overlapping exons!\n"); 1236 } 1237 //-- add the rest of the non-overlapping mexons: 1238 GSeg seg; 1239 for (int i=0;i<oxloc.mexons.Count();i++) { 1240 seg.start=oxloc.mexons[i].start; 1241 seg.end=oxloc.mexons[i].end; 1242 if (!ovlexons.Exists(i)) mexons.Add(seg); 1243 } 1244 if (start>oxloc.start) start=oxloc.start; 1245 if (end<oxloc.end) end=oxloc.end; 1246 if (strand=='.') strand=oxloc.strand; 1247 //-- steal all qloci and rloci 1248 for (int i=0;i<oxloc.qloci.Count();i++) { 1249 if (oxloc.qloci[i]->xlocus==this) continue; 1250 qloci.Add(oxloc.qloci[i]); 1251 oxloc.qloci[i]->xlocus=this; 1252 } 1253 for (int i=0;i<oxloc.rloci.Count();i++) { 1254 if (oxloc.rloci[i]->xlocus==this) continue; 1255 rloci.Add(oxloc.rloci[i]); 1256 oxloc.rloci[i]->xlocus=this; 1257 } 1258 } //::addMerge() 1259 1260 checkContainment()1261 void checkContainment() { 1262 //checking containment 1263 for (int j=0;j<tcons.Count()-1;j++) { 1264 GXConsensus* t=tcons[j]; 1265 for (int i=j+1;i<tcons.Count();i++) { 1266 if (tcons[i]->contained!=NULL && t->tcons->exons.Count()>1) continue; //will check the container later anyway 1267 int c_status=checkXConsContain(t->tcons, tcons[i]->tcons); 1268 if (c_status==0) continue; //no containment relationship between t and tcons[i] 1269 if (c_status>0) { //t is a container for tcons[i] 1270 tcons[i]->contained=t; 1271 } 1272 else { //contained into exising XCons 1273 t->contained=tcons[i]; 1274 break; 1275 } 1276 } 1277 } 1278 } 1279 checkXConsContain(GffObj * a,GffObj * b)1280 int checkXConsContain(GffObj* a, GffObj* b) { 1281 // returns 1 if a is the container of b 1282 // -1 if a is contained in b 1283 // 0 if no 1284 if (a->end<b->start || b->end<a->start) return 0; 1285 if (a->exons.Count()==b->exons.Count()) { 1286 if (a->exons.Count()>1) return 0; //same number of exons - no containment possible 1287 //because equivalence was already tested 1288 else { //single exon containment testing 1289 //this is fuzzy and messy (end result may vary depending on the testing order) 1290 int ovlen=a->exons[0]->overlapLen(b->exons[0]); 1291 int minlen=GMIN(a->covlen, b->covlen); 1292 if (ovlen>=minlen*0.8) { //if at least 80% of the shorter one is covered, it is contained 1293 return ((a->covlen>b->covlen) ? 1 : -1); 1294 } 1295 else return 0; 1296 //if (a->start<=b->start+10 && a->end+10>=b->end) return 1; 1297 // else { if (b->start<=a->start+10 && b->end+10>=a->end) return -1; 1298 // else return 0; 1299 //} 1300 } 1301 } 1302 //different number of exons: 1303 if (a->exons.Count()>b->exons.Count()) return t_contains(*a, *b) ? 1:0; 1304 else return t_contains(*b, *a) ? -1 : 0; 1305 } 1306 addXCons(GXConsensus * t)1307 void addXCons(GXConsensus* t) { 1308 tcons.Add(t); 1309 } 1310 1311 }; //GXLocus 1312 1313 1314 1315 int parse_mRNAs(GfList& mrnas, 1316 GList<GSeqData>& glstdata, 1317 bool is_ref_set=true, 1318 int check_for_dups=0, 1319 int qfidx=-1, bool only_multiexon=false); 1320 1321 //reading a mRNAs from a gff file and grouping them into loci 1322 void read_mRNAs(FILE* f, GList<GSeqData>& seqdata, GList<GSeqData>* ref_data=NULL, 1323 int check_for_dups=0, int qfidx=-1, const char* fname=NULL, 1324 bool only_multiexon=false); 1325 1326 void read_transcripts(FILE* f, GList<GSeqData>& seqdata, 1327 #ifdef CUFFLINKS 1328 boost::crc_32_type& crc_result, 1329 #endif 1330 bool keepAttrs=true); 1331 1332 void sort_GSeqs_byName(GList<GSeqData>& seqdata); 1333 1334 bool singleExonTMatch(GffObj& m, GffObj& r, int& ovlen); 1335 1336 //strict intron chain match, or single-exon match 1337 bool tMatch(GffObj& a, GffObj& b, int& ovlen, bool fuzzunspl=false, 1338 bool contain_only=false); 1339 1340 //use qsearch to "position" a given coordinate x within a list of transcripts sorted 1341 //by their start (lowest) coordinate; the returned int is the list index of the 1342 //closest GffObj starting just *ABOVE* coordinate x 1343 //Convention: returns -1 if there is no such GffObj (i.e. last GffObj start <= x) 1344 int qsearch_mrnas(uint x, GList<GffObj>& mrnas); 1345 int qsearch_loci(uint x, GList<GLocus>& segs); // same as above, but for GSeg lists 1346 1347 GSeqData* getRefData(int gid, GList<GSeqData>& ref_data); //returns reference GSeqData for a specific genomic sequence 1348 1349 #endif 1350