1 #ifndef GFF_H 2 #define GFF_H 3 4 //#define CUFFLINKS 1 5 6 #include "GBase.h" 7 #include "gdna.h" 8 #include "codons.h" 9 #include "GFaSeqGet.h" 10 #include "GList.hh" 11 #include "GHashMap.hh" 12 #include "GBitVec.h" 13 14 #ifdef CUFFLINKS 15 #include <boost/crc.hpp> // for boost::crc_32_type 16 #endif 17 18 //reserved Gffnames::feats entries -- basic feature types 19 extern int gff_fid_mRNA; // "mRNA" feature name 20 extern int gff_fid_transcript; // *RNA, *transcript feature name 21 extern int gff_fid_exon; 22 extern int gff_fid_CDS; 23 24 extern const uint GFF_MAX_LOCUS; 25 extern const uint GFF_MAX_EXON; 26 extern const uint GFF_MAX_INTRON; 27 //extern const uint gfo_flag_LEVEL_MSK; //hierarchical level: 0 = no parent 28 //extern const byte gfo_flagShift_LEVEL; 29 30 //extern bool gff_show_warnings; 31 32 #define GFF_LINELEN 4096 33 #define ERR_NULL_GFNAMES "Error: GffObj::%s requires a non-null GffNames* names!\n" 34 35 36 enum GffExonType { 37 exgffIntron=-1, // useless "intron" feature 38 exgffNone=0, //not recognizable or unitialized exonic segment 39 exgffStartCodon, //from "start_codon" feature (within CDS) 40 exgffStopCodon, //from "stop_codon" feature (may be outside CDS, but should) 41 exgffCDS, //from "CDS" feature 42 exgffUTR, //from "UTR" feature 43 exgffCDSUTR, //from a merge of UTR and CDS feature 44 exgffExon, //from "exon" feature 45 }; 46 47 extern const char* exonTypes[]; 48 49 const char* strExonType(char xtype); 50 class GfList; 51 52 typedef void GFFCommentParser(const char* cmline, GfList* gflst); //comment parser callback 53 //Useful for parsing/maintaining ref seq info from comment lines like this: 54 //##sequence-region chr1 1 24895642 55 56 class GffReader; 57 class GffObj; 58 59 //---transcript overlapping - utility functions 60 61 extern const byte CLASSCODE_OVL_RANK; //rank value just above 'o' class code 62 63 byte classcode_rank(char c); //returns priority value for class codes 64 65 struct TOvlData { 66 char ovlcode; 67 int ovlen; 68 int16_t numJmatch; //number of matching junctions (not introns) 69 GBitVec jbits; //bit array with 1 bit for each junctions (total = 2 * num_introns) ovlcodeTOvlData70 TOvlData(char oc=0, int olen=0, int16_t nmj=0, GBitVec* jb=NULL):ovlcode(oc), 71 ovlen(olen),numJmatch(nmj),jbits(jb) { } 72 }; 73 74 TOvlData getOvlData(GffObj& m, GffObj& r, bool stricterMatch=false, int trange=0); 75 76 char transcriptMatch(GffObj& a, GffObj& b, int& ovlen, int trange=0); //generic transcript match test 77 // -- return '=', '~' or 0 78 char singleExonTMatch(GffObj& m, GffObj& r, int& ovlen, int trange=0); 79 //single-exon transcript match test - returning '=', '~' or 0 80 81 //--- 82 // -- tracking exon/CDS segments from local mRNA to genome coordinates 83 class GMapSeg:public GSeg { 84 public: 85 uint gstart; //genome start location 86 uint gend; //genome end location 87 //gend<gstart when mapped on reverse strand ! GSeg(s,e)88 GMapSeg(uint s=0, uint e=0, uint gs=0, uint ge=0):GSeg(s,e), 89 gstart(gs), gend(ge) {}; g_within(uint gc)90 int g_within(uint gc) { 91 //return 0 if not within gstart-gend intervals 92 //or offset from gstart otherwise (always positive) 93 if (gstart>gend) { //reverse strand mapping 94 if (gc<gend || gc>gstart) return 0; 95 return (gstart-gc); 96 } 97 else { 98 if (gc<gstart || gc>gend) return 0; 99 return (gc-gstart); 100 } 101 } 102 }; 103 104 struct GffScore { 105 float score; 106 int8_t precision; scoreGffScore107 GffScore(float sc=0, int8_t prec=-1):score(sc),precision(prec) { } printGffScore108 void print(FILE* outf) { 109 if (precision<0) fprintf(outf, "."); 110 else fprintf(outf, "%.*f", precision, score); 111 } sprintGffScore112 void sprint(char* outs) { 113 if (precision<0) sprintf(outs, "."); 114 else sprintf(outs, "%.*f", precision, score); 115 } 116 bool operator<(GffScore& v) { 117 return this->score<v.score; 118 } 119 bool operator<=(GffScore& v) { 120 return this->score<=v.score; 121 } 122 bool operator>(GffScore& v) { 123 return this->score>v.score; 124 } 125 bool operator>=(GffScore& v) { 126 return this->score>=v.score; 127 } 128 bool operator==(GffScore& v) { 129 return this->score==v.score; 130 } 131 }; 132 133 extern const GffScore GFFSCORE_NONE; 134 135 class GMapSegments:public GVec<GMapSeg> { 136 public: 137 int dir; //-1 or +1 (reverse/forward for genome coordinates) 138 GSeg lreg; // always 1,max local coord 139 GSeg greg; // genomic min,max coords 140 GMapSegments(char strand='+'):lreg(0,0),greg(0,0) { 141 dir=(strand=='-') ? -1 : 1; 142 } 143 void Clear(char strand='+') { 144 lreg.start=0;lreg.end=0; 145 greg.start=0;greg.end=0; 146 dir = (strand=='-') ? -1 : 1;; 147 GVec<GMapSeg>::Clear(); 148 } add(uint s,uint e,uint gs,uint ge)149 int add(uint s, uint e, uint gs, uint ge) { 150 if (dir<0) { 151 if (gs<ge) { 152 Gswap(gs, ge); 153 } 154 if (gs>greg.end) greg.end=gs; 155 if (ge<greg.start || greg.start==0) greg.start=ge; 156 } else { 157 if (ge>greg.end) greg.end=ge; 158 if (gs<greg.start || greg.start==0) greg.start=gs; 159 } 160 GMapSeg gm(s, e, gs, ge); 161 if (gm.end>lreg.end) lreg.end=gm.end; 162 if (gm.start<lreg.start || lreg.start==0) lreg.start=gm.start; 163 return GVec<GMapSeg>::Add(gm); 164 } gmap(uint lc)165 uint gmap(uint lc) { //takes a local coordinate and returns its mapping to genomic coordinates 166 //returns 0 if mapping cannot be performed! 167 if (lc==0 || fCount==0 || lc<lreg.start || lc>lreg.end) return 0; 168 //find local segment containing this coord 169 int i=0; 170 while (i<fCount) { 171 if (lc>=fArray[i].start && lc<=fArray[i].end) 172 return (fArray[i].gstart+dir*(lc-fArray[i].start)); 173 ++i; 174 } 175 return 0; 176 } lmap(uint gc)177 uint lmap(uint gc) { //takes a genome coordinate and returns its mapping to local coordinates 178 if (gc==0 || fCount==0 || gc<greg.start || gc>greg.end) return 0; 179 //find genomic segment containing this coord 180 int i=0; 181 while (i<fCount) { 182 int ofs=fArray[i].g_within(gc); 183 if (ofs!=0) 184 return (fArray[i].start+ofs); 185 ++i; 186 } 187 return 0; 188 } 189 }; 190 191 //reading a whole transcript from a BED-12 line 192 class BEDLine { 193 public: 194 bool skip; 195 char* dupline; //duplicate of original line 196 char* line; //this will have tabs replaced by \0 197 int llen; 198 char* gseqname; 199 uint fstart; 200 uint fend; 201 char strand; 202 char* ID; //transcript ID from BED-12 (4th column) 203 char* info; //13th column - these could be GFF3 attributes 204 uint cds_start; 205 uint cds_end; 206 char cds_phase; 207 GVec<GSeg> exons; 208 BEDLine(GffReader* r=NULL, const char* l=NULL); ~BEDLine()209 ~BEDLine() { 210 GFREE(dupline); 211 GFREE(line); 212 } 213 }; 214 215 class GffLine { 216 protected: 217 char* _parents; //stores a copy of the Parent attribute value, 218 //with commas replaced by \0 219 int _parents_len; 220 bool parseSegmentList(GVec<GSeg>& segs, char* str); 221 public: 222 char* dupline; //duplicate of original line 223 char* line; //this will have tabs replaced by \0 224 int llen; 225 char* gseqname; 226 char* track; 227 char* ftype; //feature name: mRNA/gene/exon/CDS 228 int ftype_id; 229 char* info; //the last, attributes' field, unparsed 230 uint fstart; 231 uint fend; 232 /* 233 uint qstart; //overlap coords on query, if available 234 uint qend; 235 uint qlen; //query len, if given 236 */ 237 float score; 238 int8_t score_decimals; 239 char strand; 240 union { 241 unsigned int flags; 242 struct { 243 bool is_exonlike:2; //CDS,codon, UTR, exon 244 }; 245 struct { 246 bool is_cds:1; //"cds" or "start/stop_codon" features 247 bool is_exon:1; //"exon" and "utr" features 248 bool is_transcript:1; //if current feature is *RNA or *transcript 249 bool is_gene:1; //current feature is *gene 250 //bool is_gff3:1; //line appears to be in GFF3 format (0=GTF) 251 bool is_gtf_transcript:1; //GTF transcript line with Parents parsed from gene_id 252 bool skipLine:1; 253 bool gffWarnings:1; 254 bool is_gene_segment:1; //for NCBI's D/J/V/C_gene_segment 255 }; 256 }; 257 int8_t exontype; // gffExonType 258 char phase; // '.' , '0', '1' or '2', can be also given as CDSphase attribute in TLF 259 uint cds_start; //if TLF: CDS=start:end attribute 260 uint cds_end; 261 GVec<GSeg> exons; //if TLF: exons= attribute 262 GVec<GSeg> cdss; //if TLF: CDS=segment_list attribute 263 char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of a gene feature (GFF3) 264 char* gene_id; //GTF only: value of "gene_id" attribute if present 265 char** parents; //for GTF only parents[0] is used 266 int num_parents; 267 char* ID; // if a ID=.. attribute was parsed, or a GTF with 'transcript' line (transcript_id) 268 GffLine(GffReader* reader, const char* l); //parse the line accordingly discardParent()269 void discardParent() { 270 GFREE(_parents); 271 _parents_len=0; 272 num_parents=0; 273 GFREE(parents); 274 parents=NULL; 275 } 276 void ensembl_GFF_ID_process(char*& id); 277 void ensembl_GTF_ID_process(char*& id, const char* ver_attr); 278 static char* extractGFFAttr(char*& infostr, const char* oline, const char* pre, bool caseStrict=false, 279 bool enforce_GTF2=false, int* rlen=NULL, bool deleteAttr=true); 280 char* extractAttr(const char* pre, bool caseStrict=false, bool enforce_GTF2=false, int* rlen=NULL){ 281 return extractGFFAttr(info, dupline, pre, caseStrict, enforce_GTF2, rlen, true); 282 } 283 char* getAttrValue(const char* pre, bool caseStrict=false, bool enforce_GTF2=false, int* rlen=NULL) { 284 return extractGFFAttr(info, dupline, pre, caseStrict, enforce_GTF2, rlen, false); 285 } GffLine(GffLine & l)286 GffLine(GffLine& l): _parents(NULL), _parents_len(l._parents_len), 287 dupline(NULL), line(NULL), llen(l.llen), gseqname(NULL), track(NULL), 288 ftype(NULL), ftype_id(l.ftype_id), info(NULL), fstart(l.fstart), fend(l.fend), 289 //qstart(l.fstart), qend(l.fend), qlen(l.qlen), 290 score(l.score), score_decimals(l.score_decimals), strand(l.strand), flags(l.flags), exontype(l.exontype), 291 phase(l.phase), cds_start(l.cds_start), cds_end(l.cds_end), exons(l.exons), cdss(l.cdss), 292 gene_name(NULL), gene_id(NULL), parents(NULL), num_parents(l.num_parents), ID(NULL) { 293 //if (l==NULL || l->line==NULL) 294 // GError("Error: invalid GffLine(l)\n"); 295 //memcpy((void*)this, (void*)l, sizeof(GffLine)); 296 GMALLOC(line, llen+1); 297 memcpy(line, l.line, llen+1); 298 GMALLOC(dupline, llen+1); 299 memcpy(dupline, l.dupline, llen+1); 300 //--offsets within line[] 301 gseqname=line+(l.gseqname-l.line); 302 track=line+(l.track-l.line); 303 ftype=line+(l.ftype-l.line); 304 info=line+(l.info-l.line); 305 if (num_parents>0 && parents) { 306 GMALLOC(parents, num_parents*sizeof(char*)); 307 //_parents_len=l->_parents_len; copied above 308 _parents=NULL; //re-init, forget pointer copy 309 GMALLOC(_parents, _parents_len); 310 memcpy(_parents, l._parents, _parents_len); 311 for (int i=0;i<num_parents;i++) { 312 parents[i]=_parents+(l.parents[i] - l._parents); 313 } 314 } 315 //-- allocated string copies: 316 ID=Gstrdup(l.ID); 317 if (l.gene_name!=NULL) 318 gene_name=Gstrdup(l.gene_name); 319 if (l.gene_id!=NULL) 320 gene_id=Gstrdup(l.gene_id); 321 } GffLine()322 GffLine(): _parents(NULL), _parents_len(0), 323 dupline(NULL), line(NULL), llen(0), gseqname(NULL), track(NULL), 324 ftype(NULL), ftype_id(-1), info(NULL), fstart(0), fend(0), //qstart(0), qend(0), qlen(0), 325 score(0), score_decimals(-1), strand(0), flags(0), exontype(0), phase(0), cds_start(0), cds_end(0), 326 exons(), cdss(), gene_name(NULL), gene_id(NULL), parents(NULL), num_parents(0), ID(NULL) { 327 } ~GffLine()328 ~GffLine() { 329 GFREE(dupline); 330 GFREE(line); 331 GFREE(_parents); 332 GFREE(parents); 333 GFREE(ID); 334 GFREE(gene_name); 335 GFREE(gene_id); 336 } 337 }; 338 339 class GffAttr { 340 public: 341 union { 342 int id_full; 343 struct { 344 bool cds:1; 345 int attr_id:31; 346 }; 347 }; 348 char* attr_val; 349 GffAttr(int an_id, const char* av=NULL, bool is_cds=false):id_full(0), attr_val(NULL) { 350 attr_id=an_id; 351 setValue(av, is_cds); 352 } ~GffAttr()353 ~GffAttr() { 354 GFREE(attr_val); 355 } 356 void setValue(const char* av, bool is_cds=false) { 357 if (attr_val!=NULL) { 358 GFREE(attr_val); 359 } 360 if (av==NULL || av[0]==0) return; 361 //trim spaces 362 const char* vstart=av; 363 while (*vstart==' ') av++; 364 const char* vend=vstart; 365 bool keep_dq=false; 366 while (vend[1]!=0) { 367 if (*vend==' ' && vend[1]!=' ') keep_dq=true; 368 else if (*vend==';') keep_dq=true; 369 vend++; 370 } 371 //remove spaces at the end: 372 while (*vend==' ' && vend!=vstart) vend--; 373 //practical clean-up: if it doesn't have any internal spaces just strip those useless double quotes 374 if (!keep_dq && *vstart=='"' && *vend=='"') { 375 vend--; 376 vstart++; 377 } 378 attr_val=Gstrdup(vstart, vend); 379 cds=is_cds; 380 } 381 bool operator==(GffAttr& d){ 382 return (this==&d); 383 } 384 bool operator>(GffAttr& d){ 385 return (this>&d); 386 } 387 bool operator<(GffAttr& d){ 388 return (this<&d); 389 } 390 391 }; 392 393 class GffNameList; 394 class GffNames; 395 396 class GffNameInfo { 397 friend class GffNameList; 398 public: 399 int idx; 400 char* name; 401 GffNameInfo(const char* n=NULL):idx(-1),name(NULL) { 402 if (n) name=Gstrdup(n); 403 } 404 ~GffNameInfo()405 ~GffNameInfo() { 406 GFREE(name); 407 } 408 409 bool operator==(GffNameInfo& d){ 410 return (strcmp(this->name, d.name)==0); 411 } 412 bool operator<(GffNameInfo& d){ 413 return (strcmp(this->name, d.name)<0); 414 } 415 }; 416 417 class GffNameList:public GPVec<GffNameInfo> { 418 friend class GffNameInfo; 419 friend class GffNames; 420 protected: 421 GHashMap<const char*, GffNameInfo*> byName;//hash with shared keys 422 int idlast; //fList index of last added/reused name addStatic(const char * tname)423 int addStatic(const char* tname) {// fast add 424 GffNameInfo* f=new GffNameInfo(tname); 425 idlast=this->Add(f); 426 f->idx=idlast; 427 byName.Add(f->name,f); 428 return idlast; 429 } 430 public: 431 //GffNameList(int init_capacity=6):GList<GffNameInfo>(init_capacity, false,true,true), byName(false) { 432 GffNameList(int init_capacity=6):GPVec<GffNameInfo>(init_capacity, true), byName(false) { 433 idlast=-1; 434 setCapacity(init_capacity); 435 } lastNameUsed()436 char* lastNameUsed() { return idlast<0 ? NULL : Get(idlast)->name; } lastNameId()437 int lastNameId() { return idlast; } getName(int nid)438 char* getName(int nid) { //retrieve name by its ID 439 if (nid<0 || nid>=fCount) 440 GError("GffNameList Error: invalid index (%d)\n",nid); 441 return fList[nid]->name; 442 } 443 addName(const char * tname)444 int addName(const char* tname) {//returns or create an id for the given name 445 //check idlast first, chances are it's the same feature name checked 446 /*if (idlast>=0 && strcmp(fList[idlast]->name,tname)==0) 447 return idlast;*/ 448 GffNameInfo* f=byName.Find(tname); 449 int fidx=-1; 450 if (f!=NULL) fidx=f->idx; 451 else {//add new entry 452 f=new GffNameInfo(tname); 453 fidx=this->Add(f); 454 f->idx=fidx; 455 byName.Add(f->name,f); 456 } 457 idlast=fidx; 458 return fidx; 459 } 460 addNewName(const char * tname)461 int addNewName(const char* tname) { 462 GffNameInfo* f=new GffNameInfo(tname); 463 int fidx=this->Add(f); 464 f->idx=fidx; 465 byName.Add(f->name,f); 466 return fidx; 467 } 468 getId(const char * tname)469 int getId(const char* tname) { //only returns a name id# if found 470 GffNameInfo* f=byName.Find(tname); 471 if (f==NULL) return -1; 472 return f->idx; 473 } removeName()474 int removeName() { 475 GError("Error: removing names from GffNameList not allowed!\n"); 476 return -1; 477 } 478 }; 479 480 class GffNames { 481 public: 482 int numrefs; 483 GffNameList tracks; 484 GffNameList gseqs; 485 GffNameList attrs; 486 GffNameList feats; //feature names: 'mRNA', 'exon', 'CDS' etc. GffNames()487 GffNames():tracks(),gseqs(),attrs(), feats() { 488 numrefs=0; 489 //the order below is critical! 490 //has to match: gff_fid_mRNA, gff_fid_exon, gff_fid_CDS 491 gff_fid_mRNA = feats.addStatic("mRNA");//index 0=gff_fid_mRNA 492 gff_fid_transcript=feats.addStatic("transcript");//index 1=gff_fid_transcript 493 gff_fid_exon=feats.addStatic("exon");//index 2=gff_fid_exon 494 gff_fid_CDS=feats.addStatic("CDS"); //index 3=gff_fid_CDS 495 } 496 }; 497 498 void gffnames_ref(GffNames* &n); 499 void gffnames_unref(GffNames* &n); 500 501 enum GffPrintMode { 502 pgtfAny, //print record as read, if GTF 503 pgtfExon, //print only exon features (CDS converted to exon if exons are missing) 504 pgtfCDS, //print only CDS features 505 pgtfBoth, //print both CDS and exon features 506 pgffAny, //print record as read (if isCDSonly() prints only CDS) 507 pgffExon, 508 pgffCDS, 509 pgffBoth, //enforce exon printing if isCDSOnly() 510 pgffTLF, //exon and CDS data shown as additional GFF attributes 511 //in the transcript line (Transcript Line Format) 512 //every line has the whole transcript data 513 pgffBED //print a BED line with all other GFF attributes in column 13 514 }; 515 516 517 class GffAttrs:public GList<GffAttr> { 518 public: GffAttrs()519 GffAttrs():GList<GffAttr>(false,true,true) { } add_if_new(GffNames * names,const char * attrname,const char * attrval)520 void add_if_new(GffNames* names, const char* attrname, const char* attrval) { 521 //adding a new value without checking for cds status 522 int nid=names->attrs.getId(attrname); 523 if (nid>=0) { //attribute name found in the dictionary 524 for (int i=0;i<Count();i++) 525 if (nid==Get(i)->attr_id) { return; } //don't update existing 526 } 527 else { //adding attribute name to global attr name dictionary 528 nid=names->attrs.addNewName(attrname); 529 } 530 this->Add(new GffAttr(nid, attrval)); 531 } 532 add_if_new(GffNames * names,const char * attrname,const char * attrval,bool is_cds)533 void add_if_new(GffNames* names, const char* attrname, const char* attrval, bool is_cds) { 534 int nid=names->attrs.getId(attrname); 535 if (nid>=0) { //attribute name found in the dictionary 536 for (int i=0;i<Count();i++) 537 if (nid==Get(i)->attr_id && is_cds==Get(i)->cds) { return; } //don't update existing 538 } 539 else { //adding attribute name to global attr name dictionary 540 nid=names->attrs.addNewName(attrname); 541 } 542 this->Add(new GffAttr(nid, attrval, is_cds)); 543 } 544 add_or_update(GffNames * names,const char * attrname,const char * val)545 void add_or_update(GffNames* names, const char* attrname, const char* val) { 546 //adding a new value without checking for cds status 547 int aid=names->attrs.getId(attrname); 548 if (aid>=0) { 549 //attribute found in the dictionary 550 for (int i=0;i<Count();i++) { 551 //do we have it? 552 if (aid==Get(i)->attr_id) { 553 //update the existing value for this attribute 554 Get(i)->setValue(val); 555 return; 556 } 557 } 558 } 559 else { //adding attribute name to global attr name dictionary 560 aid=names->attrs.addNewName(attrname); 561 } 562 this->Add(new GffAttr(aid, val)); 563 } 564 add_or_update(GffNames * names,const char * attrname,const char * val,bool is_cds)565 void add_or_update(GffNames* names, const char* attrname, const char* val, bool is_cds) { 566 int aid=names->attrs.getId(attrname); 567 if (aid>=0) { 568 //attribute found in the dictionary 569 for (int i=0;i<Count();i++) { 570 //do we have it? 571 if (aid==Get(i)->attr_id && Get(i)->cds==is_cds) { 572 //update the existing value for this attribute 573 Get(i)->setValue(val, is_cds); 574 return; 575 } 576 } 577 } 578 else { //adding attribute name to global attr name dictionary 579 aid=names->attrs.addNewName(attrname); 580 } 581 this->Add(new GffAttr(aid, val, is_cds)); 582 } 583 584 int haveId(int attr_id, bool is_cds=false) { 585 for (int i=0;i<Count();i++) 586 if (attr_id==Get(i)->attr_id && Get(i)->cds==is_cds) 587 return i; 588 return -1; 589 } 590 591 int haveId(const char* attrname, GffNames* names, bool is_cds=false) { 592 int aid=names->attrs.getId(attrname); 593 if (aid>=0) { 594 for (int i=0;i<Count();i++) 595 if (aid==Get(i)->attr_id && Get(i)->cds==is_cds) 596 return i; 597 } 598 return -1; 599 } 600 getAttr(GffNames * names,const char * attrname)601 char* getAttr(GffNames* names, const char* attrname) { 602 int aid=names->attrs.getId(attrname); 603 if (aid>=0) 604 for (int i=0;i<Count();i++) 605 if (aid==Get(i)->attr_id) return Get(i)->attr_val; 606 return NULL; 607 } 608 getAttr(GffNames * names,const char * attrname,bool is_cds)609 char* getAttr(GffNames* names, const char* attrname, bool is_cds) { 610 int aid=names->attrs.getId(attrname); 611 if (aid>=0) 612 for (int i=0;i<Count();i++) 613 if (aid==Get(i)->attr_id && Get(i)->cds==is_cds) return Get(i)->attr_val; 614 return NULL; 615 } 616 getAttr(int aid)617 char* getAttr(int aid) { 618 if (aid>=0) 619 for (int i=0;i<Count();i++) 620 if (aid==Get(i)->attr_id) return Get(i)->attr_val; 621 return NULL; 622 } 623 getAttr(int aid,bool is_cds)624 char* getAttr(int aid, bool is_cds) { 625 if (aid>=0) 626 for (int i=0;i<Count();i++) 627 if (aid==Get(i)->attr_id && Get(i)->cds==is_cds) 628 return Get(i)->attr_val; 629 return NULL; 630 } 631 632 void copyAttrs(GffAttrs* attrs, bool is_cds=false) { 633 //deep copy attributes from another GffAttrs list 634 // (only the ones which do not exist yet) 635 if (attrs==NULL) return; 636 for (int i=0;i<attrs->Count();i++) { 637 int aid=attrs->Get(i)->attr_id; 638 if (haveId(aid, is_cds)<0) 639 Add(new GffAttr(aid, attrs->Get(i)->attr_val, is_cds)); 640 } 641 } 642 }; 643 644 class GffExon : public GSeg { 645 public: 646 bool sharedAttrs; //do not free attrs on destruct! 647 GffAttrs* attrs; //other attributes kept for this exon/CDS 648 GffScore score; // gff score column 649 int8_t exontype; 650 char phase; //GFF phase column - for CDS segments only! 651 // '.' = undefined (UTR), '0','1','2' for CDS exons 652 void* uptr; //for associating extended user data to this exon 653 getAttr(GffNames * names,const char * atrname)654 char* getAttr(GffNames* names, const char* atrname) { 655 if (attrs==NULL || names==NULL || atrname==NULL) return NULL; 656 return attrs->getAttr(names, atrname); 657 } 658 getAttr(int aid)659 char* getAttr(int aid) { 660 if (attrs==NULL) return NULL; 661 return attrs->getAttr(aid); 662 } GffExon(bool share_attributes)663 GffExon(bool share_attributes):GSeg(0,0), sharedAttrs(share_attributes), attrs(NULL), score(), 664 exontype(0), phase('.'), uptr(NULL){ 665 } sharedAttrs(false)666 GffExon(uint s=0, uint e=0, int8_t et=0, char ph='.', float sc=0, int8_t sc_prec=0):sharedAttrs(false), attrs(NULL), 667 score(sc,sc_prec), exontype(et), phase(ph), uptr(NULL) { 668 if (s<e) { start=s; end=e; } 669 else { start=e; end=s; } 670 671 } //constructor 672 673 GffExon(const GffExon & ex)674 GffExon(const GffExon& ex):GSeg(ex.start, ex.end) { //copy constructor 675 (*this)=ex; //use the default (shallow!) copy operator 676 if (ex.attrs!=NULL) { //make a deep copy here 677 attrs=new GffAttrs(); 678 attrs->copyAttrs(ex.attrs); 679 } 680 } 681 682 GffExon& operator=(const GffExon& o) = default; //prevent gcc 9 warnings: 683 //yes, I want a shallow copy here 684 ~GffExon()685 ~GffExon() { //destructor 686 if (attrs!=NULL && !sharedAttrs) delete attrs; 687 } 688 689 }; 690 691 //only for mapping to spliced coding sequence: 692 class GffCDSeg:public GSeg { 693 public: 694 char phase; 695 int exonidx; 696 }; 697 698 //one GFF mRNA object -- e.g. a mRNA with its exons and/or CDS segments 699 class GffObj:public GSeg { 700 protected: 701 char* gffID; // ID name for mRNA (parent) feature 702 char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of the parent gene feature (GFF3) 703 char* geneID; //value of gene_id attribute (GTF) if present, or the ID attribute of a parent gene feature (GFF3) 704 union { 705 unsigned int flags; 706 struct { 707 bool flag_HAS_ERRORS :1; 708 bool flag_CHILDREN_PROMOTED :1; 709 bool flag_IS_GENE :1; 710 bool flag_IS_TRANSCRIPT :1; 711 bool flag_HAS_GFF_ID :1; //found transcript/RNA feature line (GFF3 or GTF2 with transcript line) 712 bool flag_BY_EXON :1; //created by subfeature (exon/CDS) directly 713 bool flag_CDS_ONLY :1; //transcript defined by CDS features only (GffObj::isCDS()) 714 bool flag_CDS_NOSTART :1; //partial CDS at 5' end (no start codon) 715 bool flag_CDS_NOSTOP :1; //partial CDS at 3' end (no stop codon) 716 bool flag_CDS_X :1; //transcript having CDS with ribosomal shift (i.e. after merging exons) 717 //CDS segments stored in ::cdss are incompatible with the exon segments 718 bool flag_GENE_SEGMENT :1; //a transcript-like C/D/J/V_gene_segment (NCBI's annotation) 719 bool flag_TRANS_SPLICED :1; 720 bool flag_DISCONTINUOUS :1; //discontinuous feature (e.g. cDNA_match) segments linked by same ID 721 bool flag_TARGET_ONLY :1; //Target= feature (e.g. from RepeatMasker output), lacks ID 722 bool flag_DISCARDED :1; //it will be discarded from the final GffReader list 723 bool flag_LST_KEEP :1; //controlled by isUsed(); if set, this GffObj will not be 724 //deallocated when GffReader is destroyed 725 bool flag_FINALIZED :1; //if finalize() was already called for this GffObj 726 unsigned int gff_level :4; //hierarchical level (0..15) 727 }; 728 }; 729 //-- friends: 730 friend class GffReader; 731 friend class GffExon; 732 public: 733 static GffNames* names; // dictionary storage that holds the various attribute names etc. 734 int track_id; // index of track name in names->tracks 735 int gseq_id; // index of genomic sequence name in names->gseqs 736 int ftype_id; // index of this record's feature name in names->feats, or the special gff_fid_mRNA value 737 int subftype_id; //index of child subfeature name in names->feats (subfeatures stored in "exons") 738 //if ftype_id==gff_fid_mRNA then this value is ignored 739 GList<GffExon> exons; //for non-mRNA entries, these can be any subfeature of type subftype_id 740 GList<GffExon>* cdss; //only !NULL for cases of "programmed frameshift" when CDS boundaries do not match 741 //exons boundaries 742 GPVec<GffObj> children; 743 GffObj* parent; 744 int udata; //user data, flags etc. 745 void* uptr; //user pointer (to a parent object, cluster, locus etc.) 746 GffObj* ulink; //link to another GffObj (user controlled field) 747 //---mRNA specific fields: 748 //bool isCDS; //just a CDS, no UTRs 749 uint CDstart; //CDS lowest coordinate 750 uint CDend; //CDS highest coordinate 751 char CDphase; //initial phase for CDS start ('.','0'..'2') 752 //CDphase is at CDend if strand=='-' 753 static void decodeHexChars(char* dbuf, const char* s, int maxlen=1023); hasErrors()754 bool hasErrors() { return flag_HAS_ERRORS; } hasErrors(bool v)755 void hasErrors(bool v) { flag_HAS_ERRORS=v; } hasGffID()756 bool hasGffID() { return flag_HAS_GFF_ID; } hasGffID(bool v)757 void hasGffID(bool v) {flag_HAS_GFF_ID=v; } createdByExon()758 bool createdByExon() { return flag_BY_EXON; } createdByExon(bool v)759 void createdByExon(bool v) {flag_BY_EXON=v; } isCDSOnly()760 bool isCDSOnly() { return flag_CDS_ONLY; } isCDSOnly(bool v)761 void isCDSOnly(bool v) { flag_CDS_ONLY=v; } isXCDS()762 bool isXCDS() { return flag_CDS_X; } isXCDS(bool v)763 void isXCDS(bool v) { flag_CDS_X=v; } isFinalized()764 bool isFinalized() { return flag_FINALIZED; } isFinalized(bool v)765 void isFinalized(bool v) { flag_FINALIZED=v; } 766 isGene()767 bool isGene() { return flag_IS_GENE; } isGene(bool v)768 void isGene(bool v) {flag_IS_GENE=v; } isDiscarded()769 bool isDiscarded() { return flag_DISCARDED; } isDiscarded(bool v)770 void isDiscarded(bool v) { flag_DISCARDED=v; } isUsed()771 bool isUsed() { return flag_LST_KEEP; } isUsed(bool v)772 void isUsed(bool v) {flag_LST_KEEP=v; } isTranscript()773 bool isTranscript() { return flag_IS_TRANSCRIPT; } isTranscript(bool v)774 void isTranscript(bool v) {flag_IS_TRANSCRIPT=v; } isGeneSegment()775 bool isGeneSegment() { return flag_GENE_SEGMENT; } isGeneSegment(bool v)776 void isGeneSegment(bool v) {flag_GENE_SEGMENT=v; } promotedChildren()777 bool promotedChildren() { return flag_CHILDREN_PROMOTED; } promotedChildren(bool v)778 void promotedChildren(bool v) { flag_CHILDREN_PROMOTED=v; } setLevel(byte v)779 void setLevel(byte v) { gff_level=v; } getLevel()780 byte getLevel() { return gff_level; } incLevel()781 byte incLevel() { gff_level++; return gff_level; } 782 isValidTranscript()783 bool isValidTranscript() { 784 //return (ftype_id==gff_fid_mRNA && exons.Count()>0); 785 return (isTranscript() && exons.Count()>0); 786 } 787 788 //return the index of exon containing coordinate coord, or -1 if not 789 int whichExon(uint coord, GList<GffExon>* segs=NULL); 790 int readExon(GffReader& reader, GffLine& gl); 791 792 int addExon(GList<GffExon>& segs, GffLine& gl, int8_t exontype_override=exgffNone); //add to cdss or exons 793 794 int addExon(uint segstart, uint segend, int8_t exontype=exgffNone, char phase='.', 795 GffScore exon_score=GFFSCORE_NONE, GList<GffExon>* segs=NULL); 796 797 protected: 798 bool reduceExonAttrs(GList<GffExon>& segs); 799 //utility segment-merging function for addExon() 800 void expandSegment(GList<GffExon>&segs, int oi, uint segstart, uint segend, 801 int8_t exontype); 802 bool processGeneSegments(GffReader* gfr); //for genes that have _gene_segment features (NCBI annotation) 803 void transferCDS(GffExon* cds); 804 public: 805 void removeExon(int idx); 806 void removeExon(GffExon* p); 807 char strand; //'+', '-' or '.' 808 GffScore gscore; 809 int covlen; //total coverage of reference genomic sequence (sum of maxcf segment lengths) 810 GffAttrs* attrs; //other gff3 attributes found for the main mRNA feature 811 //constructor by gff line parsing: 812 GffObj(GffReader& gfrd, BEDLine& bedline); 813 GffObj(GffReader& gfrd, GffLine& gffline); 814 //if gfline->Parent!=NULL then this will also add the first sub-feature 815 // otherwise, only the main feature is created 816 void copyAttrs(GffObj* from); clearAttrs()817 void clearAttrs() { 818 if (attrs!=NULL) { 819 bool sharedattrs=(exons.Count()>0 && exons[0]->attrs==attrs); 820 delete attrs; attrs=NULL; 821 if (sharedattrs) exons[0]->attrs=NULL; 822 } 823 } 824 GffObj(char* anid=NULL):GSeg(0,0), exons(true,true,false), cdss(NULL), children(1,false), gscore() { 825 //exons: sorted, free, non-unique 826 gffID=NULL; 827 uptr=NULL; 828 ulink=NULL; 829 flags=0; 830 udata=0; 831 parent=NULL; 832 ftype_id=-1; 833 subftype_id=-1; 834 if (anid!=NULL) gffID=Gstrdup(anid); 835 gffnames_ref(names); 836 CDstart=0; // hasCDS <=> CDstart>0 837 CDend=0; 838 CDphase=0; 839 gseq_id=-1; 840 track_id=-1; 841 strand='.'; 842 attrs=NULL; 843 covlen=0; 844 geneID=NULL; 845 gene_name=NULL; 846 } ~GffObj()847 ~GffObj() { 848 GFREE(gffID); 849 GFREE(gene_name); 850 GFREE(geneID); 851 delete cdss; 852 clearAttrs(); 853 gffnames_unref(names); 854 } 855 //-------------- 856 GffObj* finalize(GffReader* gfr); 857 //complete parsing: must be called in order to merge adjacent/close proximity subfeatures 858 void parseAttrs(GffAttrs*& atrlist, char* info, bool isExon=false, bool CDSsrc=false); getSubfName()859 const char* getSubfName() { //returns the generic feature type of the entries in exons array 860 return names->feats.getName(subftype_id); 861 } 862 void setCDS(uint cd_start, uint cd_end, char phase=0); 863 void setCDS(GffObj* t); //set CDS from another transcript 864 monoFeature()865 bool monoFeature() { 866 return (exons.Count()==0 || 867 (exons.Count()==1 && //exon_ftype_id==ftype_id && 868 exons[0]->end==this->end && exons[0]->start==this->start)); 869 } 870 hasCDS()871 bool hasCDS() { return (CDstart>0); } 872 getFeatureName()873 const char* getFeatureName() { 874 return names->feats.getName(ftype_id); 875 } 876 void setFeatureName(const char* feature); 877 878 void addAttr(const char* attrname, const char* attrvalue); 879 int removeAttr(const char* attrname, const char* attrval=NULL); 880 int removeAttr(int aid, const char* attrval=NULL); 881 int removeAttrs(GStrSet<>& attrSet); //remove attributes whose names are NOT in attrSet 882 int removeExonAttr(GffExon& exon, const char* attrname, const char* attrval=NULL); 883 int removeExonAttr(GffExon& exon, int aid, const char* attrval=NULL); 884 getAttrName(int i)885 const char* getAttrName(int i) { 886 if (attrs==NULL) return NULL; 887 return names->attrs.getName(attrs->Get(i)->attr_id); 888 } 889 890 char* getAttr(const char* attrname, bool checkFirstExon=false) { 891 if (names==NULL || attrname==NULL) return NULL; 892 char* r=NULL; 893 if (attrs==NULL) { 894 if (!checkFirstExon) return NULL; 895 } else 896 r=attrs->getAttr(names, attrname); 897 if (r!=NULL) return r; 898 if (checkFirstExon && exons.Count()>0) { 899 r=exons.First()->getAttr(names, attrname); 900 } 901 return r; 902 } 903 getExonAttr(GffExon * exon,const char * attrname)904 char* getExonAttr(GffExon* exon, const char* attrname) { 905 if (exon==NULL || attrname==NULL) return NULL; 906 return exon->getAttr(names, attrname); 907 } 908 getExonAttr(int exonidx,const char * attrname)909 char* getExonAttr(int exonidx, const char* attrname) { 910 if (exonidx<0 || exonidx>=exons.Count() || attrname==NULL) return NULL; 911 return exons[exonidx]->getAttr(names, attrname); 912 } 913 getAttrValue(int i)914 char* getAttrValue(int i) { 915 if (attrs==NULL) return NULL; 916 return attrs->Get(i)->attr_val; 917 } getGSeqName()918 const char* getGSeqName() { 919 return names->gseqs.getName(gseq_id); 920 } 921 getRefName()922 const char* getRefName() { 923 return names->gseqs.getName(gseq_id); 924 } 925 void setRefName(const char* newname); 926 getTrackName()927 const char* getTrackName() { 928 return names->tracks.getName(track_id); 929 } exonOverlap(uint s,uint e)930 bool exonOverlap(uint s, uint e) {//check if ANY exon overlaps given segment 931 //ignores strand! 932 if (s>e) Gswap(s,e); 933 for (int i=0;i<exons.Count();i++) { 934 if (exons[i]->overlap(s,e)) return true; 935 } 936 return false; 937 } exonOverlap(GffObj & m)938 bool exonOverlap(GffObj& m) {//check if ANY exon overlaps given segment 939 //if (gseq_id!=m.gseq_id) return false; 940 // ignores strand and gseq_id, must check in advance 941 for (int i=0;i<exons.Count();i++) { 942 for (int j=0;j<m.exons.Count();j++) { 943 if (exons[i]->start>m.exons[j]->end) continue; 944 if (m.exons[j]->start>exons[i]->end) break; 945 //-- overlap if we are here: 946 return true; 947 } 948 } 949 return false; 950 } 951 952 int exonOverlapIdx(GList<GffExon>& segs, uint s, uint e, int* ovlen=NULL, int start_idx=0); 953 exonOverlapLen(GffObj & m)954 int exonOverlapLen(GffObj& m) { 955 if (start>m.end || m.start>end) return 0; 956 int i=0; 957 int j=0; 958 int ovlen=0; 959 while (i<exons.Count() && j<m.exons.Count()) { 960 uint istart=exons[i]->start; 961 uint iend=exons[i]->end; 962 uint jstart=m.exons[j]->start; 963 uint jend=m.exons[j]->end; 964 if (istart>jend) { j++; continue; } 965 if (jstart>iend) { i++; continue; } 966 //exon overlap 967 uint ovstart=GMAX(istart,jstart); 968 if (iend<jend) { 969 ovlen+=iend-ovstart+1; 970 i++; 971 } 972 else { 973 ovlen+=jend-ovstart+1; 974 j++; 975 } 976 }//while comparing exons 977 return ovlen; 978 } 979 exonOverlap(GffObj * m)980 bool exonOverlap(GffObj* m) { 981 return exonOverlap(*m); 982 } 983 984 //--------------------- 985 bool operator==(GffObj& d){ 986 return (gseq_id==d.gseq_id && start==d.start && end==d.end && strcmp(gffID, d.gffID)==0); 987 } 988 bool operator>(GffObj& d){ 989 if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id); 990 if (start==d.start) { 991 if (getLevel()==d.getLevel()) { 992 if (end==d.end) return (strcmp(gffID, d.gffID)>0); 993 else return (end>d.end); 994 } else return (getLevel()>d.getLevel()); 995 } else return (start>d.start); 996 } 997 bool operator<(GffObj& d){ 998 if (gseq_id!=d.gseq_id) return (gseq_id<d.gseq_id); 999 if (start==d.start) { 1000 if (getLevel()==d.getLevel()) { 1001 if (end==d.end) return strcmp(gffID, d.gffID)<0; 1002 else return end<d.end; 1003 } else return (getLevel()<d.getLevel()); 1004 } else return (start<d.start); 1005 } getID()1006 char* getID() { return gffID; } 1007 getGeneID()1008 char* getGeneID() { return geneID; } getGeneName()1009 char* getGeneName() { return gene_name; } setGeneName(const char * gname)1010 void setGeneName(const char* gname) { 1011 GFREE(gene_name); 1012 if (gname) gene_name=Gstrdup(gname); 1013 } setGeneID(const char * gene_id)1014 void setGeneID(const char* gene_id) { 1015 GFREE(geneID); 1016 if (gene_id) geneID=Gstrdup(gene_id); 1017 } 1018 int addSeg(GffLine* gfline); 1019 int addSeg(int fnid, GffLine* gfline); 1020 void getCDSegs(GVec<GffExon>& cds); 1021 1022 void updateCDSPhase(GList<GffExon>& segs); //for CDS-only features, updates GffExon::phase 1023 void printGTab(FILE* fout, char** extraAttrs=NULL); 1024 void printGxfExon(FILE* fout, const char* tlabel, const char* gseqname, 1025 bool iscds, GffExon* exon, bool gff3, bool cvtChars); 1026 void printGxf(FILE* fout, GffPrintMode gffp=pgffExon, 1027 const char* tlabel=NULL, const char* gfparent=NULL, bool cvtChars=false); 1028 void printGtf(FILE* fout, const char* tlabel=NULL, bool cvtChars=false) { 1029 printGxf(fout, pgtfAny, tlabel, NULL, cvtChars); 1030 } 1031 void printGff(FILE* fout, const char* tlabel=NULL, 1032 const char* gfparent=NULL, bool cvtChars=false) { 1033 printGxf(fout, pgffAny, tlabel, gfparent, cvtChars); 1034 } 1035 bool printAttrs(FILE* fout, const char* sep=";", bool GTFstyle=false, bool cvtChars=false, bool sepFirst=true); 1036 void printTranscriptGff(FILE* fout, char* tlabel=NULL, 1037 bool showCDS=false, const char* gfparent=NULL, bool cvtChars=false) { 1038 if (isValidTranscript()) 1039 printGxf(fout, showCDS ? pgffBoth : pgffExon, tlabel, gfparent, cvtChars); 1040 } 1041 void printExonList(FILE* fout); //print comma delimited list of exon intervals 1042 void printCDSList(FILE* fout); //print comma delimited list of CDS intervals 1043 1044 void printBED(FILE* fout, bool cvtChars); 1045 //print a BED-12 line + GFF3 attributes in 13th field 1046 void printSummary(FILE* fout=NULL); 1047 1048 char* getSpliced(GFaSeqGet* faseq, bool CDSonly=false, int* rlen=NULL, 1049 uint* cds_start=NULL, uint* cds_end=NULL, GMapSegments* seglst=NULL, 1050 bool cds_open=false); 1051 char* getUnspliced(GFaSeqGet* faseq, int* rlen, GMapSegments* seglst=NULL); 1052 1053 void addPadding(int padLeft, int padRight); //change exons to include this padding on the sides 1054 void removePadding(int padLeft, int padRight); 1055 1056 //bool validCDS(GFaSeqGet* faseq); //has In-Frame Stop Codon ? empty()1057 bool empty() { return (start==0); } 1058 }; 1059 1060 typedef bool GffRecFunc(GffObj* gobj, void* usrptr1, void* usrptr2); 1061 //user callback after parsing a mapping object: 1062 // Returns: "done with it" status: 1063 // TRUE if gobj is no longer needed so it's FREEd upon return 1064 // FALSE if the user needs the gobj pointer and is responsible for 1065 // collecting and freeing all GffObj objects 1066 1067 1068 //GSeqStat: collect basic stats about a common underlying genomic sequence 1069 // for multiple GffObj 1070 class GSeqStat { 1071 public: 1072 int gseqid; //gseq id in the global static pool of gseqs 1073 char* gseqname; //just a pointer to the name of gseq 1074 int fcount;//number of features on this gseq 1075 uint mincoord; 1076 uint maxcoord; 1077 uint maxfeat_len; //maximum feature length on this genomic sequence 1078 GffObj* maxfeat; 1079 GSeqStat(int id=-1, char* name=NULL) { 1080 gseqid=id; 1081 gseqname=name; 1082 fcount=0; 1083 mincoord=MAXUINT; 1084 maxcoord=0; 1085 maxfeat_len=0; 1086 maxfeat=NULL; 1087 } 1088 bool operator>(GSeqStat& g) { 1089 return (gseqid>g.gseqid); 1090 } 1091 bool operator<(GSeqStat& g) { 1092 return (gseqid<g.gseqid); 1093 } 1094 bool operator==(GSeqStat& g) { 1095 return (gseqid==g.gseqid); 1096 } 1097 }; 1098 1099 1100 int gfo_cmpByLoc(const pointer p1, const pointer p2); 1101 int gfo_cmpRefByID(const pointer p1, const pointer p2); 1102 1103 class GfList: public GList<GffObj> { 1104 public: 1105 GfList(bool sorted)1106 GfList(bool sorted):GList<GffObj>(sorted,false,false) { } GfList()1107 GfList():GList<GffObj>(false,false,false) { 1108 //GffObjs in this list are NOT deleted when the list is cleared 1109 //-- for deallocation of these objects, call freeAll() or freeUnused() as needed 1110 } 1111 void finalize(GffReader* gfr); 1112 freeAll()1113 void freeAll() { 1114 for (int i=0;i<fCount;i++) { 1115 delete fList[i]; 1116 fList[i]=NULL; 1117 } 1118 Clear(); 1119 } freeUnused()1120 void freeUnused() { 1121 for (int i=0;i<fCount;i++) { 1122 if (fList[i]->isUsed()) continue; 1123 /*//inform the children? 1124 for (int c=0;c<fList[i]->children.Count();c++) { 1125 fList[i]->children[c]->parent=NULL; 1126 } 1127 */ 1128 delete fList[i]; 1129 fList[i]=NULL; 1130 } 1131 Clear(); 1132 } 1133 }; 1134 1135 class CNonExon { //utility class used in subfeature promotion 1136 public: 1137 //int idx; 1138 GffObj* parent; 1139 GffExon* exon; 1140 GffLine* gffline; 1141 //CNonExon(int i, GffObj* p, GffExon* e, GffLine* gl) { CNonExon(GffObj * p,GffExon * e,GffLine & gl)1142 CNonExon(GffObj* p, GffExon* e, GffLine& gl) { 1143 parent=p; 1144 exon=e; 1145 //idx=i; 1146 gffline=new GffLine(gl); 1147 } ~CNonExon()1148 ~CNonExon() { 1149 delete gffline; 1150 } 1151 }; 1152 1153 1154 class GffReader { 1155 friend class GffObj; 1156 friend class GffLine; 1157 friend class GfList; 1158 char* linebuf; 1159 off_t fpos; 1160 int buflen; 1161 protected: 1162 union { 1163 unsigned int flags; 1164 unsigned int gff_type: 6; 1165 struct { 1166 bool is_gff3: 1; //GFF3 syntax was detected 1167 bool is_gtf:1; //GTF syntax was detected 1168 bool gtf_transcript:1; //has "transcript" features (2-level GTF) 1169 bool gtf_gene:1; //has "gene" features (3-level GTF ..Ensembl?) 1170 bool is_BED:1; //input is BED-12 format, possibly with attributes in 13th field 1171 bool is_TLF:1; //input is GFF3-like Transcript Line Format with exons= attribute 1172 //--other flags 1173 bool transcripts_Only:1; //default ; only keep recognized transcript features 1174 bool keep_Genes:1; //for transcriptsOnly, do not discard genes from gflst 1175 bool keep_Attrs:1; 1176 bool keep_AllExonAttrs:1; //when keep_Attrs, do not attempt to reduce exon attributes 1177 bool noExonAttrs:1; 1178 bool ignoreLocus:1; //discard locus features and attributes from input 1179 bool merge_CloseExons:1; 1180 bool gene2exon:1; 1181 bool sortByLoc:1; //if records should be sorted by location 1182 bool refAlphaSort:1; //if sortByLoc, reference sequences are 1183 // sorted lexically instead of their id# 1184 //Ensembl ID processing: 1185 bool xEnsemblID:1; //for ensemble GTF merge gene_version and transcript_version into the ID 1186 //for ensemble GFF3, cannot merge version (!), just remove "transcript:" and "gene:" prefixes 1187 bool gff_warns:1; 1188 }; 1189 }; 1190 //char* lastReadNext; 1191 FILE* fh; 1192 char* fname; //optional fasta file with the underlying genomic sequence to be attached to this reader 1193 GFFCommentParser* commentParser; 1194 GffLine* gffline; 1195 BEDLine* bedline; 1196 //bool transcriptsOnly; //keep only transcripts w/ their exon/CDS features 1197 //bool gene2exon; // for childless genes: add an exon as the entire gene span 1198 GHash<int> discarded_ids; //for transcriptsOnly mode, keep track 1199 // of discarded parent IDs 1200 GHash< GPVec<GffObj>* > phash; //transcript_id => GPVec<GffObj>(false) 1201 char* gfoBuildId(const char* id, const char* ctg); 1202 //void gfoRemove(const char* id, const char* ctg); 1203 GffObj* gfoAdd(GffObj* gfo); 1204 GffObj* gfoAdd(GPVec<GffObj>& glst, GffObj* gfo); 1205 GffObj* gfoReplace(GPVec<GffObj>& glst, GffObj* gfo, GffObj* toreplace); 1206 // const char* id, const char* ctg, char strand, GVec<GfoHolder>** glst, uint start, uint end 1207 bool pFind(const char* id, GPVec<GffObj>*& glst); 1208 GffObj* gfoFind(const char* id, GPVec<GffObj>* & glst, const char* ctg=NULL, 1209 char strand=0, uint start=0, uint end=0); 1210 CNonExon* subfPoolCheck(GffLine* gffline, GHash<CNonExon*>& pex, char*& subp_name); 1211 void subfPoolAdd(GHash<CNonExon*>& pex, GffObj* newgfo); 1212 GffObj* promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon*>& pex); 1213 1214 #ifdef CUFFLINKS 1215 boost::crc_32_type _crc_result; 1216 #endif 1217 public: 1218 GPVec<GSeqStat> gseqtable; //table with all genomic sequences, but only current GXF gseq ID indices will have non-NULL 1219 //GffNames* names; //just a pointer to the global static Gff names repository 1220 GfList gflst; //keeps track of all GffObj records being read (when readAll() is used) 1221 GffObj* newGffRec(GffLine* gffline, GffObj* parent=NULL, GffExon* pexon=NULL, 1222 GPVec<GffObj>* glst=NULL, bool replace_parent=false); 1223 GffObj* newGffRec(BEDLine* bedline, GPVec<GffObj>* glst=NULL); 1224 //GffObj* replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx); 1225 GffObj* updateGffRec(GffObj* prevgfo, GffLine* gffline); 1226 GffObj* updateParent(GffObj* newgfh, GffObj* parent); 1227 bool readExonFeature(GffObj* prevgfo, GffLine* gffline, GHash<CNonExon*>* pex = NULL); 1228 GPVec<GSeqStat> gseqStats; //populated after finalize() with only the ref seqs in this file linebuf(NULL)1229 GffReader(FILE* f=NULL, bool t_only=false, bool sort=false):linebuf(NULL), fpos(0), 1230 buflen(0), flags(0), fh(f), fname(NULL), commentParser(NULL), gffline(NULL), 1231 bedline(NULL), discarded_ids(true), phash(true), gseqtable(1,true), 1232 gflst(), gseqStats(1, false) { 1233 GMALLOC(linebuf, GFF_LINELEN); 1234 buflen=GFF_LINELEN-1; 1235 gffnames_ref(GffObj::names); 1236 //gff_warns=gff_show_warnings; 1237 transcripts_Only=t_only; 1238 sortByLoc=sort; 1239 noExonAttrs=true; 1240 //lastReadNext=NULL; 1241 } 1242 /* 1243 void init(FILE *f, bool t_only=false, bool sortbyloc=false, bool g2exon=false) { 1244 fname=NULL; 1245 fh=f; 1246 if (fh!=NULL) rewind(fh); 1247 fpos=0; 1248 flags=0; 1249 transcriptsOnly=t_only; 1250 gflst.sortedByLoc(sortbyloc); 1251 gene2exon=g2exon; 1252 } 1253 */ gene2Exon(bool v)1254 void gene2Exon(bool v) { gene2exon=v;} procEnsemblID(bool v)1255 void procEnsemblID(bool v) { xEnsemblID=v;} procEnsemblID()1256 bool procEnsemblID() { return xEnsemblID; } 1257 void enableSorting(bool sorting=true) { sortByLoc=sorting; } getSorting()1258 bool getSorting() { return sortByLoc; } 1259 void isBED(bool v=true) { is_BED=v; } //should be set before any parsing! 1260 void isTLF(bool v=true) { is_TLF=v; } //should be set before any parsing! 1261 void keepAttrs(bool keep_attrs=true, bool discardExonAttrs=true, bool preserve_exon_attrs=false) { 1262 keep_Attrs=keep_attrs; 1263 noExonAttrs=discardExonAttrs; 1264 keep_AllExonAttrs=preserve_exon_attrs; 1265 } transcriptsOnly(bool t_only)1266 void transcriptsOnly(bool t_only) { transcripts_Only=t_only; } transcriptsOnly()1267 bool transcriptsOnly() { return transcripts_Only; } setIgnoreLocus(bool nolocus)1268 void setIgnoreLocus(bool nolocus) { ignoreLocus=nolocus; } keepGenes(bool keep_genes)1269 void keepGenes(bool keep_genes) { 1270 keep_Genes=keep_genes; 1271 } keepGenes()1272 bool keepGenes() { return keep_Genes; } 1273 void mergeCloseExons(bool merge_close_exons=true) { 1274 merge_CloseExons=merge_close_exons; 1275 } showWarnings(bool v)1276 void showWarnings(bool v) { 1277 gff_warns=v; 1278 //gff_show_warnings=v; 1279 } showWarnings()1280 bool showWarnings() { 1281 return gff_warns; 1282 } 1283 void setRefAlphaSorted(bool v=true) { 1284 refAlphaSort=v; 1285 if (v) sortByLoc=true; 1286 } 1287 void setCommentParser(GFFCommentParser* cmParser=NULL) { 1288 commentParser=cmParser; 1289 } 1290 linebuf(NULL)1291 GffReader(const char* fn, bool t_only=false, bool sort=false):linebuf(NULL), fpos(0), 1292 buflen(0), flags(0), fh(NULL), fname(NULL), commentParser(NULL), 1293 gffline(NULL), bedline(NULL), discarded_ids(true), 1294 phash(true), gseqtable(1,true), gflst(), gseqStats(1,false) { 1295 //gff_warns=gff_show_warnings; 1296 gffnames_ref(GffObj::names); 1297 noExonAttrs=true; 1298 transcripts_Only=t_only; 1299 sortByLoc=sort; 1300 fname=Gstrdup(fn); 1301 fh=fopen(fname, "rb"); 1302 GMALLOC(linebuf, GFF_LINELEN); 1303 buflen=GFF_LINELEN-1; 1304 //lastReadNext=NULL; 1305 } 1306 ~GffReader()1307 ~GffReader() { 1308 delete gffline; 1309 gffline=NULL; 1310 fpos=0; 1311 if (fh && fh!=stdin) fclose(fh); 1312 gflst.freeUnused(); 1313 gflst.Clear(); 1314 discarded_ids.Clear(); 1315 phash.Clear(); 1316 GFREE(fname); 1317 GFREE(linebuf); 1318 //GFREE(lastReadNext); 1319 gffnames_unref(GffObj::names); 1320 } 1321 1322 1323 GffLine* nextGffLine(); 1324 BEDLine* nextBEDLine(); 1325 1326 // load all subfeatures, re-group them: 1327 void readAll(); 1328 void readAll(bool keepAttr, bool mergeCloseExons=false, bool noExonAttr=true) { 1329 this->keep_Attrs=keepAttr; 1330 this->merge_CloseExons=mergeCloseExons; 1331 this->noExonAttrs=noExonAttr; 1332 readAll(); 1333 } 1334 1335 1336 //only for well-formed files: BED or GxF where exons are strictly grouped by their transcript_id/Parent 1337 GffObj* readNext(); //user must free the returned GffObj* ! 1338 1339 #ifdef CUFFLINKS current_crc_result()1340 boost::crc_32_type current_crc_result() const { return _crc_result; } 1341 #endif 1342 1343 }; // end of GffReader 1344 1345 // ---------------------------------------------------------- 1346 // -- auxiliary classes for GffObj::processGeneSegments() -- 1347 class GSegMatch { //keep track of "matching" overlaps of a GeneCDSChain with multiple GeneSegment containers 1348 public: 1349 int child_idx; //index of matching _gene_segment GffObj in gene->children[] list 1350 int noncov; //number of "non-covered" bases in the GeneSegment 1351 int gsegidx; //index of _gene_segment in GVec<int> geneSegs 1352 // (i.e. UTRs + implied introns if exons are missing) 1353 bool operator<(GSegMatch& o) { return (noncov<o.noncov); } 1354 bool operator==(GSegMatch& o) { return (noncov==o.noncov); } child_idx(cidx)1355 GSegMatch(int cidx=-1, int ncov=-1, int gsidx=-1):child_idx(cidx), 1356 noncov(ncov), gsegidx(gsidx) { } 1357 }; 1358 1359 class GeneCDS: public GSeg { 1360 public: 1361 int idx; //index of this CDS entry in this gene->cdss[] list GSeg(cstart,cend)1362 GeneCDS(int i=-1, uint cstart=0, uint cend=0):GSeg(cstart, cend), idx(i) { 1363 } 1364 }; 1365 1366 class GeneCDSChain: public GSeg { //keep track of CDS chains of the gene and their boundaries 1367 public: 1368 GVec<GeneCDS> cdsList; //all CDSs in this chain 1369 GArray<GSegMatch> mxs; //list of "matching" container X_gene_segment transcripts; GeneCDSChain()1370 GeneCDSChain():cdsList(),mxs() { } GeneCDSChain(int idx,uint cstart,uint cend)1371 GeneCDSChain(int idx, uint cstart, uint cend):GSeg(cstart, cend), 1372 cdsList(),mxs(true) { 1373 addCDS(idx, cstart, cend); 1374 1375 } addCDS(int idx,uint cstart,uint cend)1376 void addCDS(int idx, uint cstart, uint cend) { 1377 GeneCDS cds(idx, cstart, cend); 1378 cdsList.Add(cds); 1379 expandInclude(cstart, cend); 1380 } addMatch(int childidx,int ncov,int gsegidx)1381 void addMatch(int childidx, int ncov, int gsegidx) { 1382 GSegMatch segmatch(childidx, ncov, gsegidx); 1383 mxs.Add(segmatch); 1384 } singleExonCDSMatch(uint tstart,uint tend,int & ncov)1385 bool singleExonCDSMatch(uint tstart, uint tend, int& ncov) { 1386 if (start>=tstart && end<=tend) { 1387 ncov=start-tstart + tend-end; 1388 //add all CDS-"introns" 1389 if (cdsList.Count()>1) 1390 //shouldn't really consider this a valid "match" 1391 for (int i=1;i<cdsList.Count();i++) 1392 ncov+=cdsList[i].start-cdsList[i-1].end-1; 1393 return true; 1394 } 1395 return false; 1396 } singleCDStoExon(GffObj & t,int & ncov)1397 bool singleCDStoExon(GffObj&t, int& ncov) { 1398 //cdsList[0] must be contained in a single exon of t 1399 int nc=0; 1400 bool match=false; 1401 for (int i=0;i<t.exons.Count();i++) { 1402 if (t.exons[i]->overlap(cdsList[0])) { 1403 if (cdsList[0].start>=t.exons[i]->start && 1404 cdsList[0].end<=t.exons[i]->end) { 1405 match=true; 1406 nc+=cdsList[0].start-t.exons[i]->start+t.exons[i]->end+cdsList[0].end; 1407 } //contained in this exon 1408 else return false; //overlap, but not contained 1409 continue; 1410 } 1411 nc+=t.exons[i]->len(); 1412 } 1413 if (!match) return false; 1414 ncov=nc; 1415 return true; 1416 } 1417 multiCDStoExon(GffObj & t,int & ncov)1418 bool multiCDStoExon(GffObj &t, int& ncov) { 1419 //multi-CDS vs multi-exon t 1420 int nc=0; 1421 int e=0, c=0; 1422 int emax=t.exons.Count()-1; 1423 int cmax=cdsList.Count()-1; 1424 int mintrons=0; //matched introns 1425 while (e<emax && c<cmax) { 1426 if (mintrons>0 && 1427 (cdsList[c].end!=t.exons[e]->end || 1428 cdsList[c+1].start!=t.exons[e+1]->start)) 1429 return false; 1430 GSeg cintron(cdsList[c].end+1, cdsList[c+1].start-1); 1431 GSeg eintron(t.exons[e]->end+1, t.exons[e+1]->start-1); 1432 if (cintron.start>eintron.end) { 1433 nc+=t.exons[e]->len(); 1434 e++; 1435 continue; 1436 } 1437 if (eintron.start<=cintron.end) { 1438 //intron overlap 1439 if (cintron.start==eintron.start && 1440 cintron.end==eintron.end) { 1441 //intron match 1442 if (mintrons==0) { 1443 if (cdsList[c].start<t.exons[e]->start) return false; 1444 nc+=cdsList[c].start-t.exons[e]->start; 1445 } 1446 mintrons++; 1447 c++;e++; 1448 continue; 1449 } 1450 else return false; 1451 } 1452 c++; //should never get here, CDS shouldn't be have to catch up with e 1453 } 1454 if (mintrons<cdsList.Count()-1) return false; 1455 //c should be cmax, e should be the last exon with CDS 1456 nc+=t.exons[e]->end-cdsList[c].end; 1457 for(int i=e+1;i<t.exons.Count();i++) 1458 nc+=t.exons[i]->len(); 1459 ncov=nc; 1460 return true; 1461 } 1462 containedBy(GffObj & t,int & ncov)1463 bool containedBy(GffObj& t, int& ncov) { 1464 1465 // (Warning: t may have no defined exons!) 1466 //if yes: ncov will be set to the number of non-CDS-covered bases in t 1467 if (t.exons.Count()<2) { 1468 if (t.exons.Count()==0) 1469 //no exons defined, just check boundaries 1470 return singleExonCDSMatch(t.start, t.end, ncov); 1471 else //single-exon 1472 return singleExonCDSMatch(t.exons[0]->start, t.exons[0]->end, ncov); 1473 } //single or no exon 1474 else { //multi-exon transcript 1475 if (start<t.exons.First()->start || end>t.exons.Last()->end) 1476 return false; //no containment possible; 1477 if (cdsList.Count()==1) 1478 return singleCDStoExon(t, ncov); 1479 //check intron compatibility! 1480 } 1481 return true; 1482 } 1483 }; 1484 1485 1486 #endif 1487