1 #ifndef GFF_H
2 #define GFF_H
3 
4 //#define CUFFLINKS 1
5 
6 #include "GBase.h"
7 #include "gdna.h"
8 #include "codons.h"
9 #include "GFaSeqGet.h"
10 #include "GList.hh"
11 #include "GHashMap.hh"
12 #include "GBitVec.h"
13 
14 #ifdef CUFFLINKS
15 #include <boost/crc.hpp>  // for boost::crc_32_type
16 #endif
17 
18 //reserved Gffnames::feats entries -- basic feature types
19 extern int gff_fid_mRNA; // "mRNA" feature name
20 extern int gff_fid_transcript; // *RNA, *transcript feature name
21 extern int gff_fid_exon;
22 extern int gff_fid_CDS;
23 
24 extern const uint GFF_MAX_LOCUS;
25 extern const uint GFF_MAX_EXON;
26 extern const uint GFF_MAX_INTRON;
27 //extern const uint gfo_flag_LEVEL_MSK; //hierarchical level: 0 = no parent
28 //extern const byte gfo_flagShift_LEVEL;
29 
30 //extern bool gff_show_warnings;
31 
32 #define GFF_LINELEN 4096
33 #define ERR_NULL_GFNAMES "Error: GffObj::%s requires a non-null GffNames* names!\n"
34 
35 
36 enum GffExonType {
37   exgffIntron=-1, // useless "intron" feature
38   exgffNone=0,  //not recognizable or unitialized exonic segment
39   exgffStartCodon, //from "start_codon" feature (within CDS)
40   exgffStopCodon, //from "stop_codon" feature (may be outside CDS, but should)
41   exgffCDS,  //from "CDS" feature
42   exgffUTR,  //from "UTR" feature
43   exgffCDSUTR, //from a merge of UTR and CDS feature
44   exgffExon, //from "exon" feature
45 };
46 
47 extern const char* exonTypes[];
48 
49 const char* strExonType(char xtype);
50 class GfList;
51 
52 typedef void GFFCommentParser(const char* cmline, GfList* gflst); //comment parser callback
53 //Useful for parsing/maintaining ref seq info from comment lines like this:
54 //##sequence-region chr1 1 24895642
55 
56 class GffReader;
57 class GffObj;
58 
59 //---transcript overlapping - utility functions
60 
61 extern const byte CLASSCODE_OVL_RANK; //rank value just above 'o' class code
62 
63 byte classcode_rank(char c); //returns priority value for class codes
64 
65 struct TOvlData {
66 	char ovlcode;
67 	int ovlen;
68 	int16_t numJmatch; //number of matching junctions (not introns)
69 	GBitVec jbits; //bit array with 1 bit for each junctions (total = 2 * num_introns)
ovlcodeTOvlData70 	TOvlData(char oc=0, int olen=0, int16_t nmj=0, GBitVec* jb=NULL):ovlcode(oc),
71 			ovlen(olen),numJmatch(nmj),jbits(jb) { }
72 };
73 
74 TOvlData getOvlData(GffObj& m, GffObj& r, bool stricterMatch=false, int trange=0);
75 
76 char transcriptMatch(GffObj& a, GffObj& b, int& ovlen, int trange=0); //generic transcript match test
77 // -- return '=', '~'  or 0
78 char singleExonTMatch(GffObj& m, GffObj& r, int& ovlen, int trange=0);
79 //single-exon transcript match test - returning '=', '~'  or 0
80 
81 //---
82 // -- tracking exon/CDS segments from local mRNA to genome coordinates
83 class GMapSeg:public GSeg {
84 public:
85 	uint gstart; //genome start location
86 	uint gend;   //genome end location
87 	//gend<gstart when mapped on reverse strand !
GSeg(s,e)88 	GMapSeg(uint s=0, uint e=0, uint gs=0, uint ge=0):GSeg(s,e),
89 			 gstart(gs), gend(ge) {};
g_within(uint gc)90     int g_within(uint gc) {
91     	//return 0 if not within gstart-gend intervals
92     	//or offset from gstart otherwise (always positive)
93     	if (gstart>gend) { //reverse strand mapping
94     		if  (gc<gend || gc>gstart) return 0;
95 			return (gstart-gc);
96     	}
97     	else {
98     		if (gc<gstart || gc>gend) return 0;
99     		return (gc-gstart);
100     	}
101     }
102 };
103 
104 struct GffScore {
105 	float score;
106 	int8_t precision;
scoreGffScore107 	GffScore(float sc=0, int8_t prec=-1):score(sc),precision(prec) { }
printGffScore108 	void print(FILE* outf) {
109 		if (precision<0) fprintf(outf, ".");
110 		else fprintf(outf, "%.*f", precision, score);
111 	}
sprintGffScore112 	void sprint(char* outs) {
113 		if (precision<0) sprintf(outs, ".");
114 		else sprintf(outs, "%.*f", precision, score);
115 	}
116 	bool operator<(GffScore& v) {
117 		return this->score<v.score;
118 	}
119 	bool operator<=(GffScore& v) {
120 		return this->score<=v.score;
121 	}
122 	bool operator>(GffScore& v) {
123 		return this->score>v.score;
124 	}
125 	bool operator>=(GffScore& v) {
126 		return this->score>=v.score;
127 	}
128 	bool operator==(GffScore& v) {
129 		return this->score==v.score;
130 	}
131 };
132 
133 extern const GffScore GFFSCORE_NONE;
134 
135 class GMapSegments:public GVec<GMapSeg> {
136   public:
137 	int dir; //-1 or +1 (reverse/forward for genome coordinates)
138 	GSeg lreg; // always 1,max local coord
139 	GSeg greg; // genomic min,max coords
140 	GMapSegments(char strand='+'):lreg(0,0),greg(0,0) {
141 		dir=(strand=='-') ? -1 : 1;
142 	}
143 	void Clear(char strand='+') {
144 		lreg.start=0;lreg.end=0;
145 		greg.start=0;greg.end=0;
146 		dir = (strand=='-') ? -1 : 1;;
147 		GVec<GMapSeg>::Clear();
148 	}
add(uint s,uint e,uint gs,uint ge)149     int add(uint s, uint e, uint gs, uint ge) {
150     	if (dir<0) {
151     		if (gs<ge) {
152     			Gswap(gs, ge);
153     		}
154     		if (gs>greg.end) greg.end=gs;
155     		if (ge<greg.start || greg.start==0) greg.start=ge;
156     	} else {
157     		if (ge>greg.end) greg.end=ge;
158     		if (gs<greg.start || greg.start==0) greg.start=gs;
159     	}
160     	GMapSeg gm(s, e, gs, ge);
161 		if (gm.end>lreg.end) lreg.end=gm.end;
162 		if (gm.start<lreg.start || lreg.start==0) lreg.start=gm.start;
163     	return GVec<GMapSeg>::Add(gm);
164     }
gmap(uint lc)165     uint gmap(uint lc) { //takes a local coordinate and returns its mapping to genomic coordinates
166     	//returns 0 if mapping cannot be performed!
167     	if (lc==0 || fCount==0 || lc<lreg.start || lc>lreg.end) return 0;
168     	//find local segment containing this coord
169     	int i=0;
170     	while (i<fCount) {
171     		if (lc>=fArray[i].start && lc<=fArray[i].end)
172     			return (fArray[i].gstart+dir*(lc-fArray[i].start));
173     		++i;
174         }
175     	return 0;
176     }
lmap(uint gc)177     uint lmap(uint gc) { //takes a genome coordinate and returns its mapping to local coordinates
178     	if (gc==0 || fCount==0 || gc<greg.start || gc>greg.end) return 0;
179     	//find genomic segment containing this coord
180     	int i=0;
181     	while (i<fCount) {
182     		int ofs=fArray[i].g_within(gc);
183     		if (ofs!=0)
184     			return (fArray[i].start+ofs);
185     		++i;
186         }
187     	return 0;
188     }
189 };
190 
191 //reading a whole transcript from a BED-12 line
192 class BEDLine {
193  public:
194     bool skip;
195     char* dupline; //duplicate of original line
196     char* line; //this will have tabs replaced by \0
197     int llen;
198     char* gseqname;
199     uint fstart;
200     uint fend;
201     char strand;
202     char* ID; //transcript ID from BED-12 (4th column)
203     char* info; //13th column - these could be GFF3 attributes
204     uint cds_start;
205     uint cds_end;
206     char cds_phase;
207     GVec<GSeg> exons;
208     BEDLine(GffReader* r=NULL, const char* l=NULL);
~BEDLine()209     ~BEDLine() {
210     	GFREE(dupline);
211     	GFREE(line);
212     }
213 };
214 
215 class GffLine {
216  protected:
217     char* _parents; //stores a copy of the Parent attribute value,
218        //with commas replaced by \0
219     int _parents_len;
220     bool parseSegmentList(GVec<GSeg>& segs, char* str);
221  public:
222     char* dupline; //duplicate of original line
223     char* line; //this will have tabs replaced by \0
224     int llen;
225     char* gseqname;
226     char* track;
227     char* ftype; //feature name: mRNA/gene/exon/CDS
228     int ftype_id;
229     char* info; //the last, attributes' field, unparsed
230     uint fstart;
231     uint fend;
232     /*
233     uint qstart; //overlap coords on query, if available
234     uint qend;
235     uint qlen; //query len, if given
236     */
237     float score;
238     int8_t score_decimals;
239     char strand;
240     union {
241     	unsigned int flags;
242     	struct {
243     		bool is_exonlike:2; //CDS,codon, UTR, exon
244     	};
245     	struct {
246     	    bool is_cds:1; //"cds" or "start/stop_codon" features
247     	    bool is_exon:1; //"exon" and "utr" features
248     	    bool is_transcript:1; //if current feature is *RNA or *transcript
249     	    bool is_gene:1; //current feature is *gene
250     	    //bool is_gff3:1; //line appears to be in GFF3 format (0=GTF)
251     	    bool is_gtf_transcript:1; //GTF transcript line with Parents parsed from gene_id
252     	    bool skipLine:1;
253     	    bool gffWarnings:1;
254     	    bool is_gene_segment:1; //for NCBI's D/J/V/C_gene_segment
255     	};
256     };
257     int8_t exontype; // gffExonType
258     char phase;  // '.' , '0', '1' or '2', can be also given as CDSphase attribute in TLF
259     uint cds_start; //if TLF: CDS=start:end attribute
260     uint cds_end;
261     GVec<GSeg> exons; //if TLF: exons= attribute
262     GVec<GSeg> cdss; //if TLF: CDS=segment_list attribute
263     char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of a gene feature (GFF3)
264     char* gene_id; //GTF only: value of "gene_id" attribute if present
265     char** parents; //for GTF only parents[0] is used
266     int num_parents;
267     char* ID;     // if a ID=.. attribute was parsed, or a GTF with 'transcript' line (transcript_id)
268     GffLine(GffReader* reader, const char* l); //parse the line accordingly
discardParent()269     void discardParent() {
270     	GFREE(_parents);
271     	_parents_len=0;
272     	num_parents=0;
273     	GFREE(parents);
274     	parents=NULL;
275     }
276     void ensembl_GFF_ID_process(char*& id);
277     void ensembl_GTF_ID_process(char*& id, const char* ver_attr);
278     static char* extractGFFAttr(char*& infostr, const char* oline, const char* pre, bool caseStrict=false,
279     		bool enforce_GTF2=false, int* rlen=NULL, bool deleteAttr=true);
280     char* extractAttr(const char* pre, bool caseStrict=false, bool enforce_GTF2=false, int* rlen=NULL){
281     	return extractGFFAttr(info, dupline, pre, caseStrict, enforce_GTF2, rlen, true);
282     }
283     char* getAttrValue(const char* pre, bool caseStrict=false, bool enforce_GTF2=false, int* rlen=NULL) {
284     	return extractGFFAttr(info, dupline, pre, caseStrict, enforce_GTF2, rlen, false);
285     }
GffLine(GffLine & l)286     GffLine(GffLine& l): _parents(NULL), _parents_len(l._parents_len),
287     		dupline(NULL), line(NULL), llen(l.llen), gseqname(NULL), track(NULL),
288     		ftype(NULL), ftype_id(l.ftype_id), info(NULL), fstart(l.fstart), fend(l.fend),
289 			//qstart(l.fstart), qend(l.fend), qlen(l.qlen),
290 			score(l.score), score_decimals(l.score_decimals), strand(l.strand), flags(l.flags), exontype(l.exontype),
291 			phase(l.phase), cds_start(l.cds_start), cds_end(l.cds_end), exons(l.exons), cdss(l.cdss),
292 			gene_name(NULL), gene_id(NULL), parents(NULL), num_parents(l.num_parents), ID(NULL) {
293     	//if (l==NULL || l->line==NULL)
294     	//	GError("Error: invalid GffLine(l)\n");
295     	//memcpy((void*)this, (void*)l, sizeof(GffLine));
296     	GMALLOC(line, llen+1);
297     	memcpy(line, l.line, llen+1);
298     	GMALLOC(dupline, llen+1);
299     	memcpy(dupline, l.dupline, llen+1);
300     	//--offsets within line[]
301     	gseqname=line+(l.gseqname-l.line);
302     	track=line+(l.track-l.line);
303     	ftype=line+(l.ftype-l.line);
304     	info=line+(l.info-l.line);
305     	if (num_parents>0 && parents) {
306     		GMALLOC(parents, num_parents*sizeof(char*));
307     		//_parents_len=l->_parents_len; copied above
308     		_parents=NULL; //re-init, forget pointer copy
309     		GMALLOC(_parents, _parents_len);
310     		memcpy(_parents, l._parents, _parents_len);
311     		for (int i=0;i<num_parents;i++) {
312     			parents[i]=_parents+(l.parents[i] - l._parents);
313     		}
314     	}
315     	//-- allocated string copies:
316     	ID=Gstrdup(l.ID);
317     	if (l.gene_name!=NULL)
318     		gene_name=Gstrdup(l.gene_name);
319     	if (l.gene_id!=NULL)
320     		gene_id=Gstrdup(l.gene_id);
321     }
GffLine()322     GffLine(): _parents(NULL), _parents_len(0),
323     		dupline(NULL), line(NULL), llen(0), gseqname(NULL), track(NULL),
324     		ftype(NULL), ftype_id(-1), info(NULL), fstart(0), fend(0), //qstart(0), qend(0), qlen(0),
325     		score(0), score_decimals(-1), strand(0), flags(0), exontype(0), phase(0), cds_start(0), cds_end(0),
326 			exons(), cdss(),  gene_name(NULL), gene_id(NULL), parents(NULL), num_parents(0), ID(NULL) {
327     }
~GffLine()328     ~GffLine() {
329     	GFREE(dupline);
330     	GFREE(line);
331     	GFREE(_parents);
332     	GFREE(parents);
333     	GFREE(ID);
334     	GFREE(gene_name);
335     	GFREE(gene_id);
336     }
337 };
338 
339 class GffAttr {
340  public:
341   union {
342     int id_full;
343     struct {
344       bool      cds:1;
345       int  attr_id:31;
346     };
347   };
348   char* attr_val;
349   GffAttr(int an_id, const char* av=NULL, bool is_cds=false):id_full(0), attr_val(NULL) {
350 	 attr_id=an_id;
351      setValue(av, is_cds);
352   }
~GffAttr()353   ~GffAttr() {
354      GFREE(attr_val);
355   }
356   void setValue(const char* av, bool is_cds=false) {
357      if (attr_val!=NULL) {
358         GFREE(attr_val);
359      }
360      if (av==NULL || av[0]==0) return;
361      //trim spaces
362      const char* vstart=av;
363      while (*vstart==' ') av++;
364      const char* vend=vstart;
365      bool keep_dq=false;
366      while (vend[1]!=0) {
367         if (*vend==' ' && vend[1]!=' ') keep_dq=true;
368           else if (*vend==';') keep_dq=true;
369         vend++;
370      }
371      //remove spaces at the end:
372      while (*vend==' ' && vend!=vstart) vend--;
373      //practical clean-up: if it doesn't have any internal spaces just strip those useless double quotes
374      if (!keep_dq && *vstart=='"' && *vend=='"') {
375                vend--;
376                vstart++;
377      }
378      attr_val=Gstrdup(vstart, vend);
379      cds=is_cds;
380   }
381   bool operator==(GffAttr& d){
382       return (this==&d);
383   }
384   bool operator>(GffAttr& d){
385      return (this>&d);
386   }
387   bool operator<(GffAttr& d){
388     return (this<&d);
389   }
390 
391  };
392 
393 class GffNameList;
394 class GffNames;
395 
396 class GffNameInfo {
397   friend class GffNameList;
398  public:
399    int idx;
400    char* name;
401    GffNameInfo(const char* n=NULL):idx(-1),name(NULL) {
402      if (n) name=Gstrdup(n);
403      }
404 
~GffNameInfo()405    ~GffNameInfo() {
406       GFREE(name);
407      }
408 
409    bool operator==(GffNameInfo& d){
410        return (strcmp(this->name, d.name)==0);
411        }
412    bool operator<(GffNameInfo& d){
413      return (strcmp(this->name, d.name)<0);
414      }
415 };
416 
417 class GffNameList:public GPVec<GffNameInfo> {
418   friend class GffNameInfo;
419   friend class GffNames;
420 protected:
421   GHashMap<const char*, GffNameInfo*> byName;//hash with shared keys
422   int idlast; //fList index of last added/reused name
addStatic(const char * tname)423   int addStatic(const char* tname) {// fast add
424      GffNameInfo* f=new GffNameInfo(tname);
425      idlast=this->Add(f);
426      f->idx=idlast;
427      byName.Add(f->name,f);
428      return idlast;
429      }
430 public:
431  //GffNameList(int init_capacity=6):GList<GffNameInfo>(init_capacity, false,true,true), byName(false) {
432   GffNameList(int init_capacity=6):GPVec<GffNameInfo>(init_capacity, true), byName(false) {
433     idlast=-1;
434     setCapacity(init_capacity);
435     }
lastNameUsed()436  char* lastNameUsed() { return idlast<0 ? NULL : Get(idlast)->name; }
lastNameId()437  int lastNameId() { return idlast; }
getName(int nid)438  char* getName(int nid) { //retrieve name by its ID
439    if (nid<0 || nid>=fCount)
440          GError("GffNameList Error: invalid index (%d)\n",nid);
441    return fList[nid]->name;
442    }
443 
addName(const char * tname)444  int addName(const char* tname) {//returns or create an id for the given name
445    //check idlast first, chances are it's the same feature name checked
446    /*if (idlast>=0 && strcmp(fList[idlast]->name,tname)==0)
447        return idlast;*/
448    GffNameInfo* f=byName.Find(tname);
449    int fidx=-1;
450    if (f!=NULL) fidx=f->idx;
451      else {//add new entry
452       f=new GffNameInfo(tname);
453       fidx=this->Add(f);
454       f->idx=fidx;
455       byName.Add(f->name,f);
456       }
457    idlast=fidx;
458    return fidx;
459    }
460 
addNewName(const char * tname)461  int addNewName(const char* tname) {
462     GffNameInfo* f=new GffNameInfo(tname);
463     int fidx=this->Add(f);
464     f->idx=fidx;
465     byName.Add(f->name,f);
466     return fidx;
467     }
468 
getId(const char * tname)469  int getId(const char* tname) { //only returns a name id# if found
470     GffNameInfo* f=byName.Find(tname);
471     if (f==NULL) return -1;
472     return f->idx;
473     }
removeName()474  int removeName() {
475    GError("Error: removing names from GffNameList not allowed!\n");
476    return -1;
477    }
478 };
479 
480 class GffNames {
481  public:
482    int numrefs;
483    GffNameList tracks;
484    GffNameList gseqs;
485    GffNameList attrs;
486    GffNameList feats; //feature names: 'mRNA', 'exon', 'CDS' etc.
GffNames()487    GffNames():tracks(),gseqs(),attrs(), feats() {
488     numrefs=0;
489     //the order below is critical!
490     //has to match: gff_fid_mRNA, gff_fid_exon, gff_fid_CDS
491     gff_fid_mRNA = feats.addStatic("mRNA");//index 0=gff_fid_mRNA
492     gff_fid_transcript=feats.addStatic("transcript");//index 1=gff_fid_transcript
493     gff_fid_exon=feats.addStatic("exon");//index 2=gff_fid_exon
494     gff_fid_CDS=feats.addStatic("CDS"); //index 3=gff_fid_CDS
495     }
496 };
497 
498 void gffnames_ref(GffNames* &n);
499 void gffnames_unref(GffNames* &n);
500 
501 enum GffPrintMode {
502   pgtfAny, //print record as read, if GTF
503   pgtfExon, //print only exon features (CDS converted to exon if exons are missing)
504   pgtfCDS,  //print only CDS features
505   pgtfBoth,  //print both CDS and exon features
506   pgffAny, //print record as read (if isCDSonly() prints only CDS)
507   pgffExon,
508   pgffCDS,
509   pgffBoth, //enforce exon printing if isCDSOnly()
510   pgffTLF,  //exon and CDS data shown as additional GFF attributes
511             //in the transcript line (Transcript Line Format)
512             //every line has the whole transcript data
513   pgffBED //print a BED line with all other GFF attributes in column 13
514 };
515 
516 
517 class GffAttrs:public GList<GffAttr> {
518   public:
GffAttrs()519     GffAttrs():GList<GffAttr>(false,true,true) { }
add_if_new(GffNames * names,const char * attrname,const char * attrval)520     void add_if_new(GffNames* names, const char* attrname, const char* attrval) {
521         //adding a new value without checking for cds status
522         int nid=names->attrs.getId(attrname);
523         if (nid>=0) { //attribute name found in the dictionary
524            for (int i=0;i<Count();i++)
525               if (nid==Get(i)->attr_id) { return; } //don't update existing
526         }
527         else { //adding attribute name to global attr name dictionary
528            nid=names->attrs.addNewName(attrname);
529         }
530         this->Add(new GffAttr(nid, attrval));
531     }
532 
add_if_new(GffNames * names,const char * attrname,const char * attrval,bool is_cds)533     void add_if_new(GffNames* names, const char* attrname, const char* attrval, bool is_cds) {
534         int nid=names->attrs.getId(attrname);
535         if (nid>=0) { //attribute name found in the dictionary
536            for (int i=0;i<Count();i++)
537               if (nid==Get(i)->attr_id && is_cds==Get(i)->cds) { return; } //don't update existing
538         }
539         else { //adding attribute name to global attr name dictionary
540            nid=names->attrs.addNewName(attrname);
541         }
542         this->Add(new GffAttr(nid, attrval, is_cds));
543     }
544 
add_or_update(GffNames * names,const char * attrname,const char * val)545     void add_or_update(GffNames* names, const char* attrname, const char* val) {
546     //adding a new value without checking for cds status
547         int aid=names->attrs.getId(attrname);
548         if (aid>=0) {
549            //attribute found in the dictionary
550            for (int i=0;i<Count();i++) {
551               //do we have it?
552               if (aid==Get(i)->attr_id) {
553                   //update the existing value for this attribute
554                   Get(i)->setValue(val);
555                   return;
556                   }
557               }
558         }
559         else { //adding attribute name to global attr name dictionary
560            aid=names->attrs.addNewName(attrname);
561         }
562         this->Add(new GffAttr(aid, val));
563     }
564 
add_or_update(GffNames * names,const char * attrname,const char * val,bool is_cds)565     void add_or_update(GffNames* names, const char* attrname, const char* val, bool is_cds) {
566       int aid=names->attrs.getId(attrname);
567       if (aid>=0) {
568          //attribute found in the dictionary
569          for (int i=0;i<Count();i++) {
570             //do we have it?
571             if (aid==Get(i)->attr_id && Get(i)->cds==is_cds) {
572                 //update the existing value for this attribute
573                 Get(i)->setValue(val, is_cds);
574                 return;
575                 }
576             }
577       }
578       else { //adding attribute name to global attr name dictionary
579          aid=names->attrs.addNewName(attrname);
580       }
581       this->Add(new GffAttr(aid, val, is_cds));
582     }
583 
584     int haveId(int attr_id, bool is_cds=false) {
585         for (int i=0;i<Count();i++)
586            if (attr_id==Get(i)->attr_id && Get(i)->cds==is_cds)
587         	   return i;
588         return -1;
589     }
590 
591     int haveId(const char* attrname, GffNames* names, bool is_cds=false) {
592     	int aid=names->attrs.getId(attrname);
593     	if (aid>=0) {
594             for (int i=0;i<Count();i++)
595                if (aid==Get(i)->attr_id && Get(i)->cds==is_cds)
596             	   return i;
597     	}
598     	return -1;
599     }
600 
getAttr(GffNames * names,const char * attrname)601     char* getAttr(GffNames* names, const char* attrname) {
602       int aid=names->attrs.getId(attrname);
603       if (aid>=0)
604         for (int i=0;i<Count();i++)
605           if (aid==Get(i)->attr_id) return Get(i)->attr_val;
606       return NULL;
607     }
608 
getAttr(GffNames * names,const char * attrname,bool is_cds)609     char* getAttr(GffNames* names, const char* attrname, bool is_cds) {
610       int aid=names->attrs.getId(attrname);
611       if (aid>=0)
612         for (int i=0;i<Count();i++)
613           if (aid==Get(i)->attr_id && Get(i)->cds==is_cds) return Get(i)->attr_val;
614       return NULL;
615     }
616 
getAttr(int aid)617     char* getAttr(int aid) {
618       if (aid>=0)
619         for (int i=0;i<Count();i++)
620           if (aid==Get(i)->attr_id) return Get(i)->attr_val;
621       return NULL;
622     }
623 
getAttr(int aid,bool is_cds)624     char* getAttr(int aid, bool is_cds) {
625       if (aid>=0)
626         for (int i=0;i<Count();i++)
627           if (aid==Get(i)->attr_id && Get(i)->cds==is_cds)
628             return Get(i)->attr_val;
629       return NULL;
630     }
631 
632     void copyAttrs(GffAttrs* attrs, bool is_cds=false) {
633  	 //deep copy attributes from another GffAttrs list
634      // (only the ones which do not exist yet)
635     	if (attrs==NULL) return;
636     	for (int i=0;i<attrs->Count();i++) {
637     		int aid=attrs->Get(i)->attr_id;
638     		if (haveId(aid, is_cds)<0)
639     			Add(new GffAttr(aid, attrs->Get(i)->attr_val, is_cds));
640     	}
641     }
642 };
643 
644 class GffExon : public GSeg {
645  public:
646   bool sharedAttrs; //do not free attrs on destruct!
647   GffAttrs* attrs; //other attributes kept for this exon/CDS
648   GffScore score; // gff score column
649   int8_t exontype;
650   char phase; //GFF phase column - for CDS segments only!
651 	             // '.' = undefined (UTR), '0','1','2' for CDS exons
652   void* uptr; //for associating extended user data to this exon
653 
getAttr(GffNames * names,const char * atrname)654   char* getAttr(GffNames* names, const char* atrname) {
655     if (attrs==NULL || names==NULL || atrname==NULL) return NULL;
656     return attrs->getAttr(names, atrname);
657   }
658 
getAttr(int aid)659   char* getAttr(int aid) {
660     if (attrs==NULL) return NULL;
661     return attrs->getAttr(aid);
662   }
GffExon(bool share_attributes)663   GffExon(bool share_attributes):GSeg(0,0), sharedAttrs(share_attributes), attrs(NULL), score(),
664 		  exontype(0), phase('.'), uptr(NULL){
665   }
sharedAttrs(false)666   GffExon(uint s=0, uint e=0, int8_t et=0, char ph='.', float sc=0, int8_t sc_prec=0):sharedAttrs(false), attrs(NULL),
667 		  score(sc,sc_prec), exontype(et), phase(ph), uptr(NULL) {
668 		if (s<e) { start=s; end=e; }
669 		    else { start=e; end=s; }
670 
671   } //constructor
672 
673 
GffExon(const GffExon & ex)674   GffExon(const GffExon& ex):GSeg(ex.start, ex.end) { //copy constructor
675       (*this)=ex; //use the default (shallow!) copy operator
676       if (ex.attrs!=NULL) { //make a deep copy here
677         attrs=new GffAttrs();
678         attrs->copyAttrs(ex.attrs);
679       }
680   }
681 
682   GffExon& operator=(const GffExon& o) = default; //prevent gcc 9 warnings:
683                                            //yes, I want a shallow copy here
684 
~GffExon()685   ~GffExon() { //destructor
686      if (attrs!=NULL && !sharedAttrs) delete attrs;
687   }
688 
689 };
690 
691 //only for mapping to spliced coding sequence:
692 class GffCDSeg:public GSeg {
693  public:
694   char phase;
695   int exonidx;
696 };
697 
698 //one GFF mRNA object -- e.g. a mRNA with its exons and/or CDS segments
699 class GffObj:public GSeg {
700  protected:
701    char* gffID; // ID name for mRNA (parent) feature
702    char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of the parent gene feature (GFF3)
703    char* geneID; //value of gene_id attribute (GTF) if present, or the ID attribute of a parent gene feature (GFF3)
704    union {
705       unsigned int flags;
706       struct {
707     	  bool flag_HAS_ERRORS        :1;
708     	  bool flag_CHILDREN_PROMOTED :1;
709     	  bool flag_IS_GENE           :1;
710     	  bool flag_IS_TRANSCRIPT     :1;
711     	  bool flag_HAS_GFF_ID        :1; //found transcript/RNA feature line (GFF3 or GTF2 with transcript line)
712     	  bool flag_BY_EXON           :1; //created by subfeature (exon/CDS) directly
713     	  bool flag_CDS_ONLY          :1; //transcript defined by CDS features only (GffObj::isCDS())
714     	  bool flag_CDS_NOSTART       :1; //partial CDS at 5' end (no start codon)
715     	  bool flag_CDS_NOSTOP        :1; //partial CDS at 3' end (no stop codon)
716     	  bool flag_CDS_X             :1; //transcript having CDS with ribosomal shift (i.e. after merging exons)
717     	                                  //CDS segments stored in ::cdss are incompatible with the exon segments
718     	  bool flag_GENE_SEGMENT      :1; //a transcript-like C/D/J/V_gene_segment (NCBI's annotation)
719     	  bool flag_TRANS_SPLICED     :1;
720     	  bool flag_DISCONTINUOUS     :1; //discontinuous feature (e.g. cDNA_match) segments linked by same ID
721     	  bool flag_TARGET_ONLY       :1; //Target= feature (e.g. from RepeatMasker output), lacks ID
722     	  bool flag_DISCARDED         :1; //it will be  discarded from the final GffReader list
723     	  bool flag_LST_KEEP          :1; //controlled by isUsed(); if set, this GffObj will not be
724     	                                  //deallocated when GffReader is destroyed
725     	  bool flag_FINALIZED         :1; //if finalize() was already called for this GffObj
726     	  unsigned int gff_level      :4; //hierarchical level (0..15)
727       };
728    };
729    //-- friends:
730    friend class GffReader;
731    friend class GffExon;
732 public:
733   static GffNames* names; // dictionary storage that holds the various attribute names etc.
734   int track_id; // index of track name in names->tracks
735   int gseq_id; // index of genomic sequence name in names->gseqs
736   int ftype_id; // index of this record's feature name in names->feats, or the special gff_fid_mRNA value
737   int subftype_id; //index of child subfeature name in names->feats (subfeatures stored in "exons")
738                    //if ftype_id==gff_fid_mRNA then this value is ignored
739   GList<GffExon> exons; //for non-mRNA entries, these can be any subfeature of type subftype_id
740   GList<GffExon>* cdss; //only !NULL for cases of "programmed frameshift" when CDS boundaries do not match
741                       //exons boundaries
742   GPVec<GffObj> children;
743   GffObj* parent;
744   int udata; //user data, flags etc.
745   void* uptr; //user pointer (to a parent object, cluster, locus etc.)
746   GffObj* ulink; //link to another GffObj (user controlled field)
747   //---mRNA specific fields:
748   //bool isCDS; //just a CDS, no UTRs
749   uint CDstart; //CDS lowest coordinate
750   uint CDend;   //CDS highest coordinate
751   char CDphase; //initial phase for CDS start ('.','0'..'2')
752                 //CDphase is at CDend if strand=='-'
753   static void decodeHexChars(char* dbuf, const char* s, int maxlen=1023);
hasErrors()754   bool hasErrors() { return flag_HAS_ERRORS; }
hasErrors(bool v)755   void hasErrors(bool v) { flag_HAS_ERRORS=v; }
hasGffID()756   bool hasGffID() { return flag_HAS_GFF_ID; }
hasGffID(bool v)757   void hasGffID(bool v) {flag_HAS_GFF_ID=v; }
createdByExon()758   bool createdByExon() { return flag_BY_EXON; }
createdByExon(bool v)759   void createdByExon(bool v) {flag_BY_EXON=v; }
isCDSOnly()760   bool isCDSOnly() { return flag_CDS_ONLY; }
isCDSOnly(bool v)761   void isCDSOnly(bool v) {  flag_CDS_ONLY=v; }
isXCDS()762   bool isXCDS() { return flag_CDS_X; }
isXCDS(bool v)763   void isXCDS(bool v) {  flag_CDS_X=v; }
isFinalized()764   bool isFinalized() {  return flag_FINALIZED; }
isFinalized(bool v)765   void isFinalized(bool v) {  flag_FINALIZED=v; }
766 
isGene()767   bool isGene() { return flag_IS_GENE; }
isGene(bool v)768   void isGene(bool v) {flag_IS_GENE=v; }
isDiscarded()769   bool isDiscarded() { return flag_DISCARDED; }
isDiscarded(bool v)770   void isDiscarded(bool v) { flag_DISCARDED=v; }
isUsed()771   bool isUsed() { return flag_LST_KEEP; }
isUsed(bool v)772   void isUsed(bool v) {flag_LST_KEEP=v; }
isTranscript()773   bool isTranscript() { return flag_IS_TRANSCRIPT; }
isTranscript(bool v)774   void isTranscript(bool v) {flag_IS_TRANSCRIPT=v; }
isGeneSegment()775   bool isGeneSegment() { return flag_GENE_SEGMENT; }
isGeneSegment(bool v)776   void isGeneSegment(bool v) {flag_GENE_SEGMENT=v; }
promotedChildren()777   bool promotedChildren() { return flag_CHILDREN_PROMOTED; }
promotedChildren(bool v)778   void promotedChildren(bool v) { flag_CHILDREN_PROMOTED=v; }
setLevel(byte v)779   void setLevel(byte v) { gff_level=v; }
getLevel()780   byte getLevel() { return gff_level; }
incLevel()781   byte incLevel() { gff_level++; return gff_level; }
782 
isValidTranscript()783   bool isValidTranscript() {
784     //return (ftype_id==gff_fid_mRNA && exons.Count()>0);
785     return (isTranscript() && exons.Count()>0);
786   }
787 
788   //return the index of exon containing coordinate coord, or -1 if not
789   int whichExon(uint coord, GList<GffExon>* segs=NULL);
790   int readExon(GffReader& reader, GffLine& gl);
791 
792   int addExon(GList<GffExon>& segs, GffLine& gl, int8_t exontype_override=exgffNone); //add to cdss or exons
793 
794   int addExon(uint segstart, uint segend, int8_t exontype=exgffNone, char phase='.',
795 		      GffScore exon_score=GFFSCORE_NONE, GList<GffExon>* segs=NULL);
796 
797 protected:
798   bool reduceExonAttrs(GList<GffExon>& segs);
799   //utility segment-merging function for addExon()
800   void expandSegment(GList<GffExon>&segs, int oi, uint segstart, uint segend,
801        int8_t exontype);
802   bool processGeneSegments(GffReader* gfr); //for genes that have _gene_segment features (NCBI annotation)
803   void transferCDS(GffExon* cds);
804 public:
805   void removeExon(int idx);
806   void removeExon(GffExon* p);
807   char  strand; //'+', '-' or '.'
808   GffScore gscore;
809   int covlen; //total coverage of reference genomic sequence (sum of maxcf segment lengths)
810   GffAttrs* attrs; //other gff3 attributes found for the main mRNA feature
811    //constructor by gff line parsing:
812   GffObj(GffReader& gfrd, BEDLine& bedline);
813   GffObj(GffReader& gfrd, GffLine& gffline);
814    //if gfline->Parent!=NULL then this will also add the first sub-feature
815    // otherwise, only the main feature is created
816   void copyAttrs(GffObj* from);
clearAttrs()817   void clearAttrs() {
818     if (attrs!=NULL) {
819       bool sharedattrs=(exons.Count()>0 && exons[0]->attrs==attrs);
820       delete attrs; attrs=NULL;
821       if (sharedattrs) exons[0]->attrs=NULL;
822       }
823     }
824   GffObj(char* anid=NULL):GSeg(0,0), exons(true,true,false), cdss(NULL), children(1,false), gscore() {
825                                    //exons: sorted, free, non-unique
826        gffID=NULL;
827        uptr=NULL;
828        ulink=NULL;
829        flags=0;
830        udata=0;
831        parent=NULL;
832        ftype_id=-1;
833        subftype_id=-1;
834        if (anid!=NULL) gffID=Gstrdup(anid);
835        gffnames_ref(names);
836        CDstart=0; // hasCDS <=> CDstart>0
837        CDend=0;
838        CDphase=0;
839        gseq_id=-1;
840        track_id=-1;
841        strand='.';
842        attrs=NULL;
843        covlen=0;
844        geneID=NULL;
845        gene_name=NULL;
846    }
~GffObj()847    ~GffObj() {
848        GFREE(gffID);
849        GFREE(gene_name);
850        GFREE(geneID);
851        delete cdss;
852        clearAttrs();
853        gffnames_unref(names);
854        }
855    //--------------
856    GffObj* finalize(GffReader* gfr);
857                //complete parsing: must be called in order to merge adjacent/close proximity subfeatures
858    void parseAttrs(GffAttrs*& atrlist, char* info, bool isExon=false, bool CDSsrc=false);
getSubfName()859    const char* getSubfName() { //returns the generic feature type of the entries in exons array
860      return names->feats.getName(subftype_id);
861      }
862    void setCDS(uint cd_start, uint cd_end, char phase=0);
863    void setCDS(GffObj* t); //set CDS from another transcript
864 
monoFeature()865    bool monoFeature() {
866      return (exons.Count()==0 ||
867           (exons.Count()==1 &&  //exon_ftype_id==ftype_id &&
868               exons[0]->end==this->end && exons[0]->start==this->start));
869      }
870 
hasCDS()871    bool hasCDS() { return (CDstart>0); }
872 
getFeatureName()873    const char* getFeatureName() {
874      return names->feats.getName(ftype_id);
875      }
876    void setFeatureName(const char* feature);
877 
878    void addAttr(const char* attrname, const char* attrvalue);
879    int removeAttr(const char* attrname, const char* attrval=NULL);
880    int removeAttr(int aid, const char* attrval=NULL);
881    int removeAttrs(GStrSet<>& attrSet); //remove attributes whose names are NOT in attrSet
882    int removeExonAttr(GffExon& exon, const char* attrname, const char* attrval=NULL);
883    int removeExonAttr(GffExon& exon, int aid, const char* attrval=NULL);
884 
getAttrName(int i)885    const char* getAttrName(int i) {
886      if (attrs==NULL) return NULL;
887      return names->attrs.getName(attrs->Get(i)->attr_id);
888    }
889 
890    char* getAttr(const char* attrname, bool checkFirstExon=false) {
891      if (names==NULL || attrname==NULL) return NULL;
892      char* r=NULL;
893      if (attrs==NULL) {
894          if (!checkFirstExon) return NULL;
895      } else
896          r=attrs->getAttr(names, attrname);
897      if (r!=NULL) return r;
898      if (checkFirstExon && exons.Count()>0) {
899         r=exons.First()->getAttr(names, attrname);
900      }
901      return r;
902    }
903 
getExonAttr(GffExon * exon,const char * attrname)904    char* getExonAttr(GffExon* exon, const char* attrname) {
905       if (exon==NULL || attrname==NULL) return NULL;
906       return exon->getAttr(names, attrname);
907    }
908 
getExonAttr(int exonidx,const char * attrname)909    char* getExonAttr(int exonidx, const char* attrname) {
910       if (exonidx<0 || exonidx>=exons.Count() || attrname==NULL) return NULL;
911       return exons[exonidx]->getAttr(names, attrname);
912       }
913 
getAttrValue(int i)914    char* getAttrValue(int i) {
915      if (attrs==NULL) return NULL;
916      return attrs->Get(i)->attr_val;
917      }
getGSeqName()918    const char* getGSeqName() {
919      return names->gseqs.getName(gseq_id);
920      }
921 
getRefName()922    const char* getRefName() {
923      return names->gseqs.getName(gseq_id);
924      }
925    void setRefName(const char* newname);
926 
getTrackName()927    const char* getTrackName() {
928      return names->tracks.getName(track_id);
929    }
exonOverlap(uint s,uint e)930    bool exonOverlap(uint s, uint e) {//check if ANY exon overlaps given segment
931       //ignores strand!
932       if (s>e) Gswap(s,e);
933       for (int i=0;i<exons.Count();i++) {
934          if (exons[i]->overlap(s,e)) return true;
935          }
936       return false;
937    }
exonOverlap(GffObj & m)938    bool exonOverlap(GffObj& m) {//check if ANY exon overlaps given segment
939      //if (gseq_id!=m.gseq_id) return false;
940      // ignores strand and gseq_id, must check in advance
941      for (int i=0;i<exons.Count();i++) {
942          for (int j=0;j<m.exons.Count();j++) {
943             if (exons[i]->start>m.exons[j]->end) continue;
944             if (m.exons[j]->start>exons[i]->end) break;
945             //-- overlap if we are here:
946             return true;
947          }
948      }
949      return false;
950    }
951 
952    int exonOverlapIdx(GList<GffExon>& segs, uint s, uint e, int* ovlen=NULL, int start_idx=0);
953 
exonOverlapLen(GffObj & m)954    int exonOverlapLen(GffObj& m) {
955       if (start>m.end || m.start>end) return 0;
956       int i=0;
957       int j=0;
958       int ovlen=0;
959       while (i<exons.Count() && j<m.exons.Count()) {
960         uint istart=exons[i]->start;
961         uint iend=exons[i]->end;
962         uint jstart=m.exons[j]->start;
963         uint jend=m.exons[j]->end;
964         if (istart>jend) { j++; continue; }
965         if (jstart>iend) { i++; continue; }
966         //exon overlap
967         uint ovstart=GMAX(istart,jstart);
968         if (iend<jend) {
969            ovlen+=iend-ovstart+1;
970            i++;
971            }
972         else {
973            ovlen+=jend-ovstart+1;
974            j++;
975            }
976         }//while comparing exons
977       return ovlen;
978       }
979 
exonOverlap(GffObj * m)980     bool exonOverlap(GffObj* m) {
981       return exonOverlap(*m);
982       }
983 
984    //---------------------
985    bool operator==(GffObj& d){
986        return (gseq_id==d.gseq_id && start==d.start && end==d.end && strcmp(gffID, d.gffID)==0);
987        }
988    bool operator>(GffObj& d){
989       if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id);
990       if (start==d.start) {
991          if (getLevel()==d.getLevel()) {
992              if (end==d.end) return (strcmp(gffID, d.gffID)>0);
993                         else return (end>d.end);
994              } else return (getLevel()>d.getLevel());
995          } else return (start>d.start);
996       }
997    bool operator<(GffObj& d){
998      if (gseq_id!=d.gseq_id) return (gseq_id<d.gseq_id);
999      if (start==d.start) {
1000          if (getLevel()==d.getLevel()) {
1001             if (end==d.end) return strcmp(gffID, d.gffID)<0;
1002                      else return end<d.end;
1003             } else return (getLevel()<d.getLevel());
1004         } else return (start<d.start);
1005      }
getID()1006    char* getID() { return gffID; }
1007 
getGeneID()1008    char* getGeneID() { return geneID; }
getGeneName()1009    char* getGeneName() { return gene_name; }
setGeneName(const char * gname)1010    void setGeneName(const char* gname) {
1011         GFREE(gene_name);
1012         if (gname) gene_name=Gstrdup(gname);
1013    }
setGeneID(const char * gene_id)1014    void setGeneID(const char* gene_id) {
1015         GFREE(geneID);
1016         if (gene_id) geneID=Gstrdup(gene_id);
1017    }
1018    int addSeg(GffLine* gfline);
1019    int addSeg(int fnid, GffLine* gfline);
1020    void getCDSegs(GVec<GffExon>& cds);
1021 
1022    void updateCDSPhase(GList<GffExon>& segs); //for CDS-only features, updates GffExon::phase
1023    void printGTab(FILE* fout, char** extraAttrs=NULL);
1024    void printGxfExon(FILE* fout, const char* tlabel, const char* gseqname,
1025           bool iscds, GffExon* exon, bool gff3, bool cvtChars);
1026    void printGxf(FILE* fout, GffPrintMode gffp=pgffExon,
1027              const char* tlabel=NULL, const char* gfparent=NULL, bool cvtChars=false);
1028    void printGtf(FILE* fout, const char* tlabel=NULL, bool cvtChars=false) {
1029       printGxf(fout, pgtfAny, tlabel, NULL, cvtChars);
1030    }
1031    void printGff(FILE* fout, const char* tlabel=NULL,
1032                                 const char* gfparent=NULL, bool cvtChars=false) {
1033       printGxf(fout, pgffAny, tlabel, gfparent, cvtChars);
1034    }
1035    bool printAttrs(FILE* fout,  const char* sep=";", bool GTFstyle=false, bool cvtChars=false, bool sepFirst=true);
1036    void printTranscriptGff(FILE* fout, char* tlabel=NULL,
1037                             bool showCDS=false, const char* gfparent=NULL, bool cvtChars=false) {
1038       if (isValidTranscript())
1039          printGxf(fout, showCDS ? pgffBoth : pgffExon, tlabel, gfparent, cvtChars);
1040       }
1041    void printExonList(FILE* fout); //print comma delimited list of exon intervals
1042    void printCDSList(FILE* fout); //print comma delimited list of CDS intervals
1043 
1044    void printBED(FILE* fout, bool cvtChars);
1045        //print a BED-12 line + GFF3 attributes in 13th field
1046    void printSummary(FILE* fout=NULL);
1047 
1048    char* getSpliced(GFaSeqGet* faseq, bool CDSonly=false, int* rlen=NULL,
1049            uint* cds_start=NULL, uint* cds_end=NULL, GMapSegments* seglst=NULL,
1050 		   bool cds_open=false);
1051     char* getUnspliced(GFaSeqGet* faseq, int* rlen, GMapSegments* seglst=NULL);
1052 
1053     void addPadding(int padLeft, int padRight); //change exons to include this padding on the sides
1054     void removePadding(int padLeft, int padRight);
1055 
1056    //bool validCDS(GFaSeqGet* faseq); //has In-Frame Stop Codon ?
empty()1057    bool empty() { return (start==0); }
1058 };
1059 
1060 typedef bool GffRecFunc(GffObj* gobj, void* usrptr1, void* usrptr2);
1061 //user callback after parsing a mapping object:
1062 // Returns: "done with it" status:
1063 //   TRUE if gobj is no longer needed so it's FREEd upon return
1064 //   FALSE if the user needs the gobj pointer and is responsible for
1065 //             collecting and freeing all GffObj objects
1066 
1067 
1068 //GSeqStat: collect basic stats about a common underlying genomic sequence
1069 //          for multiple GffObj
1070 class GSeqStat {
1071  public:
1072    int gseqid; //gseq id in the global static pool of gseqs
1073    char* gseqname; //just a pointer to the name of gseq
1074    int fcount;//number of features on this gseq
1075    uint mincoord;
1076    uint maxcoord;
1077    uint maxfeat_len; //maximum feature length on this genomic sequence
1078    GffObj* maxfeat;
1079    GSeqStat(int id=-1, char* name=NULL) {
1080      gseqid=id;
1081      gseqname=name;
1082      fcount=0;
1083      mincoord=MAXUINT;
1084      maxcoord=0;
1085      maxfeat_len=0;
1086      maxfeat=NULL;
1087      }
1088    bool operator>(GSeqStat& g) {
1089     return (gseqid>g.gseqid);
1090     }
1091    bool operator<(GSeqStat& g) {
1092     return (gseqid<g.gseqid);
1093     }
1094    bool operator==(GSeqStat& g) {
1095     return (gseqid==g.gseqid);
1096     }
1097 };
1098 
1099 
1100 int gfo_cmpByLoc(const pointer p1, const pointer p2);
1101 int gfo_cmpRefByID(const pointer p1, const pointer p2);
1102 
1103 class GfList: public GList<GffObj> {
1104  public:
1105 
GfList(bool sorted)1106    GfList(bool sorted):GList<GffObj>(sorted,false,false) { }
GfList()1107    GfList():GList<GffObj>(false,false,false) {
1108      //GffObjs in this list are NOT deleted when the list is cleared
1109      //-- for deallocation of these objects, call freeAll() or freeUnused() as needed
1110    }
1111    void finalize(GffReader* gfr);
1112 
freeAll()1113    void freeAll() {
1114      for (int i=0;i<fCount;i++) {
1115        delete fList[i];
1116        fList[i]=NULL;
1117        }
1118      Clear();
1119    }
freeUnused()1120    void freeUnused() {
1121      for (int i=0;i<fCount;i++) {
1122        if (fList[i]->isUsed()) continue;
1123        /*//inform the children?
1124        for (int c=0;c<fList[i]->children.Count();c++) {
1125           fList[i]->children[c]->parent=NULL;
1126        }
1127        */
1128        delete fList[i];
1129        fList[i]=NULL;
1130        }
1131      Clear();
1132      }
1133 };
1134 
1135 class CNonExon { //utility class used in subfeature promotion
1136  public:
1137    //int idx;
1138    GffObj* parent;
1139    GffExon* exon;
1140    GffLine* gffline;
1141    //CNonExon(int i, GffObj* p, GffExon* e, GffLine* gl) {
CNonExon(GffObj * p,GffExon * e,GffLine & gl)1142    CNonExon(GffObj* p, GffExon* e, GffLine& gl) {
1143      parent=p;
1144      exon=e;
1145      //idx=i;
1146      gffline=new GffLine(gl);
1147      }
~CNonExon()1148   ~CNonExon() {
1149      delete gffline;
1150      }
1151  };
1152 
1153 
1154 class GffReader {
1155   friend class GffObj;
1156   friend class GffLine;
1157   friend class GfList;
1158   char* linebuf;
1159   off_t fpos;
1160   int buflen;
1161  protected:
1162   union {
1163 	unsigned int flags;
1164     unsigned int gff_type: 6;
1165     struct {
1166        bool is_gff3: 1;  //GFF3 syntax was detected
1167        bool is_gtf:1; //GTF syntax was detected
1168        bool gtf_transcript:1; //has "transcript" features (2-level GTF)
1169        bool gtf_gene:1; //has "gene" features (3-level GTF ..Ensembl?)
1170        bool is_BED:1; //input is BED-12 format, possibly with attributes in 13th field
1171        bool is_TLF:1; //input is GFF3-like Transcript Line Format with exons= attribute
1172        //--other flags
1173        bool transcripts_Only:1; //default ; only keep recognized transcript features
1174        bool keep_Genes:1; //for transcriptsOnly, do not discard genes from gflst
1175        bool keep_Attrs:1;
1176        bool keep_AllExonAttrs:1; //when keep_Attrs, do not attempt to reduce exon attributes
1177        bool noExonAttrs:1;
1178        bool ignoreLocus:1; //discard locus features and attributes from input
1179        bool merge_CloseExons:1;
1180        bool gene2exon:1;
1181        bool sortByLoc:1; //if records should be sorted by location
1182        bool refAlphaSort:1; //if sortByLoc, reference sequences are
1183                        // sorted lexically instead of their id#
1184        //Ensembl ID processing:
1185        bool xEnsemblID:1; //for ensemble GTF merge gene_version and transcript_version into the ID
1186                 //for ensemble GFF3, cannot merge version (!), just remove "transcript:" and "gene:" prefixes
1187        bool gff_warns:1;
1188     };
1189   };
1190   //char* lastReadNext;
1191   FILE* fh;
1192   char* fname;  //optional fasta file with the underlying genomic sequence to be attached to this reader
1193   GFFCommentParser* commentParser;
1194   GffLine* gffline;
1195   BEDLine* bedline;
1196   //bool transcriptsOnly; //keep only transcripts w/ their exon/CDS features
1197   //bool gene2exon;  // for childless genes: add an exon as the entire gene span
1198   GHash<int> discarded_ids; //for transcriptsOnly mode, keep track
1199                             // of discarded parent IDs
1200   GHash< GPVec<GffObj>* > phash; //transcript_id => GPVec<GffObj>(false)
1201   char* gfoBuildId(const char* id, const char* ctg);
1202   //void gfoRemove(const char* id, const char* ctg);
1203   GffObj* gfoAdd(GffObj* gfo);
1204   GffObj* gfoAdd(GPVec<GffObj>& glst, GffObj* gfo);
1205   GffObj* gfoReplace(GPVec<GffObj>& glst, GffObj* gfo, GffObj* toreplace);
1206   // const char* id, const char* ctg, char strand, GVec<GfoHolder>** glst, uint start, uint end
1207   bool pFind(const char* id, GPVec<GffObj>*& glst);
1208   GffObj* gfoFind(const char* id, GPVec<GffObj>* & glst, const char* ctg=NULL,
1209 	                                         char strand=0, uint start=0, uint end=0);
1210   CNonExon* subfPoolCheck(GffLine* gffline, GHash<CNonExon*>& pex, char*& subp_name);
1211   void subfPoolAdd(GHash<CNonExon*>& pex, GffObj* newgfo);
1212   GffObj* promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon*>& pex);
1213 
1214 #ifdef CUFFLINKS
1215      boost::crc_32_type  _crc_result;
1216 #endif
1217  public:
1218   GPVec<GSeqStat> gseqtable; //table with all genomic sequences, but only current GXF gseq ID indices will have non-NULL
1219   //GffNames* names; //just a pointer to the global static Gff names repository
1220   GfList gflst; //keeps track of all GffObj records being read (when readAll() is used)
1221   GffObj* newGffRec(GffLine* gffline, GffObj* parent=NULL, GffExon* pexon=NULL,
1222 		       GPVec<GffObj>* glst=NULL, bool replace_parent=false);
1223   GffObj* newGffRec(BEDLine* bedline, GPVec<GffObj>* glst=NULL);
1224   //GffObj* replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx);
1225   GffObj* updateGffRec(GffObj* prevgfo, GffLine* gffline);
1226   GffObj* updateParent(GffObj* newgfh, GffObj* parent);
1227   bool readExonFeature(GffObj* prevgfo, GffLine* gffline, GHash<CNonExon*>* pex = NULL);
1228   GPVec<GSeqStat> gseqStats; //populated after finalize() with only the ref seqs in this file
linebuf(NULL)1229   GffReader(FILE* f=NULL, bool t_only=false, bool sort=false):linebuf(NULL), fpos(0),
1230 		  buflen(0), flags(0), fh(f), fname(NULL), commentParser(NULL), gffline(NULL),
1231 		  bedline(NULL), discarded_ids(true), phash(true), gseqtable(1,true),
1232 		  gflst(), gseqStats(1, false) {
1233       GMALLOC(linebuf, GFF_LINELEN);
1234       buflen=GFF_LINELEN-1;
1235       gffnames_ref(GffObj::names);
1236       //gff_warns=gff_show_warnings;
1237       transcripts_Only=t_only;
1238       sortByLoc=sort;
1239       noExonAttrs=true;
1240       //lastReadNext=NULL;
1241   }
1242   /*
1243   void init(FILE *f, bool t_only=false, bool sortbyloc=false, bool g2exon=false) {
1244       fname=NULL;
1245       fh=f;
1246       if (fh!=NULL) rewind(fh);
1247       fpos=0;
1248       flags=0;
1249       transcriptsOnly=t_only;
1250       gflst.sortedByLoc(sortbyloc);
1251       gene2exon=g2exon;
1252   }
1253   */
gene2Exon(bool v)1254   void gene2Exon(bool v) { gene2exon=v;}
procEnsemblID(bool v)1255   void procEnsemblID(bool v) { xEnsemblID=v;}
procEnsemblID()1256   bool procEnsemblID() { return xEnsemblID; }
1257   void enableSorting(bool sorting=true) { sortByLoc=sorting; }
getSorting()1258   bool getSorting() { return sortByLoc; }
1259   void isBED(bool v=true) { is_BED=v; } //should be set before any parsing!
1260   void isTLF(bool v=true) { is_TLF=v; } //should be set before any parsing!
1261   void keepAttrs(bool keep_attrs=true, bool discardExonAttrs=true, bool preserve_exon_attrs=false) {
1262 	  keep_Attrs=keep_attrs;
1263 	  noExonAttrs=discardExonAttrs;
1264 	  keep_AllExonAttrs=preserve_exon_attrs;
1265   }
transcriptsOnly(bool t_only)1266   void transcriptsOnly(bool t_only) { transcripts_Only=t_only; }
transcriptsOnly()1267   bool transcriptsOnly() { return transcripts_Only; }
setIgnoreLocus(bool nolocus)1268   void setIgnoreLocus(bool nolocus) { ignoreLocus=nolocus; }
keepGenes(bool keep_genes)1269   void keepGenes(bool keep_genes) {
1270 	  keep_Genes=keep_genes;
1271   }
keepGenes()1272   bool keepGenes() { return keep_Genes; }
1273   void mergeCloseExons(bool merge_close_exons=true) {
1274 	  merge_CloseExons=merge_close_exons;
1275   }
showWarnings(bool v)1276   void showWarnings(bool v) {
1277      gff_warns=v;
1278      //gff_show_warnings=v;
1279   }
showWarnings()1280   bool showWarnings() {
1281      return gff_warns;
1282   }
1283   void setRefAlphaSorted(bool v=true) {
1284 	refAlphaSort=v;
1285 	if (v) sortByLoc=true;
1286   }
1287   void setCommentParser(GFFCommentParser* cmParser=NULL) {
1288 	  commentParser=cmParser;
1289   }
1290 
linebuf(NULL)1291   GffReader(const char* fn, bool t_only=false, bool sort=false):linebuf(NULL), fpos(0),
1292 	  		  buflen(0), flags(0), fh(NULL), fname(NULL), commentParser(NULL),
1293 			  gffline(NULL), bedline(NULL), discarded_ids(true),
1294 			  phash(true), gseqtable(1,true), gflst(), gseqStats(1,false) {
1295       //gff_warns=gff_show_warnings;
1296       gffnames_ref(GffObj::names);
1297       noExonAttrs=true;
1298       transcripts_Only=t_only;
1299       sortByLoc=sort;
1300       fname=Gstrdup(fn);
1301       fh=fopen(fname, "rb");
1302       GMALLOC(linebuf, GFF_LINELEN);
1303       buflen=GFF_LINELEN-1;
1304       //lastReadNext=NULL;
1305       }
1306 
~GffReader()1307  ~GffReader() {
1308       delete gffline;
1309       gffline=NULL;
1310       fpos=0;
1311       if (fh && fh!=stdin) fclose(fh);
1312       gflst.freeUnused();
1313       gflst.Clear();
1314       discarded_ids.Clear();
1315       phash.Clear();
1316       GFREE(fname);
1317       GFREE(linebuf);
1318       //GFREE(lastReadNext);
1319       gffnames_unref(GffObj::names);
1320   }
1321 
1322 
1323   GffLine* nextGffLine();
1324   BEDLine* nextBEDLine();
1325 
1326   // load all subfeatures, re-group them:
1327   void readAll();
1328   void readAll(bool keepAttr, bool mergeCloseExons=false, bool noExonAttr=true) {
1329 	  this->keep_Attrs=keepAttr;
1330 	  this->merge_CloseExons=mergeCloseExons;
1331 	  this->noExonAttrs=noExonAttr;
1332 	  readAll();
1333   }
1334 
1335 
1336   //only for well-formed files: BED or GxF where exons are strictly grouped by their transcript_id/Parent
1337   GffObj* readNext(); //user must free the returned GffObj* !
1338 
1339 #ifdef CUFFLINKS
current_crc_result()1340     boost::crc_32_type current_crc_result() const { return _crc_result; }
1341 #endif
1342 
1343 }; // end of GffReader
1344 
1345 // ----------------------------------------------------------
1346 // -- auxiliary classes for GffObj::processGeneSegments() --
1347 class GSegMatch { //keep track of "matching" overlaps of a GeneCDSChain with multiple GeneSegment containers
1348   public:
1349    int child_idx; //index of matching _gene_segment GffObj in gene->children[] list
1350    int noncov; //number of "non-covered" bases in the GeneSegment
1351    int gsegidx; //index of _gene_segment in GVec<int> geneSegs
1352      // (i.e. UTRs + implied introns if exons are missing)
1353    bool operator<(GSegMatch& o) { return (noncov<o.noncov); }
1354    bool operator==(GSegMatch& o) { return (noncov==o.noncov); }
child_idx(cidx)1355    GSegMatch(int cidx=-1, int ncov=-1, int gsidx=-1):child_idx(cidx),
1356     	   noncov(ncov), gsegidx(gsidx) { }
1357 };
1358 
1359 class GeneCDS: public GSeg {
1360   public:
1361 	int idx; //index of this CDS entry in this gene->cdss[] list
GSeg(cstart,cend)1362 	GeneCDS(int i=-1, uint cstart=0, uint cend=0):GSeg(cstart, cend), idx(i) {
1363 	}
1364 };
1365 
1366 class GeneCDSChain: public GSeg { //keep track of CDS chains of the gene and their boundaries
1367   public:
1368 	GVec<GeneCDS> cdsList; //all CDSs in this chain
1369 	GArray<GSegMatch> mxs; //list of "matching" container X_gene_segment transcripts;
GeneCDSChain()1370 	GeneCDSChain():cdsList(),mxs() { }
GeneCDSChain(int idx,uint cstart,uint cend)1371 	GeneCDSChain(int idx, uint cstart, uint cend):GSeg(cstart, cend),
1372 	    	cdsList(),mxs(true) {
1373 	    addCDS(idx, cstart, cend);
1374 
1375 	}
addCDS(int idx,uint cstart,uint cend)1376 	void addCDS(int idx, uint cstart, uint cend) {
1377 	    GeneCDS cds(idx, cstart, cend);
1378 	    cdsList.Add(cds);
1379 	    expandInclude(cstart, cend);
1380 	}
addMatch(int childidx,int ncov,int gsegidx)1381 	void addMatch(int childidx, int ncov, int gsegidx) {
1382 	    GSegMatch segmatch(childidx, ncov, gsegidx);
1383 	    mxs.Add(segmatch);
1384 	}
singleExonCDSMatch(uint tstart,uint tend,int & ncov)1385 	bool singleExonCDSMatch(uint tstart, uint tend, int& ncov) {
1386 	    if (start>=tstart && end<=tend) {
1387 	    	ncov=start-tstart + tend-end;
1388 	    	//add all CDS-"introns"
1389 	    	if (cdsList.Count()>1)
1390 	    		//shouldn't really consider this a valid "match"
1391 	    		for (int i=1;i<cdsList.Count();i++)
1392 	    			ncov+=cdsList[i].start-cdsList[i-1].end-1;
1393 	    	return true;
1394 	    }
1395 	    return false;
1396 	}
singleCDStoExon(GffObj & t,int & ncov)1397 	bool singleCDStoExon(GffObj&t, int& ncov) {
1398 	    //cdsList[0] must be contained in a single exon of t
1399 	    int nc=0;
1400 	    bool match=false;
1401 	    for (int i=0;i<t.exons.Count();i++) {
1402 	    	if (t.exons[i]->overlap(cdsList[0])) {
1403 	    	   if (cdsList[0].start>=t.exons[i]->start &&
1404 	    			cdsList[0].end<=t.exons[i]->end) {
1405 	    		  match=true;
1406 	    		  nc+=cdsList[0].start-t.exons[i]->start+t.exons[i]->end+cdsList[0].end;
1407 	    	   } //contained in this exon
1408 	    	   else return false; //overlap, but not contained
1409 	    	   continue;
1410 	    	}
1411 	    	nc+=t.exons[i]->len();
1412 	    }
1413 	    if (!match) return false;
1414 	    ncov=nc;
1415 	    return true;
1416 	}
1417 
multiCDStoExon(GffObj & t,int & ncov)1418 	bool multiCDStoExon(GffObj &t, int& ncov) {
1419 	    //multi-CDS vs multi-exon t
1420 	    int nc=0;
1421 	    int e=0, c=0;
1422 	    int emax=t.exons.Count()-1;
1423 	    int cmax=cdsList.Count()-1;
1424 	    int mintrons=0; //matched introns
1425 	    while (e<emax && c<cmax) {
1426 	    	if (mintrons>0 &&
1427 	    			(cdsList[c].end!=t.exons[e]->end ||
1428 	    					cdsList[c+1].start!=t.exons[e+1]->start))
1429 	    		return false;
1430 	    	GSeg cintron(cdsList[c].end+1, cdsList[c+1].start-1);
1431 	    	GSeg eintron(t.exons[e]->end+1, t.exons[e+1]->start-1);
1432 	    	if (cintron.start>eintron.end) {
1433 	    		nc+=t.exons[e]->len();
1434 	    		e++;
1435 	    		continue;
1436 	    	}
1437 	    	if (eintron.start<=cintron.end) {
1438 	    		//intron overlap
1439 	    		if (cintron.start==eintron.start &&
1440 	    				cintron.end==eintron.end) {
1441 	    			//intron match
1442 	    			if (mintrons==0) {
1443 	    				if (cdsList[c].start<t.exons[e]->start) return false;
1444 	    				nc+=cdsList[c].start-t.exons[e]->start;
1445 	    			}
1446 	    			mintrons++;
1447 	    			c++;e++;
1448 	    			continue;
1449 	    		}
1450 	    		else return false;
1451 	    	}
1452 	    	c++; //should never get here, CDS shouldn't be have to catch up with e
1453 	    }
1454 	    if (mintrons<cdsList.Count()-1) return false;
1455         //c should be cmax, e should be the last exon with CDS
1456 	    nc+=t.exons[e]->end-cdsList[c].end;
1457 	    for(int i=e+1;i<t.exons.Count();i++)
1458 	    	nc+=t.exons[i]->len();
1459 	    ncov=nc;
1460 	    return true;
1461 	}
1462 
containedBy(GffObj & t,int & ncov)1463 	bool containedBy(GffObj& t, int& ncov) {
1464 
1465 	    // (Warning: t may have no defined exons!)
1466 	    //if yes: ncov will be set to the number of non-CDS-covered bases in t
1467 	    if (t.exons.Count()<2) {
1468            if (t.exons.Count()==0)
1469                //no exons defined, just check boundaries
1470                return singleExonCDSMatch(t.start, t.end, ncov);
1471            else //single-exon
1472                return singleExonCDSMatch(t.exons[0]->start, t.exons[0]->end, ncov);
1473 	    } //single or no exon
1474 	    else { //multi-exon transcript
1475 	    	if (start<t.exons.First()->start || end>t.exons.Last()->end)
1476 	    		return false; //no containment possible;
1477 	    	if (cdsList.Count()==1)
1478 	    		return singleCDStoExon(t, ncov);
1479             //check intron compatibility!
1480 	    }
1481 	    return true;
1482 	}
1483 };
1484 
1485 
1486 #endif
1487