1 #include "gff.h"
2 
3 GffNames* GffObj::names=NULL;
4 //global set of feature names, attribute names etc.
5 // -- common for all GffObjs in current application!
6 
7 const uint GFF_MAX_LOCUS = 7000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M
8 const uint GFF_MAX_EXON  =   30000; //longest known exon in human is ~11K
9 const uint GFF_MAX_INTRON= 6000000; //Ensembl shows a >5MB mouse intron
10 const int  GFF_MIN_INTRON = 4; //for mergeCloseExons option
11 //bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings()
12 int gff_fid_mRNA=0; //mRNA (has CDS)
13 int gff_fid_transcript=1; // generic "transcript" feature
14 int gff_fid_exon=2; // generic "exon"-like feature (exon,CDS,UTR,start/stop codon)
15 int gff_fid_CDS=3; // CDS feature (CDS, start/stop codon)
16 const char* exonTypes[]={ "None", "StartCodon", "StopCodon",
17 		  "CDS", "UTR", "CDS+UTR", "exon" };
18 
19 const GffScore GFFSCORE_NONE;
20 //const uint gfo_flag_LEVEL_MSK        = 0x00FF0000;
21 //const byte gfo_flagShift_LEVEL           = 16;
22 
gffnames_ref(GffNames * & n)23 void gffnames_ref(GffNames* &n) {
24   if (n==NULL) n=new GffNames();
25   n->numrefs++;
26 }
27 
gffnames_unref(GffNames * & n)28 void gffnames_unref(GffNames* &n) {
29   if (n==NULL) GError("Error: attempt to remove reference to null GffNames object!\n");
30   n->numrefs--;
31   if (n->numrefs==0) { delete n; n=NULL; }
32 }
33 
34 const int CLASSCODE_OVL_RANK = 15;
classcode_rank(char c)35 int classcode_rank(char c) {
36 	switch (c) {
37 		case '=': return 0; //intron chain match or full exon chain match if strict matching is enabled
38 		case '~': return 1; //intron chain match when strict matching is enabled
39 		case 'c': return 2; //containment, perfect partial match (transfrag < reference)
40 		case 'k': return 6; // reverse containment (reference < transfrag)
41 		case 'm': return 6; // full span overlap with all reference introns either matching or retained
42 		case 'n': return 6; // partial overlap transfrag with at least one intron retention
43 		case 'j': return 6; // multi-exon transfrag with at least one junction match
44 		case 'e': return 12; // single exon transfrag partially overlapping an intron of reference (possible pre-mRNA fragment)
45 		case 'o': return 14; // other generic exon overlap
46 	//****  >15 = no-overlaps (not on the same strand) from here on *****
47 		case 's': return 16; //"shadow" - an intron overlaps with a ref intron on the opposite strand (wrong strand mapping?)
48 		case 'x': return 18; // generic overlap on opposite strand (usually wrong strand mapping)
49 		case 'i': return 20; // intra-intron (transfrag fully contained within a reference intron)
50 		case 'y': return 30; // no exon overlap: ref exons fall within transfrag introns!
51 		case 'p': return 90; //polymerase run
52 		case 'r': return 92; //repeats
53 		case 'u': return 94; //intergenic
54 		case  0 : return 100;
55 		 default: return 96;
56 		}
57 }
58 
strExonType(char xtype)59 const char* strExonType(char xtype) {
60 	static const char* extbl[7]={"None", "start_codon", "stop_codon", "CDS", "UTR", "CDS_UTR", "exon"};
61 	if (xtype>0 && xtype<7)
62 	   return extbl[(int)xtype];
63 	else return "NULL";
64 }
65 
gfo_cmpByLoc(const pointer p1,const pointer p2)66 int gfo_cmpByLoc(const pointer p1, const pointer p2) {
67  GffObj& g1=*((GffObj*)p1);
68  GffObj& g2=*((GffObj*)p2);
69  if (g1.gseq_id==g2.gseq_id) {
70              if (g1.start!=g2.start)
71                     return (int)(g1.start-g2.start);
72                else if (g1.getLevel()!=g2.getLevel())
73                         return (int)(g1.getLevel()-g2.getLevel());
74                     else
75                         if (g1.end!=g2.end)
76                               return (int)(g1.end-g2.end);
77                         else return strcmp(g1.getID(), g2.getID());
78              }
79              else //return (int)(g1.gseq_id-g2.gseq_id); // input order !
80             	 return strcmp(g1.getGSeqName(), g2.getGSeqName()); //lexicographic !
81 }
82 
83 //comparator for ordering by reference sequence (chromosome) index
gfo_cmpRefByID(const pointer p1,const pointer p2)84 int gfo_cmpRefByID(const pointer p1, const pointer p2) {
85  GffObj& g1=*((GffObj*)p1);
86  GffObj& g2=*((GffObj*)p2);
87  if (g1.gseq_id==g2.gseq_id) {
88              if (g1.start!=g2.start)
89                     return (int)(g1.start-g2.start);
90                else if (g1.getLevel()!=g2.getLevel())
91                         return (int)(g1.getLevel()-g2.getLevel());
92                     else
93                         if (g1.end!=g2.end)
94                               return (int)(g1.end-g2.end);
95                         else return strcmp(g1.getID(), g2.getID());
96              }
97              else return (g1.gseq_id-g2.gseq_id); // sort refs by their id# order
98 }
99 
extractGFFAttr(char * & infostr,const char * oline,const char * attr,bool caseStrict,bool enforce_GTF2,int * rlen,bool deleteAttr)100 char* GffLine::extractGFFAttr(char* & infostr, const char* oline, const char* attr, bool caseStrict,
101 		   bool enforce_GTF2, int* rlen, bool deleteAttr) {
102  //parse a key attribute and remove it from the info string
103  //(only works for attributes that have values following them after ' ' or '=')
104  static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required for GTF) at line:\n%s\n";
105  int attrlen=strlen(attr);
106  char cend=attr[attrlen-1];
107  //char* pos = (caseStrict) ? strstr(info, attr) : strifind(info, attr);
108  //must make sure attr is not found in quoted text
109  char* pos=infostr;
110  char prevch=0;
111  bool in_str=false;
112  bool notfound=true;
113  int (*strcmpfn)(const char*, const char*, int) = caseStrict ? Gstrcmp : Gstricmp;
114  while (notfound && *pos) {
115    char ch=*pos;
116    if (ch=='"') {
117      in_str=!in_str;
118      pos++;
119      prevch=ch;
120      continue;
121      }
122    if (!in_str && (prevch==0 || prevch==' ' || prevch == ';')
123           && strcmpfn(attr, pos, attrlen)==0) {
124       //attr match found
125       //check for word boundary on right
126       char* epos=pos+attrlen;
127       if (cend=='=' || cend==' ' || *epos==0 || *epos==' ') {
128         notfound=false;
129         break;
130         }
131       //not a perfect match, move on
132       pos=epos;
133       prevch=*(pos-1);
134       continue;
135       }
136    //not a match or in_str
137    prevch=ch;
138    pos++;
139    }
140  if (notfound) return NULL;
141  char* vp=pos+attrlen;
142  while (*vp==' ') vp++;
143  if (*vp==';' || *vp==0) {
144       GMessage("Warning: cannot parse value of GFF attribute \"%s\" at line:\n%s\n", attr, oline);
145       return NULL;
146  }
147  bool dq_enclosed=false; //value string enclosed by double quotes
148  if (*vp=='"') {
149      dq_enclosed=true;
150      vp++;
151      }
152  if (enforce_GTF2 && !dq_enclosed)
153       GError(GTF2_ERR, attr, oline);
154  char* vend=vp;
155  if (dq_enclosed) {
156     while (*vend!='"' && *vend!=';' && *vend!=0) vend++;
157     }
158  else {
159     while (*vend!=';' && *vend!=0) vend++;
160     }
161  if (enforce_GTF2 && *vend!='"')
162      GError(GTF2_ERR, attr, oline);
163  char *r=Gstrdup(vp, vend-1);
164  if (rlen) *rlen = vend-vp;
165  if (deleteAttr) {//-- remove this attribute from infostr
166 	 while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++;
167 	 if (*vend==0) vend--;
168 	 for (char *src=vend, *dest=pos;;src++,dest++) {
169 	   *dest=*src; //shift the rest of infostr (copy over)
170 	   if (*src==0) break;
171 	 }
172  }
173  return r;
174 }
BEDLine(GffReader * reader,const char * l)175 BEDLine::BEDLine(GffReader* reader, const char* l): skip(true), dupline(NULL), line(NULL), llen(0),
176 		gseqname(NULL), fstart(0), fend(0), strand(0), ID(NULL), info(NULL),
177 		cds_start(0), cds_end(0), cds_phase(0), exons(1) {
178   if (reader==NULL || l==NULL) return;
179   llen=strlen(l);
180   GMALLOC(line,llen+1);
181   memcpy(line, l, llen+1);
182   GMALLOC(dupline, llen+1);
183   memcpy(dupline, l, llen+1);
184   char* t[14];
185   int i=0;
186   int tidx=1;
187   t[0]=line;
188   if (startsWith(line, "browser ") || startsWith(line, "track "))
189 	  return;
190   while (line[i]!=0) {
191    if (line[i]=='\t') {
192     line[i]=0;
193     t[tidx]=line+i+1;
194     tidx++;
195     //our custom BED-13+ format, with GFF3 attributes in 13th column
196     if (tidx>12) { info=t[12]; break; }
197    }
198    i++;
199   }
200   /* if (tidx<6) { // require BED-6+ lines
201    GMessage("Warning: 6+ BED columns expected, instead found:\n%s\n", l);
202    return;
203    }
204   */
205   gseqname=t[0];
206   char* p=t[1];
207   if (!parseUInt(p,fstart)) {
208     GMessage("Warning: invalid BED start coordinate at line:\n%s\n",l);
209     return;
210     }
211   ++fstart; //BED start is 0 based
212   p=t[2];
213   if (!parseUInt(p,fend)) {
214     GMessage("Warning: invalid BED end coordinate at line:\n%s\n",l);
215     return;
216     }
217   if (fend<fstart) Gswap(fend,fstart); //make sure fstart<=fend, always
218   if (tidx>5) {
219 	  strand=*t[5];
220 	  if (strand!='-' && strand !='.' && strand !='+') {
221 		  GMessage("Warning: unrecognized BED strand at line:\n%s\n",l);
222 		  return;
223 	  }
224   }
225   else strand='.';
226   //if (tidx>12) ID=t[12];
227   //        else ID=t[3];
228   ID=t[3];
229   //now parse the exons, if any
230   if (tidx>11) {
231 	  int numexons=0;
232 	  p=t[9];
233 	  if (!parseInt(p, numexons) || numexons<=0) {
234 	      GMessage("Warning: invalid BED block count at line:\n%s\n",l);
235 	      return;
236 	  }
237 	  char** blen;
238 	  char** bstart;
239 	  GMALLOC(blen, numexons * sizeof(char*));
240 	  GMALLOC(bstart, numexons * sizeof(char*));
241 	  i=0;
242 	  int b=1;
243 	  blen[0]=t[10];
244 	  while (t[10][i]!=0 && b<=numexons) {
245 		if (t[10][i]==',') {
246 			t[10][i]=0;
247 			if (b<numexons)
248 			  blen[b]=t[10]+i+1;
249 			b++;
250 		}
251 		i++;
252 	  }
253 	  b=1;i=0;
254 	  bstart[0]=t[11];
255 	  while (t[11][i]!=0 && b<=numexons) {
256 		if (t[11][i]==',') {
257 			t[11][i]=0;
258 			if (b<numexons)
259 			  bstart[b]=t[11]+i+1;
260 			b++;
261 		}
262 		i++;
263 	  }
264 	  GSeg ex;
265 	  for (i=0;i<numexons;++i) {
266 		  int exonlen;
267 		  if (!strToInt(blen[i], exonlen) || exonlen<=0) {
268 		      GMessage("Warning: invalid BED block size %s at line:\n%s\n",blen[i], l);
269 		      return;
270 		  }
271 		  int exonstart;
272 		  if (!strToInt(bstart[i], exonstart) || exonstart<0) {
273 		      GMessage("Warning: invalid BED block start %s at line:\n%s\n",bstart[i], l);
274 		      return;
275 		  }
276 		  if (i==0 && exonstart!=0) {
277 			  GMessage("Warning: first BED block start is %d>0 at line:\n%s\n",exonstart, l);
278 			  return;
279 		  }
280 		  exonstart+=fstart;
281 		  uint exonend=exonstart+exonlen-1;
282 		  if ((uint)exonstart>fend || exonend>fend) {
283 			  GMessage("Warning: BED exon %d-%d is outside record boundary at line:\n%s\n",exonstart,exonend, l);
284 			  return;
285 		  }
286 		  ex.start=exonstart;ex.end=exonend;
287 		  exons.Add(ex);
288 	  }
289 	  GFREE(blen);
290 	  GFREE(bstart);
291   }
292   else { //take it as single-exon transcript
293 	  GSeg v(fstart, fend);
294 	  exons.Add(v);
295   }
296   if (info!=NULL) {
297 	  char* cdstr=GffLine::extractGFFAttr(info, dupline, "CDS=");
298 	  if (cdstr) {
299 		 char* p=strchr(cdstr, ':');
300 		 if (p!=NULL) {
301 			*p='\0'; ++p;
302 		 }
303 		if (strToUInt(cdstr, cds_start) && cds_start>=fstart-1) {
304 			++cds_start;
305 			if (!strToUInt(p, cds_end) || cds_end>fend) {
306 				GMessage("Warning: invalid CDS (%d-%d) discarded for line:\n%s\n",
307 						    cds_start, cds_end, dupline);
308 				cds_start=0;
309 				cds_end=0; //invalid CDS coordinates
310 			}
311 		}
312 		char* cdstr_phase=NULL;
313 		if (cds_start>0 && (cdstr_phase=GffLine::extractGFFAttr(info, dupline, "CDSphase="))!=NULL) {
314 			cds_phase=cdstr_phase[0];
315 			GFREE(cdstr_phase);
316 		}
317 		GFREE(cdstr);
318 	  }
319   }
320   if (cds_start==0 && cds_end==0 && tidx>7) {
321 	//check if columns 7,8 can be reasonably assumed to be CDS start-end coordinates
322 	if (strToUInt(t[6], cds_start) && strToUInt(t[7], cds_end) && cds_end>cds_start) {
323        if (cds_start>=fstart-1 && cds_end<=fend)
324     	    cds_start++;
325        else { cds_start=0; cds_end=0; }
326 	}
327   }
328   skip=false;
329 }
330 
parseSegmentList(GVec<GSeg> & segs,char * str)331 bool GffLine::parseSegmentList(GVec<GSeg>& segs, char* str) {
332 	bool segs_valid=false;
333 	char* p=strchr(str, '-');
334 	if (p!=NULL && p>str) {
335 		GDynArray<char*> ss;
336 		strsplit(str, ss, ',');
337 		GSeg seg;
338 		segs_valid=true;
339 		for (uint i=0;i<ss.Count();++i) {
340 			char* p=strchr(ss[i], '-');
341 			if (p==NULL) {
342 				segs_valid=false;
343 				break;
344 			}
345 			*p='\0'; ++p;
346 			int xstart=0, xend=0;
347 			if (!strToInt(ss[i], xstart) || xstart<(int)fstart || xstart>(int)fend){
348 				segs_valid=false;
349 				break;
350 			}
351 			if (!strToInt(p, xend) || xend<(int)fstart || xend>(int)fend) {
352 				segs_valid=false;
353 				break;
354 			}
355 			if (xstart>xend) { seg.start=(uint)xend;seg.end=(uint)xstart; }
356 			else    { seg.start=(uint)xstart;seg.end=(uint)xend; }
357 			segs.Add(seg);
358 		} //parse all CDS segments
359 		if (segs_valid) {
360 			if (segs.Count()>1) segs.Sort();
361 		} else
362 			segs.Clear();
363 	}
364 	return segs_valid;
365 }
366 
GffLine(GffReader * reader,const char * l)367 GffLine::GffLine(GffReader* reader, const char* l): _parents(NULL), _parents_len(0),
368 		dupline(NULL), line(NULL), llen(0), gseqname(NULL), track(NULL),
369 		ftype(NULL), ftype_id(-1), info(NULL), fstart(0), fend(0), //qstart(0), qend(0), qlen(0),
370 		score(0), score_decimals(-1), strand(0), flags(0), exontype(exgffNone), phase(0), cds_start(0), cds_end(0),
371 		exons(), cdss(), gene_name(NULL), gene_id(NULL), parents(NULL), num_parents(0), ID(NULL) {
372  llen=strlen(l);
373  GMALLOC(line,llen+1);
374  memcpy(line, l, llen+1);
375  GMALLOC(dupline, llen+1);
376  memcpy(dupline, l, llen+1);
377  skipLine=true; //clear only if we make it to the end of this function
378  char* t[9];
379  int i=0;
380  int tidx=1;
381  t[0]=line;
382  char fnamelc[128];
383  while (line[i]!=0) {
384   if (line[i]=='\t') {
385    line[i]=0;
386    t[tidx]=line+i+1;
387    tidx++;
388    if (tidx>8) break;
389    }
390   i++;
391   }
392  if (tidx<8) { // ignore non-GFF lines
393   return;
394  }
395  gffWarnings=reader->gff_warns;
396  gseqname=t[0];
397  track=t[1];
398  ftype=t[2];
399  info=t[8];
400  char* p=t[3];
401  if (!parseUInt(p,fstart)) {
402    //chromosome_band entries in Flybase
403    GMessage("Warning: invalid start coordinate at line:\n%s\n",l);
404    return;
405    }
406  p=t[4];
407  if (!parseUInt(p,fend)) {
408    GMessage("Warning: invalid end coordinate at line:\n%s\n",l);
409    return;
410    }
411  if (fend<fstart) {
412 	 GMessage("Error: invalid feature coordinates (end<start!) at line:\n%s\n",l);
413 	 Gswap(fend,fstart); //make sure fstart<=fend, always
414 	 //return;
415  }
416  p=t[5];
417  if (p[0]=='.' && p[1]==0) {
418   score=0;
419   score_decimals=-1;
420  }
421  else {
422   score_decimals=0;
423   //count decimals
424   char* pd=strchr(p,'.');
425   if (pd) {
426 	  ++pd;
427 	  char* pde=pd;
428 	  while ((*pde)!=0) ++pde;
429 	  score_decimals=pde-pd;
430   }
431   if (!parseFloat(p, score))
432        GError("Error parsing feature score from GFF line:\n%s\n",l);
433 
434   }
435  strand=*t[6];
436  if (strand!='+' && strand!='-' && strand!='.')
437      GError("Error parsing strand (%c) from GFF line:\n%s\n",strand,l);
438  phase=*t[7]; // must be '.', '0', '1' or '2'
439  // exon/CDS/mrna filter
440  strncpy(fnamelc, ftype, 127);
441  fnamelc[127]=0;
442  strlower(fnamelc); //convert to lower case
443  if (endsWith(fnamelc, "match")) {
444    //TODO: do not discard generic cDNA_match/protein_match features, convert them internally
445    // (when a hit chain has multiple _match features with the same ID, e.g. multiple HSPs with the same subject)
446    // and set GffObj::flag_DISCONTINUOUS
447    return;
448  }
449  bool is_t_data=false;
450  bool someRNA=false;
451  if (strstr(fnamelc, "utr")!=NULL) {
452 	 exontype=exgffUTR;
453 	 is_exon=true;
454 	 is_t_data=true;
455  }
456  else if (endsWith(fnamelc, "exon")) {
457 	 exontype=exgffExon;
458 	 is_exon=true;
459 	 is_t_data=true;
460  }
461  else if (strstr(fnamelc, "stop")!=NULL &&
462 		 (strstr(fnamelc, "codon")!=NULL || strstr(fnamelc, "cds")!=NULL) &&
463 		 strstr(fnamelc, "redefined")==NULL && strstr(fnamelc, "selenocysteine")==NULL) {
464 	 exontype=exgffStopCodon;
465 	 is_exon=true;
466 	 is_cds=true; //though some place it outside the last CDS segment
467 	 is_t_data=true;
468  }
469  else if (strstr(fnamelc, "start") &&
470 		 ((strstr(fnamelc, "codon")!=NULL) || strstr(fnamelc, "cds")!=NULL)){
471 	 exontype=exgffStartCodon;
472 	 is_exon=true;
473 	 is_cds=true;
474 	 is_t_data=true;
475  }
476  else if (strcmp(fnamelc, "cds")==0) {
477 	 exontype=exgffCDS;
478 	 is_exon=true;
479 	 is_cds=true;
480 	 is_t_data=true;
481  }
482  else if (startsWith(fnamelc, "intron") || endsWith(fnamelc, "intron")) {
483 	 exontype=exgffIntron;
484  }
485  else if ((someRNA=endsWith(fnamelc,"rna")) || endsWith(fnamelc,"transcript")) { // || startsWith(fnamelc+1, "rna")) {
486 	 is_transcript=true;
487 	 is_t_data=true;
488 	 if (someRNA) ftype_id=GffObj::names->feats.addName(ftype);
489  }
490  else if (endsWith(fnamelc, "_gene_segment")) {
491 	 is_transcript=true;
492 	 is_t_data=true;
493 	 is_gene_segment=true;
494  }
495  else if (endsWith(fnamelc, "gene") || startsWith(fnamelc, "gene")) {
496 	 is_gene=true;
497 	 is_t_data=true; //because its name will be attached to parented transcripts
498  }
499  char* Parent=NULL;
500  /*
501   Rejecting non-transcript lines early if only transcripts are requested ?!
502   It would be faster to do this here but there are GFF cases when we reject an
503   unusual parent feature here  (e.g. protein with CDS children) and then
504   their exon/CDS children show up and get assigned to an implicit parent mRNA
505   The solution is to still load this parent as GffObj for now and BAN it later
506   so its children get dismissed/discarded as well.
507  */
508  if (reader->ignoreLocus) {
509 	 if (strcmp(ftype, "locus")==0) return;
510 	 if (is_transcript || is_gene) {
511 		 char* locus=NULL;
512         if (reader->is_gff3 || reader->gff_type==0)
513         	locus=extractAttr("locus=");
514 		else locus=extractAttr("locus");
515         if (locus!=NULL) { GFREE(locus); }
516 	 }
517  }
518  char *gtf_tid=NULL;
519  char *gtf_gid=NULL;
520  if (reader->is_gff3 || reader->gff_type==0) {
521 	ID=extractAttr("ID=",true);
522 	Parent=extractAttr("Parent=",true);
523 	if (reader->gff_type==0) {
524 		if (ID!=NULL || Parent!=NULL) reader->is_gff3=true;
525 			else { //check if it looks like a GTF
526 				gtf_tid=extractAttr("transcript_id", true, true);
527 				if (gtf_tid==NULL) {
528 					gtf_gid=extractAttr("gene_id", true, true);
529 					if (gtf_gid==NULL) return; //cannot determine file type yet
530 				}
531 				reader->is_gtf=true;
532 			}
533 	}
534  }
535 
536  if (reader->is_gff3) {
537 	 //parse as GFF3
538 	 //if (ID==NULL && Parent==NULL) return; //silently ignore unidentified/unlinked features
539 	 if (ID!=NULL) {
540 		 //has ID attr so it's likely to be a parent feature
541 
542 		 //look for explicit gene name
543 		 gene_name=getAttrValue("gene_name=");
544 		 if (gene_name==NULL) {
545 			 gene_name=getAttrValue("geneName=");
546 			 if (gene_name==NULL) {
547 				 gene_name=getAttrValue("gene_sym=");
548 				 if (gene_name==NULL) {
549 					 gene_name=getAttrValue("gene=");
550 				 }
551 			 }
552 		 }
553 		 gene_id=getAttrValue("geneID=");
554 		 if (gene_id==NULL) {
555 			 gene_id=getAttrValue("gene_id=");
556 		 }
557 		 /*
558 		 if (is_gene) { //--WARNING: this might be mislabeled (e.g. TAIR: "mRNA_TE_gene")
559 			 //---special case: keep the Name and ID attributes of the gene feature
560 			 //if (gene_name==NULL)
561 			 //  gene_name=extractAttr("Name=");
562 			 if (gene_id==NULL) //the ID is also gene_id in this case
563 				 gene_id=Gstrdup(ID);
564 			 //skip=false;
565 			 //return;
566 			 //-- we don't care about gene parents.. unless it's a mislabeled "gene" feature
567 		 } //gene feature (probably)
568 		*/
569 		 //--parse exons for TLF
570 		 char* segstr=extractAttr("exons=");
571 		 bool exons_valid=false;
572 		 if (segstr) {
573 			 exons_valid=parseSegmentList(exons, segstr);
574 			 char* exoncountstr=extractAttr("exonCount=");
575 			 if (exoncountstr) {
576 				 int exoncount=0;
577 				 if (!strToInt(exoncountstr, exoncount) || exoncount!=(int)exons.Count())
578 					 GMessage("Warning: exonCount attribute value doesn't match the exons attribute!\n");
579 				 GFREE(exoncountstr);
580 			 }
581 			 GFREE(segstr);
582 		 }
583 		 if (exons_valid) {
584 			 bool validCDS=false;
585 			 segstr=extractAttr("CDS=");
586 			 if (segstr) {
587 				 char* p=strchr(segstr, ':');
588 				 if (p!=NULL) { // CDS=start:end format
589 					 *p='\0'; ++p;
590 					 validCDS=true;
591 					 if (validCDS && strToUInt(segstr, cds_start) && cds_start>=fstart) {
592 						 if (!strToUInt(p, cds_end) || cds_end>fend) {
593 							 validCDS=false;
594 						 }
595 					 }
596 					 if (!validCDS || (int)cds_start<=0 || (int)cds_end<=0) {
597 						 GMessage("Warning: invalid CDS (%d-%d) discarded for line:\n%s\n",
598 								 cds_start, cds_end, dupline);
599 						 cds_start=0;
600 						 cds_end=0;
601 					 }
602 				 }  //CDS=start:end format
603 				 else { //CDS = list of start-end segments, just like the exons
604 					 validCDS=parseSegmentList(cdss, segstr);
605 					 if (validCDS && cdss.Count()>0) {
606 						 if (cds_start==0) cds_start=cdss.First().start;
607 						 if (cds_end==0) cds_end=cdss.Last().end;
608 					 }
609 				 }
610 				 GFREE(segstr);
611 			 }
612 			 if (validCDS) {
613 				 char* cds_phase=NULL;
614 				 if ((cds_phase=extractAttr("CDSphase="))!=NULL) {
615 					 phase=cds_phase[0];
616 					 GFREE(cds_phase);
617 				 }
618 			 } //CDS found
619 		 }//has valid exons
620 	 }// has GFF3 ID
621 	 if (Parent!=NULL) {
622 		 //keep Parent attr
623 		 //parse multiple parents
624 		 num_parents=1;
625 		 p=Parent;
626 		 int last_delim_pos=-1;
627 		 while (*p!=';' && *p!=0) {
628 			 if (*p==',' && *(p+1)!=0 && *(p+1)!=';') {
629 				 num_parents++;
630 				 last_delim_pos=(p-Parent);
631 			 }
632 			 p++;
633 		 }
634 		 _parents_len=p-Parent+1;
635 		 _parents=Parent;
636 		 GMALLOC(parents, num_parents*sizeof(char*));
637 		 parents[0]=_parents;
638 		 int i=1;
639 		 if (last_delim_pos>0) {
640 			 for (p=_parents+1;p<=_parents+last_delim_pos;p++) {
641 				 if (*p==',') {
642 					 char* ep=p-1;
643 					 while (*ep==' ' && ep>_parents) ep--;
644 					 *(ep+1)=0; //end the string there
645 					 parents[i]=p+1;
646 					 i++;
647 				 }
648 			 }
649 		 }
650 	 } //has Parent field
651 	 //special case for gene_id: for genes, this is the ID
652 	 if (is_gene && gene_id==NULL && ID!=NULL) {
653 	    	 gene_id=Gstrdup(ID);
654 	 }
655 	 //parse other potentially useful GFF3 attributes
656 	 /*
657 	 if ((p=strstr(info,"Target="))!=NULL) { //has Target attr
658 		 p+=7;
659 		 while (*p!=';' && *p!=0 && *p!=' ') p++;
660 		 if (*p!=' ') {
661 			 GError("Error parsing target coordinates from GFF line:\n%s\n",l);
662 		 }
663 		 if (!parseUInt(p,qstart))
664 			 GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
665 		 if (*p!=' ') {
666 			 GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
667 		 }
668 		 p++;
669 		 if (!parseUInt(p,qend))
670 			 GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
671 	 }
672 	 if ((p=strifind(info,"Qreg="))!=NULL) { //has Qreg attr
673 		 p+=5;
674 		 if (!parseUInt(p,qstart))
675 			 GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
676 		 if (*p!='-') {
677 			 GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
678 		 }
679 		 p++;
680 		 if (!parseUInt(p,qend))
681 			 GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
682 		 if (*p=='|' || *p==':') {
683 			 p++;
684 			 if (!parseUInt(p,qlen))
685 				 GError("Error parsing target length from GFF Qreg|: \n%s\n",l);
686 		 }
687 	 }//has Qreg attr
688 	 if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) {
689 		 p+=5;
690 		 if (!parseUInt(p,qlen))
691 			 GError("Error parsing target length from GFF Qlen:\n%s\n",l);
692 	 }
693 	  */
694  } //GFF3
695  else { // ----------------- GTF syntax ------------------
696 	 if (reader->transcripts_Only && !is_t_data) {
697 		 return; //alwasys skip unrecognized non-transcript features in GTF
698 	 }
699 	 if (is_gene) {
700 		 reader->gtf_gene=true;
701 		 ID = (gtf_tid!=NULL) ? gtf_tid : extractAttr("transcript_id", true, true); //Ensemble GTF might lack this
702 		 gene_id = (gtf_gid!=NULL) ? gtf_gid : extractAttr("gene_id", true, true);
703 		 if (ID==NULL) {
704 			 //no transcript_id -- this should not be valid GTF2 format, but Ensembl (Gencode?)
705 			 //has being known to add "gene" features with only gene_id in their GTF
706 			 if (gene_id!=NULL) { //likely a gene feature line (Ensembl!)
707 			 		 ID=Gstrdup(gene_id); //take over as ID (for defective GTF lacking transcript_id)
708 			 }
709 		 }
710 		 // else if (strcmp(gene_id, ID)==0) //GENCODE v20 gene feature ?
711 	 }
712 	 else if (is_transcript) {
713 		 ID = (gtf_tid!=NULL) ? gtf_tid : extractAttr("transcript_id", true, true);
714 		//gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID
715 		 if (ID==NULL) {
716 			 	 //something is wrong here, cannot parse the GTF ID
717 				 GMessage("Warning: invalid GTF record, transcript_id not found:\n%s\n", l);
718 				 return;
719 		 }
720 		 gene_id = (gtf_gid!=NULL) ? gtf_gid : extractAttr("gene_id", true, true);
721 		if (gene_id!=NULL)
722 			Parent=Gstrdup(gene_id);
723 		reader->gtf_transcript=true;
724 		is_gtf_transcript=1;
725 	 } else { //must be an exon type
726 		 Parent = (gtf_tid!=NULL) ? gtf_tid : extractAttr("transcript_id", true, true);
727 		 gene_id = (gtf_gid!=NULL) ? gtf_gid : extractAttr("gene_id", true, true); // for GTF this is the only attribute accepted as geneID
728 		 //old pre-GTF2 formats like Jigsaw's (legacy support)
729 		 if (Parent==NULL && exontype==exgffExon) {
730 			 if (startsWith(track,"jigsaw")) {
731 				 is_cds=true;
732 				 strcpy(track,"jigsaw");
733 				 p=strchr(info,';');
734 				 if (p==NULL) { Parent=Gstrdup(info); info=NULL; }
735 				 else { Parent=Gstrdup(info,p-1);
736 				 info=p+1;
737 				 }
738 			 }
739 		 }
740 		 if (Parent==NULL) {
741 			 	 //something is wrong here couldn't parse the transcript ID for this feature
742 				 GMessage("Warning: invalid GTF record, transcript_id not found:\n%s\n", l);
743 				 return;
744 		 }
745 	 }
746 	 //more GTF attribute parsing
747 	 if (is_gene && gene_id==NULL && ID!=NULL)
748     	 gene_id=Gstrdup(ID);
749 	 gene_name=getAttrValue("gene_name");
750 	 if (gene_name==NULL) {
751 		 gene_name=getAttrValue("gene_sym");
752 		 if (gene_name==NULL) {
753 			 gene_name=getAttrValue("gene");
754 			 if (gene_name==NULL)
755 				 gene_name=getAttrValue("genesymbol");
756 		 }
757 	 }
758 	 //*** IMPORTANT: prepare GTF for easy parseAttr by adding '=' character after the attribute name
759 	 //          for ALL attributes
760 	 p=info;
761 	 bool noed=true; //not edited after the last delim
762 	 bool nsp=false; //non-space found after last delim
763 	 while (*p!=0) {
764 		 if (*p==' ') {
765 			 if (nsp && noed) {
766 				 *p='=';
767 				 noed=false;
768 				 p++;
769 				 continue;
770 			 }
771 		 }
772 		 else nsp=true; //non-space
773 		 if (*p==';') { noed=true; nsp=false; }
774 		 p++;
775 	 }
776 	 //-- GTF prepare parents[] if Parent found
777 	 if (Parent!=NULL) { //GTF transcript_id found as a parent
778 		 _parents=Parent;
779 		 num_parents=1;
780 		 _parents_len=strlen(Parent)+1;
781 		 GMALLOC(parents, sizeof(char*));
782 		 parents[0]=_parents;
783 	 }
784  } //GTF
785 
786 
787  if (ID==NULL && parents==NULL) {
788 	 if (gffWarnings)
789 		 GMessage("Warning: discarding unrecognized feature (no ID or Parent):\n%s\n",dupline);
790 	 return; //skip
791  }
792  skipLine=false;
793 }
794 
795 //FIXME - this should only be used AFTER finalize() was called, and must have cdss=NULL of course
setCDS(uint cd_start,uint cd_end,char phase)796 void GffObj::setCDS(uint cd_start, uint cd_end, char phase) {
797   if (cd_start<this->start) {
798 	  GMessage("Warning: setCDS() called for %s with an out of bounds CDS start %d!\n",
799 			  gffID, cd_start);
800 	  return;
801   }
802   if (cd_end>this->end) {
803 	  GMessage("Warning: setCDS() called for %s with an out of bounds CDS end %d!\n",
804 			  gffID, cd_end);
805 	  return;
806   }
807   this->CDstart=cd_start;
808   this->CDend=cd_end;
809   this->CDphase=phase;
810   isTranscript(true);
811   subftype_id=gff_fid_exon;
812   if (monoFeature()) {
813      if (exons.Count()==0) addExon(this->start, this->end, exgffExon);
814             else exons[0]->exontype=exgffExon;
815   }
816 }
817 
setCDS(GffObj * t)818 void GffObj::setCDS(GffObj* t) {
819 	//copy the cdss as well
820 	uint cd_start=t->CDstart;
821 	uint cd_end=t->CDend;
822 	uint phase=t->CDphase;
823 	if (cd_start<this->start) {
824 		  GMessage("Warning: setCDS() called for %s with an out of bounds CDS start %d!\n",
825 				  gffID, cd_start);
826 		  return;
827 	}
828 	if (cd_end>this->end) {
829 		  GMessage("Warning: setCDS() called for %s with an out of bounds CDS end %d!\n",
830 				  gffID, cd_end);
831 		  return;
832 	}
833 	this->CDstart=cd_start;
834 	this->CDend=cd_end;
835 	this->CDphase=phase;
836 	isTranscript(true);
837 	subftype_id=gff_fid_exon;
838 	if (monoFeature()) {
839 	   if (exons.Count()==0) addExon(this->start, this->end, exgffExon);
840 	          else exons[0]->exontype=exgffExon;
841 	}
842 	if (t->cdss!=NULL) {
843        if (this->cdss!=NULL) delete cdss;
844        cdss=new GList<GffExon>(true, true, false);
845        for (int i=0;i<t->cdss->Count();i++) {
846     	   cdss->Add(new GffExon(*(t->cdss->Get(i))));
847        }
848 	}
849 }
850 
readExon(GffReader & reader,GffLine & gl)851 int GffObj::readExon(GffReader& reader, GffLine& gl) {
852   // -- this should only be called before ::finalize()!
853   //should make sure to get the right subftype_id!
854   if (!isTranscript() && gl.exontype>exgffNone) {
855      //subfeature recognized as exon-like, so this should be considered a transcript!
856      isTranscript(true);
857   }
858   if (isTranscript()) {
859      if (subftype_id<0) {//exon_ftype_id=gff_fid_exon;
860           if (gl.exontype>exgffNone) subftype_id=gff_fid_exon;
861                          else subftype_id=names->feats.addName(gl.ftype);
862      }
863      //any recognized exon-like segment gets the generic "exon" type (also applies to CDS)
864      if (gl.exontype==exgffNone && !gl.is_transcript) {
865           //extraneous mRNA feature, discard
866           if (reader.gff_warns)
867             GMessage("Warning: discarding unrecognized transcript subfeature '%s' of %s\n",
868                 gl.ftype, gffID);
869           return -1;
870           }
871   }
872   else { //non-mRNA parent feature, check this subf type
873     int subf_id=names->feats.addName(gl.ftype);
874     if (subftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. first exon being added)
875        subftype_id=subf_id;
876     else {
877        if (subftype_id!=subf_id) {
878          if (subftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) {
879             //the existing exon was just a dummy one created by default, discard it?
880             exons.Clear();
881             covlen=0;
882             subftype_id=subf_id; //allow the new subfeature to completely takeover
883             }
884          else { //multiple subfeatures, prefer those exon-like
885              if (reader.gff_warns)
886                GMessage("Warning: multiple subfeatures (%s and %s) found for %s, discarding ",
887                   names->feats.getName(subf_id), names->feats.getName(subftype_id),gffID);
888             if (gl.exontype>exgffNone) { //new feature is an exon, discard previously parsed subfeatures
889                if (reader.gff_warns) GMessage("%s.\n", names->feats.getName(subftype_id));
890                subftype_id=subf_id;
891                exons.Clear();
892                covlen=0;
893             }
894               else { //discard new feature
895                if (reader.gff_warns) GMessage("Warning: skipping subfeature %s.\n", names->feats.getName(subf_id));
896                return -1; //skip this 2nd subfeature type for this parent!
897             }
898          }
899        } //incoming subfeature is of different type
900     } //new subfeature type
901   } //non-mRNA parent
902   int eidx=-1;
903   GList<GffExon>* segs=NULL; //either cds or &exons
904   if (gl.is_cds) {
905      if (cdss==NULL)
906        cdss=new GList<GffExon>(true, true, false);
907      segs=cdss;
908   } else {
909      segs=&exons;
910   }
911   eidx=addExon(*segs, gl);
912   if (eidx<0) {
913 	  GMessage("Warning: addExon() failed for GFF line:\n%s\n",gl.dupline);
914 	  return eidx; //this should never happen!
915   }
916   if (reader.keep_Attrs) {
917      if (reader.noExonAttrs) {
918            parseAttrs(attrs, gl.info, true);
919      }
920      else { //need all exon-level attributes
921          parseAttrs((*segs)[eidx]->attrs, gl.info, true, gl.is_cds);
922      }
923   }
924   return eidx;
925 }
926 
addExon(GList<GffExon> & segs,GffLine & gl,int8_t exontype_override)927 int GffObj::addExon(GList<GffExon>& segs, GffLine& gl, int8_t exontype_override) {
928 	int ex_type=(exontype_override!=exgffNone) ? exontype_override : gl.exontype;
929 	GffScore exon_score(gl.score, gl.score_decimals);
930 	int eidx=addExon(gl.fstart, gl.fend, ex_type, gl.phase, exon_score, &segs);
931 	if (&segs==cdss && isGene() && gl.ID!=NULL && eidx>=0) {
932      //special NCBI cases where CDS can be treated as discontiguous features, grouped by their ID
933 	 //-- used for genes with X_gene_segment features
934 	 //char* cds_id=Gstrdup(gl.ID);
935 	 //segs[eidx]->uptr=cds_id;
936 	 segs[eidx]->uptr=gl.ID;
937 	 gl.ID=NULL;
938 	}
939 	return eidx;
940 }
941 
exonOverlapIdx(GList<GffExon> & segs,uint s,uint e,int * ovlen,int start_idx)942 int GffObj::exonOverlapIdx(GList<GffExon>& segs, uint s, uint e, int* ovlen, int start_idx) {
943 	//return the exons' index for the overlapping OR ADJACENT exon
944 	//ovlen, if given, will return the overlap length
945 	//if (s>e) Gswap(s,e);
946 	for (int i=start_idx;i<segs.Count();i++) {
947 		if (segs[i]->start>e+1) break;
948 		if (s-1>segs[i]->end) continue;
949 		//-- overlap/adjacent if we are here:
950 		if (ovlen!=NULL) {
951 			int ovlend= (segs[i]->end>e) ? e : segs[i]->end;
952 			*ovlen= ovlend - ((s>segs[i]->start)? s : segs[i]->start)+1;
953 		}
954 		return i;
955 	} //for each exon
956 	*ovlen=0;
957 	return -1;
958 }
959 
transferCDS(GffExon * cds)960 void GffObj::transferCDS(GffExon* cds) {
961 	//direct adding of a cds to the cdss pointer, without checking
962 	 if (cdss==NULL) cdss=new GList<GffExon>(true, true, false);
963 	 cdss->Add(cds); //now the caller must forget this exon!
964 	 if (CDstart==0 || CDstart>cds->start) CDstart=cds->start;
965 }
966 
addExon(uint segstart,uint segend,int8_t exontype,char phase,GffScore exon_score,GList<GffExon> * segs)967 int GffObj::addExon(uint segstart, uint segend, int8_t exontype, char phase, GffScore exon_score, GList<GffExon>* segs) {
968    if (segstart>segend) { Gswap(segstart, segend); }
969    if (segs==NULL) segs=&exons;
970 	if (exontype!=exgffNone) { //check for overlaps between exon/CDS-type segments
971 		//addExonSegment(gl.fstart, gl.fend, gl.score, gl.phase, gl.is_cds, exontype_override);
972 		int ovlen=0;
973 		int oi=-1;
974 	    while ((oi=exonOverlapIdx(*segs, segstart, segend, &ovlen, oi+1))>=0) {
975 	        //note: ovlen==0 for adjacent segments
976 		    if ((*segs)[oi]->exontype>exgffNone &&
977 		    		(*segs)[oi]->start<=segstart && (*segs)[oi]->end>=segend) {
978 		    		//existing feature contains this segment, so we do NOT need to add it
979 		    	    //-- unless its the annoying NCBI exception: gene with multiple alternate
980 		    	    //        _gene_segment CDS features!
981 		    		if (!(this->isGene() && exontype==exgffCDS &&
982 		    				(*segs)[oi]->exontype==exgffCDS ))
983 		    			return oi;
984 		    }
985 		    if (ovlen==0 || !(exontype==exgffCDS && (*segs)[oi]->exontype==exgffCDS)) {
986 		    	//always merge adjacent features
987 		    	//but NEVER merge two overlapping CDS (CDS programmed ribosomal shift aware)
988 		    	int8_t segtype=((*segs)[oi]->exontype==exgffCDS || exontype==exgffCDS) ? exgffCDS : exgffExon;
989 		    	//if expanded upward, may overlap the segment(s) above
990 		    	expandSegment(*segs, oi, segstart, segend, segtype);
991 		    	return oi;
992 		    }
993 	    }
994 	} //exon overlap/adjacent check
995    //new exon/CDS, not merged in a previous one
996    GffExon* enew=new GffExon(segstart, segend, exontype, phase, exon_score.score, exon_score.precision);
997    int eidx=segs->Add(enew);
998    if (eidx<0) {
999     //this would actually be possible if the object is a "Gene" and "exons" are in fact isoforms
1000      delete enew;
1001      hasErrors(true);
1002      return -1;
1003    }
1004    if (start>segs->First()->start) start=segs->First()->start;
1005    if (end<segs->Last()->end) end=segs->Last()->end;
1006    if (isFinalized() && segs==&exons) {
1007 	   covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1;
1008    }
1009    return eidx;
1010 }
1011 
expandSegment(GList<GffExon> & segs,int oi,uint segstart,uint segend,int8_t exontype)1012 void GffObj::expandSegment(GList<GffExon>& segs, int oi, uint segstart, uint segend, int8_t exontype) {
1013   //oi is the index of the *first* overlapping segment found that must be enlarged
1014   covlen-=segs[oi]->len();
1015   if (segstart<segs[oi]->start)
1016 	  segs[oi]->start=segstart;
1017   //if (qs && qs<exons[oi]->qstart) exons[oi]->qstart=qs;
1018   if (segend>segs[oi]->end)
1019 	  segs[oi]->end=segend;
1020   //if (qe && qe>exons[oi]->qend) exons[oi]->qend=qe;
1021   //warning: score cannot be properly adjusted! e.g. if it's a p-value it's just going to get worse
1022   //if (sc!=0) segs[oi]->score=sc;
1023   //covlen+=exons[oi]->len();
1024   //if (exons[oi]->exontype< exontype) -- always true
1025   segs[oi]->exontype = exontype;
1026   //if (exontype==exgffCDS) exons[oi]->phase=fr;
1027   //we must check if any more exons are also overlapping this
1028   int ni=oi+1; //next exon index after oi
1029   while (ni<segs.Count() && segs[ni]->start<=segend+1) { // next segment overlaps OR adjacent to newly enlarged segment
1030 	  if (segs[ni]->exontype>0 &&
1031 		(segs[ni]->start==segend+1 || segs[ni]->exontype!=exgffCDS || exontype!=exgffCDS)) {
1032          if (segs[ni]->start<segs[oi]->start) {
1033         	 segs[oi]->start=segs[ni]->start;
1034         	 if (strand=='+') segs[oi]->phase=segs[ni]->phase;
1035          }
1036          if (segs[ni]->end>segs[oi]->end) {
1037         	 segs[oi]->end=segs[ni]->end;
1038         	 if (strand=='-') segs[oi]->phase=segs[ni]->phase;
1039          }
1040 
1041          segs.Delete(ni);
1042       } else ++ni;
1043   } //until no more overlapping/adjacent segments found
1044   // -- make sure any other related boundaries are updated:
1045   if (isFinalized()) {
1046 	  if (&segs==&exons) {
1047 		start=exons.First()->start;
1048 		end=exons.Last()->end;
1049 		//recalculate covlen
1050 		covlen=0;
1051 		for (int i=0;i<exons.Count();++i) covlen+=exons[i]->len();
1052 	  }
1053   }
1054   else {
1055 	  if (start>segs.First()->start) start=segs.First()->start;
1056 	  if (end<segs.Last()->end) end=segs.Last()->end;
1057   }
1058 }
1059 
removeExon(int idx)1060 void GffObj::removeExon(int idx) {
1061   if (idx<0 || idx>=exons.Count()) return;
1062   int segstart=exons[idx]->start;
1063   int segend=exons[idx]->end;
1064   exons.Delete(idx);
1065   if (isFinalized()) {
1066     covlen -= (int)(segend-segstart)+1;
1067     start=exons.First()->start;
1068     end=exons.Last()->end;
1069     if (isCDSOnly()) { CDstart=start; CDend=end; }
1070   }
1071 }
1072 
removeExon(GffExon * p)1073 void GffObj::removeExon(GffExon* p) {
1074 	for (int idx=0;idx<exons.Count();idx++) {
1075 		if (exons[idx]==p) {
1076 			int segstart=exons[idx]->start;
1077 			int segend=exons[idx]->end;
1078 			exons.Delete(idx);
1079 			covlen -= (int)(segend-segstart)+1;
1080 
1081 			if (exons.Count() > 0) {
1082 				start=exons.First()->start;
1083 				end=exons.Last()->end;
1084 				if (isCDSOnly()) { CDstart=start; CDend=end; }
1085 			}
1086 			return;
1087 		}
1088 	}
1089 }
1090 
GffObj(GffReader & gfrd,BEDLine & bedline)1091 GffObj::GffObj(GffReader& gfrd, BEDLine& bedline):GSeg(0,0),
1092 		exons(true,true,false), cdss(NULL), gscore() {
1093 	uptr=NULL;
1094 	ulink=NULL;
1095 	parent=NULL;
1096 	udata=0;
1097 	flags=0;
1098 	CDstart=0;
1099 	CDend=0;
1100 	CDphase=0;
1101 	attrs=NULL;
1102 	gffID=NULL;
1103 	track_id=-1;
1104 	gseq_id=-1;
1105 	//ftype_id=-1;
1106 	//subftype_id=-1;
1107 	strand='.';
1108 	gffnames_ref(names);
1109 	//qlen=0;qstart=0;qend=0;
1110 	covlen=0;
1111 	geneID=NULL;
1112 	gene_name=NULL;
1113 	ftype_id=gff_fid_transcript;
1114 	subftype_id=gff_fid_exon;
1115 	start=bedline.fstart;
1116 	end=bedline.fend;
1117 	gseq_id=names->gseqs.addName(bedline.gseqname);
1118 	track_id=names->tracks.addName("BED");
1119 	strand=bedline.strand;
1120 	//setup flags from gffline
1121 	isGene(false);
1122 	isTranscript(true);
1123 	gffID=Gstrdup(bedline.ID);
1124 	for (int i=0;i<bedline.exons.Count();++i) {
1125 		int eidx=this->addExon(bedline.exons[i].start, bedline.exons[i].end, exgffExon);
1126 	    if (eidx<0 && gfrd.showWarnings())
1127 	       GMessage("Warning: failed adding segment %d-%d for %s (discarded)!\n",
1128 	    		   bedline.exons[i].start, bedline.exons[i].end, gffID);
1129 
1130 	}
1131 	if (bedline.cds_start>0) {
1132 		CDstart=bedline.cds_start;
1133 		CDend=bedline.cds_end;
1134 		if (CDstart>0 && bedline.cds_phase)
1135 			CDphase=bedline.cds_phase;
1136 	}
1137 	if (gfrd.keep_Attrs && bedline.info!=NULL) this->parseAttrs(attrs, bedline.info);
1138 }
1139 
GffObj(GffReader & gfrd,GffLine & gffline)1140 GffObj::GffObj(GffReader &gfrd, GffLine& gffline):
1141      GSeg(0,0), exons(true,true,false), cdss(NULL), children(1,false), gscore() {
1142   uptr=NULL;
1143   ulink=NULL;
1144   parent=NULL;
1145   udata=0;
1146   flags=0;
1147   CDstart=0;
1148   CDend=0;
1149   CDphase=0;
1150   geneID=NULL;
1151   gene_name=NULL;
1152   attrs=NULL;
1153   gffID=NULL;
1154   track_id=-1;
1155   gseq_id=-1;
1156   //ftype_id=-1;
1157   subftype_id=-1;
1158   strand='.';
1159   gffnames_ref(names);
1160   //qlen=0;qstart=0;qend=0;
1161   covlen=0;
1162   ftype_id=gffline.ftype_id;
1163   start=gffline.fstart;
1164   end=gffline.fend;
1165   gseq_id=names->gseqs.addName(gffline.gseqname);
1166   track_id=names->tracks.addName(gffline.track);
1167   strand=gffline.strand;
1168   /*
1169   qcov=0;
1170   qlen=gffline.qlen;
1171   qstart=gffline.qstart;
1172   qend=gffline.qend;
1173   */
1174   //setup flags from gffline
1175   isCDSOnly(gffline.is_cds); //for now
1176   isGene(gffline.is_gene);
1177   isTranscript(gffline.is_transcript || gffline.exontype!=exgffNone);
1178   //fromGff3(gffline.is_gff3);
1179   isGeneSegment(gffline.is_gene_segment);
1180   if (gffline.parents!=NULL && !gffline.is_transcript) {
1181     //GTF style -- create a GffObj directly by subfeature
1182     //(also possible orphan GFF3 exon line, or an exon given before its parent (chado))
1183     if (gffline.exontype!=exgffNone) { //recognized exon-like feature
1184        ftype_id=gff_fid_transcript; //so this is some sort of transcript
1185        subftype_id=gff_fid_exon; //subfeatures MUST be exons
1186        //typical GTF2 without "transcript" line
1187        // or malformed GFF3 with orphan or premature exon feature (found before the transcript line)
1188        gffID=Gstrdup(gffline.parents[0]);
1189        this->createdByExon(true);
1190        if (gfrd.is_gff3 && gfrd.showWarnings())
1191      	   GMessage("Warning: exon feature found before transcript ID %s\n",gffID);
1192        //this is the first exon/segment of the transcript
1193        readExon(gfrd, gffline);
1194     }
1195     else {//unrecognized (non-exon) subfeature
1196        //make this GffObj of the same feature type
1197        ftype_id=names->feats.addName(gffline.ftype);
1198        if (gffline.ID!=NULL) { //unrecognized non-exon feature ? use the ID instead
1199             this->hasGffID(true);
1200             gffID=Gstrdup(gffline.ID);
1201             if (gfrd.keep_Attrs) this->parseAttrs(attrs, gffline.info);
1202        }
1203        else { //no ID, just Parent
1204            GMessage("Warning: unrecognized parented feature without ID found before its parent:\n%s\n", gffline.dupline);
1205            gffID=Gstrdup(gffline.parents[0]);
1206            this->createdByExon(true);
1207            readExon(gfrd, gffline);
1208        }
1209     } //unrecognized (non-exon) feature
1210   } //non-transcript parented subfeature given directly
1211   else {
1212     //non-parented feature OR a recognizable transcript
1213     //create a parent feature in its own right
1214     gscore.score=gffline.score;
1215     gscore.precision=gffline.score_decimals;
1216     if (gffline.ID==NULL || gffline.ID[0]==0)
1217       GError("Error: no valid ID found for GFF record\n");
1218     this->hasGffID(true);
1219     gffID=Gstrdup(gffline.ID); //there must be an ID here
1220     //if (gffline.is_transcript) ftype_id=gff_fid_mRNA;
1221       //else
1222     if (gffline.is_transcript) {
1223       subftype_id=gff_fid_exon;
1224       if (ftype_id<0)
1225         ftype_id=names->feats.addName(gffline.ftype);
1226       if (gfrd.is_gff3) {
1227 		  if (gffline.exons.Count()>0) {
1228 			  //for compact GFF-like transcript line format (TLF), exons were already found as attributes
1229 				for (int i=0;i<gffline.exons.Count();++i) {
1230 					int eidx=this->addExon(gffline.exons[i].start, gffline.exons[i].end, exgffExon, '.', gscore);
1231 				    if (eidx<0 && gfrd.showWarnings())
1232 				       GMessage("Warning: failed adding exon %d-%d for %s (discarded)!\n",
1233 				    		   gffline.exons[i].start, gffline.exons[i].end, gffID);
1234 
1235 				}
1236 		  }
1237 		  if (gffline.cds_start>0) {
1238 				CDstart=gffline.cds_start;
1239 				CDend=gffline.cds_end;
1240 		  }
1241 		  if (gffline.phase!=0) CDphase=gffline.phase;
1242 		  if (gffline.cdss.Count()>0) {
1243 			    //for compact GFF-like transcript line format (TLF), CDS might be already found as attributes
1244 			    if (cdss==NULL) cdss=new GList<GffExon>(true, true, false);
1245 				for (int i=0;i<gffline.cdss.Count();++i) {
1246 					int eidx=this->addExon(gffline.cdss[i].start, gffline.cdss[i].end, exgffCDS, 0, GFFSCORE_NONE, cdss);
1247 				    if (eidx<0 && gfrd.showWarnings())
1248 				       GMessage("Warning: failed adding CDS segment %d-%d for %s (discarded)!\n",
1249 				    		   gffline.cdss[i].start, gffline.cdss[i].end, gffID);
1250 
1251 				}
1252 		  }
1253       }
1254     } //is_transcript
1255     if (gfrd.keep_Attrs) this->parseAttrs(attrs, gffline.info);
1256     if (gfrd.is_gff3 && gffline.parents==NULL && gffline.exontype!=exgffNone) {
1257        //special case with bacterial genes just given as a CDS/exon, without parent!
1258        this->createdByExon(true);
1259        if (ftype_id<0) ftype_id=gff_fid_mRNA;
1260        readExon(gfrd, gffline);
1261     }
1262     if (ftype_id<0)
1263         ftype_id=names->feats.addName(gffline.ftype);
1264   }//no parent OR recognizable transcript
1265 
1266   if (gffline.gene_name!=NULL) {
1267      gene_name=Gstrdup(gffline.gene_name);
1268      }
1269   if (gffline.gene_id) { //only for gene features or GTF2 gene_id attribute
1270      geneID=Gstrdup(gffline.gene_id);
1271   }
1272   /*//we cannot assume parents[0] is a gene! for NCBI miRNA, parent can be a primary_transcript feature!
1273   else if (gffline.is_transcript && gffline.parents!=NULL) {
1274 	 geneID=Gstrdup(gffline.parents[0]);
1275   }
1276   */
1277 }
1278 
nextBEDLine()1279 BEDLine* GffReader::nextBEDLine() {
1280  if (bedline!=NULL) return bedline; //caller should free gffline after processing
1281  while (bedline==NULL) {
1282 	int llen=0;
1283 	buflen=GFF_LINELEN-1;
1284 	char* l=fgetline(linebuf, buflen, fh, &fpos, &llen);
1285 	if (l==NULL) return NULL;
1286 	int ns=0; //first nonspace position
1287 	while (l[ns]!=0 && isspace(l[ns])) ns++;
1288 	if (l[ns]=='#' || llen<7) continue;
1289 	bedline=new BEDLine(this, l);
1290 	if (bedline->skip) {
1291 	  delete bedline;
1292 	  bedline=NULL;
1293 	  continue;
1294 	}
1295  }
1296  return bedline;
1297 }
1298 
1299 
nextGffLine()1300 GffLine* GffReader::nextGffLine() {
1301  if (gffline!=NULL) return gffline; //caller should free gffline after processing
1302  while (gffline==NULL) {
1303     int llen=0;
1304     buflen=GFF_LINELEN-1;
1305     char* l=fgetline(linebuf, buflen, fh, &fpos, &llen);
1306     if (l==NULL) {
1307          return NULL; //end of file
1308          }
1309 #ifdef CUFFLINKS
1310      _crc_result.process_bytes( linebuf, llen );
1311 #endif
1312     int ns=0; //first nonspace position
1313     bool commentLine=false;
1314     while (l[ns]!=0 && isspace(l[ns])) ns++;
1315     if (l[ns]=='#') {
1316     	commentLine=true;
1317     	if (llen<10) {
1318     		if (commentParser!=NULL) (*commentParser)(l, &gflst);
1319     		continue;
1320     	}
1321     }
1322     gffline=new GffLine(this, l);
1323     if (gffline->skipLine) {
1324        if (commentLine && commentParser!=NULL) (*commentParser)(gffline->dupline, &gflst);
1325        delete gffline;
1326        gffline=NULL;
1327        continue;
1328     }
1329     if (gffline->ID==NULL && gffline->parents==NULL)  { //it must have an ID
1330         //this might not be needed, already checked in the GffLine constructor
1331         if (gff_warns)
1332             GMessage("Warning: malformed GFF line, no parent or record Id (kipping\n");
1333         delete gffline;
1334         gffline=NULL;
1335         //continue;
1336         }
1337     }
1338 return gffline;
1339 }
1340 
1341 
gfoBuildId(const char * id,const char * ctg)1342 char* GffReader::gfoBuildId(const char* id, const char* ctg) {
1343 //caller must free the returned pointer
1344  char* buf=NULL;
1345  int idlen=strlen(id);
1346  GMALLOC(buf, idlen+strlen(ctg)+2);
1347  strcpy(buf, id);
1348  buf[idlen]='~';
1349  strcpy(buf+idlen+1, ctg);
1350  return buf;
1351 }
1352 
gfoAdd(GffObj * gfo)1353 GffObj* GffReader::gfoAdd(GffObj* gfo) {
1354  GPVec<GffObj>* glst=phash.Find(gfo->gffID);
1355  if (glst==NULL)
1356 	 glst=new GPVec<GffObj>(false);
1357  int i=glst->Add(gfo);
1358  phash.Add(gfo->gffID, glst);
1359  return glst->Get(i);
1360 }
1361 
gfoAdd(GPVec<GffObj> & glst,GffObj * gfo)1362 GffObj* GffReader::gfoAdd(GPVec<GffObj>& glst, GffObj* gfo) {
1363  int i=glst.Add(gfo);
1364  return glst[i];
1365 }
1366 
gfoReplace(GPVec<GffObj> & glst,GffObj * gfo,GffObj * toreplace)1367 GffObj* GffReader::gfoReplace(GPVec<GffObj>& glst, GffObj* gfo, GffObj* toreplace) {
1368  for (int i=0;i<glst.Count();++i) {
1369 	 if (glst[i]==toreplace) {
1370 		 //glst.Put(i,gfo);
1371 		 glst[i]=gfo;
1372 		 break;
1373 	 }
1374  }
1375  return gfo;
1376 }
1377 
pFind(const char * id,GPVec<GffObj> * & glst)1378 bool GffReader::pFind(const char* id, GPVec<GffObj>*& glst) {
1379 	glst = phash.Find(id);
1380 	return (glst!=NULL);
1381 }
1382 
gfoFind(const char * id,GPVec<GffObj> * & glst,const char * ctg,char strand,uint start,uint end)1383 GffObj* GffReader::gfoFind(const char* id, GPVec<GffObj>*& glst,
1384 		const char* ctg, char strand, uint start, uint end) {
1385 	GPVec<GffObj>* gl=NULL;
1386 	if (glst) {
1387 		gl=glst;
1388 	} else {
1389 		gl = phash.Find(id);
1390 	}
1391 	GffObj* gh=NULL;
1392 	if (gl) {
1393 		for (int i=0;i<gl->Count();i++) {
1394 			GffObj& gfo = *(gl->Get(i));
1395 			if (ctg!=NULL && strcmp(ctg, gfo.getGSeqName())!=0)
1396 				continue;
1397 			if (strand && gfo.strand!='.' && strand != gfo.strand)
1398 				continue;
1399 			if (start>0) {
1400 				if (abs((int)start-(int)gfo.start)> (int)GFF_MAX_LOCUS)
1401 					continue;
1402 				if (end>0 && (gfo.start>end || gfo.end<start))
1403 					continue;
1404 			}
1405 			//must be the same transcript, according to given comparison criteria
1406 			gh=&gfo;
1407 			break;
1408 		}
1409 	}
1410 	if (!glst) glst=gl;
1411 	return gh;
1412 }
1413 
updateParent(GffObj * newgfo,GffObj * parent)1414 GffObj* GffReader::updateParent(GffObj* newgfo, GffObj* parent) {
1415   //assert(parent);
1416   //assert(newgfo);
1417   parent->children.Add(newgfo);
1418   if (newgfo->parent==NULL) newgfo->parent=parent;
1419   newgfo->setLevel(parent->getLevel()+1);
1420   //if (parent->isGene()) {
1421   if (parent->gene_name!=NULL && newgfo->gene_name==NULL)
1422       newgfo->gene_name=Gstrdup(parent->gene_name);
1423   if (parent->geneID!=NULL && newgfo->geneID==NULL)
1424       newgfo->geneID=Gstrdup(parent->geneID);
1425   //}
1426 
1427   return newgfo;
1428 }
1429 
newGffRec(GffLine * gffline,GffObj * parent,GffExon * pexon,GPVec<GffObj> * glst,bool replace_parent)1430 GffObj* GffReader::newGffRec(GffLine* gffline, GffObj* parent, GffExon* pexon, GPVec<GffObj>* glst, bool replace_parent) {
1431   GffObj* newgfo=new GffObj(*this, *gffline);
1432   GffObj* r=NULL;
1433   gflst.Add(newgfo);
1434   //tag non-transcripts to be discarded later
1435   if (this->transcripts_Only && this->is_gff3 && gffline->ID!=NULL &&
1436 		  gffline->exontype==exgffNone && !gffline->is_gene && !gffline->is_transcript) {
1437 	  //unrecognized non-exon entity, should be discarded
1438 	  newgfo->isDiscarded(true);
1439 	  this->discarded_ids.Add(gffline->ID, new int(1));
1440   }
1441   if (replace_parent && glst) {
1442 	r=gfoReplace(*glst, newgfo, parent);
1443 	updateParent(r, parent);
1444   } else { //regular case of new GffObj creation
1445 	  r=(glst) ? gfoAdd(*glst, newgfo) : gfoAdd(newgfo);
1446 	  if (parent!=NULL) {
1447 		updateParent(r, parent);
1448 		if (pexon!=NULL) parent->removeExon(pexon);
1449 	  }
1450   }
1451   return r;
1452 }
1453 
newGffRec(BEDLine * bedline,GPVec<GffObj> * glst)1454 GffObj* GffReader::newGffRec(BEDLine* bedline, GPVec<GffObj>* glst) {
1455   GffObj* newgfo=new GffObj(*this, *bedline);
1456   GffObj* r=NULL;
1457   gflst.Add(newgfo);
1458   r=(glst) ? gfoAdd(*glst, newgfo) : gfoAdd(newgfo);
1459   return r;
1460 }
1461 
updateGffRec(GffObj * prevgfo,GffLine * gffline)1462 GffObj* GffReader::updateGffRec(GffObj* prevgfo, GffLine* gffline) {
1463  if (prevgfo==NULL) return NULL;
1464  //prevgfo->gffobj->createdByExon(false);
1465  if (gffline->ftype_id>=0)
1466 	 prevgfo->ftype_id=gffline->ftype_id;
1467  else
1468 	 prevgfo->ftype_id=prevgfo->names->feats.addName(gffline->ftype);
1469  prevgfo->start=gffline->fstart;
1470  prevgfo->end=gffline->fend;
1471  prevgfo->isGene(gffline->is_gene);
1472  prevgfo->isTranscript(gffline->is_transcript || gffline->exontype!=exgffNone);
1473  prevgfo->hasGffID(gffline->ID!=NULL);
1474  if (keep_Attrs) {
1475    if (prevgfo->attrs!=NULL) prevgfo->attrs->Clear();
1476    prevgfo->parseAttrs(prevgfo->attrs, gffline->info);
1477    }
1478  return prevgfo;
1479 }
1480 
1481 
readExonFeature(GffObj * prevgfo,GffLine * gffline,GHash<CNonExon> * pex)1482 bool GffReader::readExonFeature(GffObj* prevgfo, GffLine* gffline, GHash<CNonExon>* pex) {
1483 	//this should only be called before prevgfo->finalize()!
1484 	bool r=true;
1485 	if (gffline->strand!=prevgfo->strand) {
1486 		if (prevgfo->strand=='.') {
1487 			prevgfo->strand=gffline->strand;
1488 		}
1489 		else {
1490 			GMessage("Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n",
1491 					prevgfo->gffID, prevgfo->strand, gffline->fstart, gffline->fend, gffline->strand,
1492 					prevgfo->getGSeqName());
1493 			return true;
1494 		}
1495 	}
1496 	int gdist=(gffline->fstart>prevgfo->end) ? gffline->fstart-prevgfo->end :
1497 			((gffline->fend<prevgfo->start)? prevgfo->start-gffline->fend :
1498 					0 );
1499 	if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID
1500 		GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffID);
1501 		//validation_errors = true;
1502 		r=false;
1503 		if (!gff_warns) exit(1);
1504 	}
1505 	int eidx=prevgfo->readExon(*this, *gffline);
1506 	if (pex!=NULL && eidx>=0) {
1507 		//if (eidx==0 && gffline->exontype>0) prevgfo->isTranscript(true);
1508 		if (gffline->ID!=NULL && gffline->exontype==exgffNone)
1509 		   subfPoolAdd(*pex, prevgfo);
1510 	}
1511 	return r;
1512 }
1513 
subfPoolCheck(GffLine * gffline,GHash<CNonExon> & pex,char * & subp_name)1514 CNonExon* GffReader::subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name) {
1515   CNonExon* subp=NULL;
1516   subp_name=NULL;
1517   for (int i=0;i<gffline->num_parents;i++) {
1518     if (transcripts_Only && discarded_ids.Find(gffline->parents[i])!=NULL)
1519         continue;
1520     subp_name=gfoBuildId(gffline->parents[i], gffline->gseqname); //e.g. mRNA name
1521     subp=pex.Find(subp_name);
1522     if (subp!=NULL)
1523        return subp;
1524     GFREE(subp_name);
1525     }
1526   return NULL;
1527 }
1528 
subfPoolAdd(GHash<CNonExon> & pex,GffObj * newgfo)1529 void GffReader::subfPoolAdd(GHash<CNonExon>& pex, GffObj* newgfo) {
1530 //this might become a parent feature later
1531 if (newgfo->exons.Count()>0) {
1532    char* xbuf=gfoBuildId(gffline->ID, gffline->gseqname);
1533    pex.Add(xbuf, new CNonExon(newgfo, newgfo->exons[0], *gffline));
1534    GFREE(xbuf);
1535    }
1536 }
1537 
promoteFeature(CNonExon * subp,char * & subp_name,GHash<CNonExon> & pex)1538 GffObj* GffReader::promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex) {
1539   GffObj* prevp=subp->parent; //grandparent of gffline (e.g. gene)
1540   //if (prevp!=gflst[subp->idx])
1541   //  GError("Error promoting subfeature %s, gflst index mismatch?!\n", subp->gffline->ID);
1542   subp->gffline->discardParent();
1543   GffObj* gfoh=newGffRec(subp->gffline, prevp, subp->exon);
1544   pex.Remove(subp_name); //no longer a potential parent, moved it to phash already
1545   prevp->promotedChildren(true);
1546   return gfoh; //returns the holder of newly promoted feature
1547 }
1548 
1549 //In the rare cases where the GFF/GTF stream is properly formatted
1550 // i.e. when all sub-features are grouped with (and preceded by) their parent!
readNext()1551 GffObj* GffReader::readNext() { //user must free the returned GffObj*
1552  GffObj* gfo=NULL;
1553  //GSeg tseg(0,0); //transcript boundaries
1554  char* lastID=NULL;
1555  if (is_BED) {
1556 	 if (nextBEDLine()) {
1557 		 gfo=new GffObj(*this, *bedline);
1558 		 //tseg.start=gfo->start;
1559 		 //tseg.end=gfo->end;
1560 		 delete bedline;
1561 		 bedline=NULL;
1562 	 } //else return NULL;
1563  } else { //GFF parsing
1564     while (nextGffLine()!=NULL) {
1565     	char* tid=gffline->ID;
1566     	if (gffline->is_exon) tid=gffline->parents[0];
1567     	else //not an exon
1568     		if (!(gffline->is_transcript || gffline->is_gene))
1569     			tid=NULL; //WARNING: only parsing transcript && gene records here
1570     	//if (tid==NULL || gffline->num_parents>1) {
1571     	if (tid==NULL) { //not a suitable transcript ID found, skip this line
1572     		delete gffline;
1573     		gffline=NULL;
1574     		continue;
1575     	}
1576     	bool sameID=(lastID!=NULL && strcmp(lastID, tid)==0);
1577     	if (sameID) {
1578     		if (gfo==NULL) GError("Error: same transcript ID but GffObj not initialized?!(%s)\n", tid);
1579     		//TODO: if gffline->is_transcript: trans-splicing!
1580     		if (!gffline->is_exon) {
1581     			GMessage("Warning: skipping unexpected non-exon record with previously seen ID:\n%s\n", gffline->dupline);
1582     			delete gffline;
1583     			gffline=NULL;
1584     			continue;
1585     		}
1586     		readExonFeature(gfo, gffline); //also takes care of adding CDS segments
1587     	} else { //new transcript
1588     		if (gfo==NULL) {
1589     			//start gathering this transcript's data now
1590     			gfo=new GffObj(*this, *gffline);
1591     			//GFREE(lastID);
1592     			lastID=Gstrdup(tid);
1593     			/*if (gffline->is_transcript) {
1594     				tseg.start=gffline->fstart;
1595     				tseg.end=gffline->fend;
1596     			}*/
1597     		}
1598     		else {
1599     			//this gffline is for the next transcript!
1600     			//return what we've got so far
1601     			//return gfo;
1602     			break;
1603     		}
1604     	} //transcript ID change
1605     	//gffline processed, move on
1606 		delete gffline;
1607 		gffline=NULL;
1608     } //while nextgffline()
1609  } //GFF records
1610  GFREE(lastID);
1611  //gfo populated with all its sub-features (or eof reached)
1612  if (gfo!=NULL) {
1613 	gfo->finalize(this);
1614  }
1615  return gfo;
1616 }
1617 
1618 //Usually we have to parse the whole file because exons and other subfeatures can be scattered, unordered in the input
1619 // (thanks to the annoyingly loose GFF3 specs)
1620 //Trans-splicing and fusions shall only be accepted in proper GFF3 format, i.e. multiple transcript features
1621 //with the same ID but NOT overlapping/continuous
1622 //  *** BUT (exception): proximal xRNA features with the same ID, on the same strand, will be merged
1623 //  and the segments will be treated like exons (e.g. TRNAR15 (rna1940) in RefSeq)
readAll()1624 void GffReader::readAll() {
1625 	bool validation_errors = false;
1626 	if (is_BED) {
1627 		while (nextBEDLine()) {
1628 			GPVec<GffObj>* prevgflst=NULL;
1629 			GffObj* prevseen=gfoFind(bedline->ID, prevgflst, bedline->gseqname, bedline->strand, bedline->fstart);
1630 			if (prevseen) {
1631 			//duplicate ID -- but this could also be a discontinuous feature according to GFF3 specs
1632 			  //e.g. a trans-spliced transcript - but segments should not overlap
1633 				if (prevseen->overlap(bedline->fstart, bedline->fend)) {
1634 					//overlapping feature with same ID is going too far
1635 					GMessage("Error: overlapping duplicate BED feature (ID=%s)\n", bedline->ID);
1636 					//validation_errors = true;
1637 					if (gff_warns) { //validation intent: just skip the feature, allow the user to see other errors
1638 						delete bedline;
1639 						bedline=NULL;
1640 						continue;
1641 					}
1642 					else exit(1);
1643 				}
1644 				//create a separate entry (true discontinuous feature?)
1645 				prevseen=newGffRec(bedline, prevgflst);
1646 				if (gff_warns) {
1647 					GMessage("Warning: duplicate BED feature ID %s (%d-%d) (discontinuous feature?)\n",
1648 							bedline->ID, bedline->fstart, bedline->fend);
1649 				}
1650 			}
1651 			else {
1652 				newGffRec(bedline, prevgflst);
1653 			}
1654 			delete bedline;
1655 			bedline=NULL;
1656 		}
1657 	}
1658 	else { //regular GFF/GTF or perhaps TLF?
1659 		//loc_debug=false;
1660 		GHash<CNonExon> pex; //keep track of any parented (i.e. exon-like) features that have an ID
1661 		//and thus could become promoted to parent features
1662 		while (nextGffLine()!=NULL) {
1663 			GffObj* prevseen=NULL;
1664 			GPVec<GffObj>* prevgflst=NULL;
1665 			if (gffline->ID && gffline->exontype==exgffNone) {
1666 				//parent-like feature ID (mRNA, gene, etc.) not recognized as an exon feature
1667 				//check if this ID was previously seen on the same chromosome/strand within GFF_MAX_LOCUS distance
1668 				prevseen=gfoFind(gffline->ID, prevgflst, gffline->gseqname, gffline->strand, gffline->fstart);
1669 				if (prevseen) {
1670 					//same ID seen in the same locus/region
1671 					if (prevseen->createdByExon()) {
1672 						if (gff_warns && (prevseen->start<gffline->fstart ||
1673 								prevseen->end>gffline->fend))
1674 							GMessage("Warning: invalid coordinates for %s parent feature (ID=%s)\n", gffline->ftype, gffline->ID);
1675 						//an exon of this ID was given before
1676 						//this line has the main attributes for this ID
1677 						updateGffRec(prevseen, gffline);
1678 					}
1679 					else { //possibly a duplicate ID -- but this could also be a discontinuous feature according to GFF3 specs
1680 					    //e.g. a trans-spliced transcript - though segments should not overlap!
1681 						bool gtf_gene_dupID=(prevseen->isGene() && gffline->is_gtf_transcript);
1682 						if (prevseen->overlap(gffline->fstart, gffline->fend) && !gtf_gene_dupID) {
1683 							//in some GTFs a gene ID may actually be the same with the parented transcript ID (thanks)
1684 							//overlapping feature with same ID is going too far
1685 							GMessage("Error: discarding overlapping duplicate %s feature (%d-%d) with ID=%s\n", gffline->ftype,
1686 									gffline->fstart, gffline->fend, gffline->ID);
1687 							//validation_errors = true;
1688 							if (gff_warns) { //validation intent: just skip the feature, allow the user to see other errors
1689 								delete gffline;
1690 								gffline=NULL;
1691 								continue;
1692 							}
1693 							//else exit(1);
1694 						}
1695 						if (gtf_gene_dupID) {
1696 							//special GTF case where parent gene_id matches transcript_id (sigh)
1697 							prevseen=newGffRec(gffline, prevseen, NULL, prevgflst, true);
1698 						}
1699 						else {
1700 							//create a separate entry (true discontinuous feature)
1701 							prevseen=newGffRec(gffline, prevseen->parent, NULL, prevgflst);
1702 							if (gff_warns) {
1703 								GMessage("Warning: duplicate feature ID %s (%d-%d) (discontinuous feature?)\n",
1704 										gffline->ID, gffline->fstart, gffline->fend);
1705 							}
1706 						}
1707 					} //duplicate ID in the same locus
1708 				} //ID seen previously in the same locus
1709 			} //parent-like ID feature (non-exon)
1710 			if (gffline->parents==NULL) {
1711 				//top level feature (transcript, gene), no parents (or parents can be ignored)
1712 				if (!prevseen) newGffRec(gffline, NULL, NULL, prevgflst);
1713 			}
1714 			else { //--- it's a child feature (exon/CDS or even a mRNA with a gene as parent)
1715 				//updates all the declared parents with this child
1716 				bool found_parent=false;
1717 				if (gffline->is_gtf_transcript && prevseen && prevseen->parent) {
1718 					found_parent=true; //parent already found in special GTF case
1719 				}
1720 				else {
1721 					GffObj* newgfo=prevseen;
1722 					GPVec<GffObj>* newgflst=NULL;
1723 					GVec<int> kparents; //kept parents (non-discarded)
1724 					GVec< GPVec<GffObj>* > kgflst(false);
1725 					GPVec<GffObj>* gflst0=NULL;
1726 					for (int i=0;i<gffline->num_parents;i++) {
1727 						newgflst=NULL;
1728 						//if (transcriptsOnly && (
1729 						if (discarded_ids.Find(gffline->parents[i])!=NULL) continue;
1730 						if (!pFind(gffline->parents[i], newgflst))
1731 							continue; //skipping discarded parent feature
1732 						kparents.Add(i);
1733 						if (i==0) gflst0=newgflst;
1734 						kgflst.Add(newgflst);
1735 					}
1736 					if (gffline->num_parents>0 && kparents.Count()==0) {
1737 						kparents.cAdd(0);
1738 						kgflst.Add(gflst0);
1739 					}
1740 					for (int k=0;k<kparents.Count();k++) {
1741 						int i=kparents[k];
1742 						newgflst=kgflst[k];
1743 						GffObj* parentgfo=NULL;
1744 						if (gffline->is_transcript || gffline->exontype==exgffNone) {//likely a transcript
1745 							//parentgfo=gfoFind(gffline->parents[i], newgflst, gffline->gseqname,
1746 							//		gffline->strand, gffline->fstart, gffline->fend);
1747 							if (newgflst!=NULL && newgflst->Count()>0)
1748 								parentgfo = newgflst->Get(0);
1749 						}
1750 						else {
1751 							//for exon-like entities we only need a parent to be in locus distance,
1752 							//on the same strand
1753 							parentgfo=gfoFind(gffline->parents[i], newgflst, gffline->gseqname,
1754 									gffline->strand, gffline->fstart);
1755 						}
1756 						if (parentgfo!=NULL) { //parent GffObj parsed earlier
1757 							found_parent=true;
1758 							if ((parentgfo->isGene() || parentgfo->isTranscript()) && (gffline->is_transcript ||
1759 									 gffline->exontype==exgffNone)) {
1760 								//not an exon, but could be a transcript parented by a gene
1761 								// *or* by another transcript (! miRNA -> primary_transcript)
1762 								if (newgfo) {
1763 									updateParent(newgfo, parentgfo);
1764 								}
1765 								else {
1766 									newgfo=newGffRec(gffline, parentgfo);
1767 								}
1768 							}
1769 							else { //potential exon subfeature?
1770 								bool addingExon=false;
1771 								if (transcripts_Only) {
1772 									if (gffline->exontype>0) addingExon=true;
1773 								}
1774 								else { //always discard silly "intron" features
1775 									if (! (gffline->exontype==exgffIntron && (parentgfo->isTranscript() || parentgfo->exons.Count()>0)))
1776 									  addingExon=true;
1777 								}
1778 								if (addingExon)
1779 									if (!readExonFeature(parentgfo, gffline, &pex))
1780 									   validation_errors=true;
1781 
1782 							}
1783 						} //overlapping parent feature found
1784 					} //for each parsed parent Id
1785 					if (!found_parent) { //new GTF-like record starting directly here as a subfeature
1786 						//or it could be some chado GFF3 barf with exons coming BEFORE their parent :(
1787 						//or it could also be a stray transcript without a parent gene defined previously
1788 						//check if this feature isn't parented by a previously stored "child" subfeature
1789 						char* subp_name=NULL;
1790 						CNonExon* subp=NULL;
1791 						if (!gffline->is_transcript) { //don't bother with this check for obvious transcripts
1792 							if (pex.Count()>0) subp=subfPoolCheck(gffline, pex, subp_name);
1793 							if (subp!=NULL) { //found a subfeature that is the parent of this (!)
1794 								//promote that subfeature to a full GffObj
1795 								GffObj* gfoh=promoteFeature(subp, subp_name, pex);
1796 								//add current gffline as an exon of the newly promoted subfeature
1797 								if (!readExonFeature(gfoh, gffline, &pex))
1798 									validation_errors=true;
1799 							}
1800 						}
1801 						if (subp==NULL) { //no parent subfeature seen before
1802 							//loc_debug=true;
1803 							GffObj* ngfo=prevseen;
1804 							if (ngfo==NULL) {
1805 								//if it's an exon type, create directly the parent with this exon
1806 								//but if it's recognized as a transcript, the object itself is created
1807 								ngfo=newGffRec(gffline, NULL, NULL, newgflst);
1808 							}
1809 							if (!ngfo->isTranscript() &&
1810 									gffline->ID!=NULL && gffline->exontype==0)
1811 								subfPoolAdd(pex, ngfo);
1812 							//even those with errors will be added here!
1813 						}
1814 						GFREE(subp_name);
1815 					} //no previous parent found
1816 				}
1817 			} //parented feature
1818 			//--
1819 			delete gffline;
1820 			gffline=NULL;
1821 		}//while gff lines
1822 	}
1823 	if (gflst.Count()>0) {
1824 		gflst.finalize(this); //force sorting by locus if so constructed
1825 	}
1826 	// all gff records are now loaded in GList gflst
1827 	// so we can free the hash
1828 	phash.Clear();
1829 	//tids.Clear();
1830 	if (validation_errors) {
1831 		exit(1);
1832 	}
1833 }
1834 
finalize(GffReader * gfr)1835 void GfList::finalize(GffReader* gfr) { //if set, enforce sort by locus
1836   GList<GffObj> discarded(false,true,false);
1837   for (int i=0;i<Count();i++) {
1838     //finalize the parsing of each GffObj
1839     fList[i]->finalize(gfr);
1840     if (fList[i]->isDiscarded()) {
1841        discarded.Add(fList[i]);
1842        //inform parent that thiis child is removed
1843        if (fList[i]->parent!=NULL) {
1844     	   GPVec<GffObj>& pchildren=fList[i]->parent->children;
1845     	   for (int c=0;c<pchildren.Count();c++) {
1846     		   if (pchildren[c]==fList[i]) {
1847     			   pchildren.Delete(c);
1848     			   break;
1849     		   }
1850     	   }
1851        }
1852        if (fList[i]->children.Count()>0) { //inform children that the parent was removed
1853       	 for (int c=0;c<fList[i]->children.Count();c++) {
1854       		 fList[i]->children[c]->parent=NULL;
1855       		 if (gfr->keep_Attrs)
1856       			 //inherit the attributes of discarded parent (e.g. pseudo=true; )
1857       			 fList[i]->children[c]->copyAttrs(fList[i]);
1858       	 }
1859        }
1860        this->Forget(i);
1861     }
1862   }
1863   if (discarded.Count()>0) {
1864           this->Pack();
1865   }
1866   if (gfr->sortByLoc) {
1867     this->setSorted(false);
1868     if (gfr->refAlphaSort)
1869       this->setSorted((GCompareProc*)gfo_cmpByLoc);
1870     else
1871       this->setSorted((GCompareProc*)gfo_cmpRefByID);
1872   }
1873 }
1874 
reduceExonAttrs(GList<GffExon> & segs)1875 bool GffObj::reduceExonAttrs(GList<GffExon>& segs) {
1876 	bool attrs_discarded=false;
1877 	for (int a=0;a<segs[0]->attrs->Count();a++) {
1878 		int attr_id=segs[0]->attrs->Get(a)->attr_id;
1879 		char* attr_name=names->attrs.getName(attr_id);
1880 		char* attr_val =segs[0]->attrs->Get(a)->attr_val;
1881 		bool sameExonAttr=true;
1882 		bool discardAll=(GstrEq("exon_id", attr_name) || GstrEq("exon_number", attr_name));
1883 		if (!discardAll)
1884 			for (int i=1;i<segs.Count();i++) {
1885 				char* ov=segs[i]->getAttr(attr_id);
1886 				if (ov==NULL || (strcmp(ov,attr_val)!=0)) {
1887 					sameExonAttr=false;
1888 					break;
1889 				}
1890 			}
1891 		if (sameExonAttr) {
1892 			//delete this attribute from exon level
1893 			attrs_discarded=true;
1894 			if (!discardAll) {
1895 				//add the attribute to transcript level
1896 				//rename it if it exists and is different for the transcript!
1897 				char* t_val=NULL;
1898 				bool same_aval=false;
1899 				if (this->attrs!=NULL &&
1900 						(t_val=this->attrs->getAttr(attr_id))!=NULL) {
1901 					//same attribute name already exists for the transcript!
1902 					//write it using CDS_ or exon_ prefix
1903 					same_aval=(strcmp(attr_val, t_val)==0);
1904 					if (!same_aval) {
1905 						//add renamed attribute
1906 						const char* prefix = (&segs==cdss) ? "CDS_" : "exon_";
1907 						char* new_attr_name=NULL;
1908 						GMALLOC(new_attr_name, strlen(prefix)+strlen(attr_name)+1);
1909 						new_attr_name[0]=0;
1910 						strcat(new_attr_name, prefix);
1911 						strcat(new_attr_name, attr_name);
1912 						this->attrs->add_or_update(names, new_attr_name, attr_val);
1913 						GFREE(new_attr_name);
1914 					}
1915 				}
1916 				else { //no such attribute exists for the transcript, copy it from the exon
1917 						this->addAttr(attr_name, attr_val);
1918 				}
1919 			}
1920 			for (int i=1;i<segs.Count();i++) {
1921 				removeExonAttr(*(segs[i]), attr_id);
1922 			}
1923 			segs[0]->attrs->freeItem(a);
1924 		} //sameExonAttr
1925 	}
1926 	if (attrs_discarded) segs[0]->attrs->Pack();
1927 	return attrs_discarded;
1928 }
1929 //return the segs index of segment containing coord:
whichExon(uint coord,GList<GffExon> * segs)1930 int GffObj::whichExon(uint coord, GList<GffExon>* segs) {
1931 	 //segs MUST be sorted by GSeg order (start coordinate)
1932 	if (segs==NULL) segs=&exons;
1933 	if (segs->Count()==0) return -1;
1934 	if (coord<segs->First()->start || coord>segs->Last()->end)
1935 		return -1;
1936 	if (segs->Count()<6) {
1937 		//simple scan
1938 		for (int i=0;i<segs->Count();i++)
1939 			if ((*segs)[i]->overlap(coord))
1940 				return i;
1941 		return -1;
1942 	}
1943 	else { //use quick search
1944 		int i=0;
1945 		int l=0; //lower boundary
1946 		int h=segs->Count()-1; //higher boundary
1947 		while (l<=h) {
1948 			i = (l+h) >> 1; //range midpoint
1949 			if (coord > segs->Get(i)->end)
1950 				l=i+1;
1951 			else { //coord <= segs->Get(i)->end
1952 				if (coord >= segs->Get(i)->start) {
1953 					return i;
1954 				}
1955 				//here: coord < segs->Get(i)->start
1956 				h = i-1;
1957 			}
1958 		}
1959 	}
1960 	return -1;
1961 }
1962 
processGeneSegments(GffReader * gfr)1963 bool GffObj::processGeneSegments(GffReader* gfr) {
1964 	/* procedure:
1965 	 1)store the info about any X_gene_segment entries in a GVec<int>
1966 	     (just storing their index in gene->children[] list)
1967      2)for each CDS, group them by ID in GHash<GeneCDSChain> (and a GPVec<GeneCDSChain> for storage)
1968      3)for each GeneCDSChain, collect _gene_segments having a containment-relationship and rank them by lowest noncov
1969      4)for each GeneCDSChain, pick best _gene_segment match (if any) and transfer CDSs to it
1970 	*/
1971     GVec<int> geneSegs; //X_gene_segment features (children transcripts of this gene)
1972     GHash<GeneCDSChain> cdsChainById(false); // hash of CDS chains: CDS feature grouped by ID
1973     GPVec<GeneCDSChain> cdsChains; // CDS chains storage
1974 	if (cdss==NULL || cdss->Count()==0 || children.Count()==0)
1975 		return false; //we shouldn't be here
1976 	//check if we have any _gene_segment children for this gene
1977     for (int i=0;i<children.Count();i++)
1978     	if (children[i]->flag_GENE_SEGMENT) {
1979     		if (children[i]->hasCDS() || children[i]->cdss!=NULL) {
1980     			GMessage("Warning: will not transfer CDS from %s to gene_segment %s which already has its own\n",
1981     					gffID, children[i]->gffID);
1982     			continue;
1983     		}
1984     		geneSegs.Add(i);
1985     	}
1986     if (geneSegs.Count()==0) {
1987        if (gfr->gff_warns)
1988     	   GMessage("Warning: gene %s has CDS and transcripts but no suitable _gene_segment features\n",gffID);
1989        return false; //nothing to do
1990     }
1991     //group CDSs into CDS chains by their ID:
1992     for (int i=0;i<cdss->Count();i++) {
1993     	char* id=(char*)(cdss->Get(i)->uptr);
1994     	if (id==NULL) continue; //should never happen
1995     	GeneCDSChain *gcc=cdsChainById.Find(id);
1996     	if (gcc!=NULL)
1997              gcc->addCDS(i, cdss->Get(i)->start, cdss->Get(i)->end);
1998     	else { //new CDS chain:
1999     	     gcc=new GeneCDSChain(i, cdss->Get(i)->start, cdss->Get(i)->end);
2000     	     cdsChains.Add(gcc);
2001     	     cdsChainById.shkAdd(id, gcc);
2002     	}
2003     }
2004     for (int i=0;i<cdss->Count();i++) {
2005  	   GFREE(cdss->Get(i)->uptr); //no CDS ID no longer needed
2006     }
2007 
2008     //collect _gene_segment containers for each CDS chain
2009     int cds_moved=0;
2010     for (int i=0;i<cdsChains.Count();i++) {
2011     	GeneCDSChain &gc=*(cdsChains[i]);
2012         for (int si=0;si<geneSegs.Count();si++) {
2013         	GffObj& t=*(children[geneSegs[si]]);
2014         	//if (t.hasCDS() || t.cdss!=NULL) continue; //already transferred
2015         	int novl=-1;
2016         	if (gc.containedBy(t, novl))
2017         		gc.addMatch(geneSegs[si], novl, si);
2018         }
2019         //-- assess the collected matches for this CDS chain:
2020         if (gc.mxs.Count()==0) {
2021     		GMessage("Warning: could not find the corresponding gene segment for CDS chain %d-%d of gene %s\n",
2022     		    				gc.start, gc.end, gffID);
2023         	continue;
2024         }
2025         GffObj* t=children[gc.mxs.First().child_idx];
2026 		for (int c=0;c<gc.cdsList.Count();c++) {
2027 			t->transferCDS(cdss->Get(gc.cdsList[c].idx));
2028 			cdss->Forget(gc.cdsList[c].idx);
2029 			cds_moved++;
2030 		}
2031 		// also remove it from the list of gene_segments to be mapped
2032 		geneSegs.Delete(gc.mxs.First().gsegidx); //assigned, should no longer be checked against other CDS chains
2033 		if (t->isFinalized()) t->finalize(gfr);
2034 
2035     }
2036     if (cds_moved>0) cdss->Pack();
2037     if (cdss->Count()==0) {
2038     	delete cdss;
2039     	cdss=NULL;
2040     	if (exons.Count()==0) isTranscript(false);
2041     }
2042 	return true;
2043 }
2044 
finalize(GffReader * gfr)2045 GffObj* GffObj::finalize(GffReader* gfr) {
2046 	if (this->createdByExon() && this->end-this->start<10 && this->exons.Count()<=1) {
2047 		//? misleading exon-like feature parented by an exon or CDS mistakenly
2048 		//  interpreted as a standalone transcript
2049 		// example: GENCODE gff3 feature "stop_codon_redefined_as_selenocysteine" which is
2050 		// parented by a CDS !
2051 		if (cdss==NULL || cdss->Count()<=1) {
2052 			if (gfr->showWarnings()) {
2053 			 GMessage("Warning: discarding suspicious '%s' record (ID=%s)\n",this->getFeatureName(),gffID);
2054 			}
2055 		   isDiscarded(true);
2056 		}
2057 	}
2058 	if (!isDiscarded()) {
2059 		bool noExons=(exons.Count()==0 && (cdss==NULL || cdss->Count()==0));
2060 		if (noExons) {
2061 			if (isTranscript() || (isGene() && children.Count()==0 && gfr->gene2exon)) {
2062 				//add exon feature to an exonless transcript/gene
2063 				addExon(this->start, this->end, exgffExon);
2064 				//effectively this becomes a transcript (even childless genes if gene2exon)
2065 				isTranscript(true);
2066 			}
2067 		}
2068 		else { //it has exons or CDSs
2069 			if (cdss!=NULL && isGene() && children.Count()>0) {
2070 				//check for X_gene_segment processing
2071 				processGeneSegments(gfr);//distribute the cdss to children _gene_segments
2072 			} // _gene_segment processing
2073 		}
2074 	}
2075 	if (cdss!=NULL && isGene()) //in case we stored IDs for gene_segment features
2076 		for (int i=0;i<cdss->Count();i++) {
2077 			GFREE(cdss->Get(i)->uptr);
2078 		}
2079 	if (gfr->transcripts_Only && !isTranscript() &&
2080 			!(gfr->keep_Genes && isGene())) {
2081 		//discard non-transcripts, unless it's a gene and keepGenes was specified
2082 		isDiscarded(true);
2083 	}
2084 	isFinalized(true);
2085 
2086 	if (isDiscarded()) {
2087 		//just in case we have cds with uptr in use (X_gene_segment), free them
2088 		uptr=NULL;
2089 		udata=0;
2090 		return this;
2091 	}
2092 	if (isTranscript()) {
2093 		isCDSOnly(cdss!=NULL && exons.Count()==0 && cdss->Count()>0);
2094 		subftype_id=isCDSOnly() ? gff_fid_CDS : gff_fid_exon;
2095 	}
2096 	if (cdss!=NULL && cdss->Count()>0) {
2097 		CDstart=cdss->First()->start;
2098 		CDend=cdss->Last()->end;
2099 		CDphase=(strand=='-')? cdss->Last()->phase : cdss->First()->phase;
2100 		bool updatePhase=(CDphase=='.' || CDphase==0);
2101 		if (!updatePhase)
2102 			for (int i=0;i<cdss->Count();++i)
2103 				if ((*cdss)[i]->phase<'0') {
2104 					updatePhase=true;
2105 					break;
2106 				}
2107 		if (updatePhase) updateCDSPhase(*cdss);
2108 		//there are GFFs out there which only provide UTR and CDS records instead of full exons
2109 		//so make sure we add all CDS segments to exons, if they are not already there
2110 		for (int i=0;i<cdss->Count();++i) {
2111 			int eidx=addExon((*cdss)[i]->start, (*cdss)[i]->end, exgffExon, 0, (*cdss)[i]->score);
2112 			if (eidx<0) GError("Error: could not reconcile CDS %d-%d with exons of transcript %s\n",
2113 					(*cdss)[i]->start, (*cdss)[i]->end, gffID);
2114 		}
2115 	}
2116 	else if (CDstart==0) {//no CDS, no phase
2117 		CDphase=0;
2118 		CDend=0;
2119 	}
2120 	//-- attribute reduction for some records which
2121 	//   repeat the exact same attr=value for every exon
2122 	bool reduceAttributes=(gfr->keep_Attrs && !gfr->noExonAttrs &&
2123 			!gfr->keep_AllExonAttrs && exons.Count()>0 && exons[0]->attrs!=NULL);
2124 	if (reduceAttributes) {
2125 		//for each attribute of the 1st exon, if it has the
2126 		//same value for all other exons, move it to transcript level
2127 		//bool reduced=reduceExonAttrs(exons);
2128 		reduceExonAttrs(exons);
2129 		//if (gfr->showWarnings() && reduced)
2130 		//	GMessage("Info: duplicate exon attributes reduced for %s\n", gffID);
2131 		//do the same for CDS segments, if any
2132 		if (cdss!=NULL && cdss->Count()>0 && (*cdss)[0]->attrs!=NULL) {
2133 			//reduced=
2134 			reduceExonAttrs(*cdss);
2135 			//if (gfr->showWarnings() && reduced)
2136 			//	GMessage("Info: duplicate CDS attributes reduced for %s\n", gffID);
2137 		}
2138 	}
2139 	//merge close exons if requested
2140 	if (exons.Count()>0 && isTranscript()) {
2141 		if (gfr->merge_CloseExons) {
2142 			for (int i=0;i<exons.Count()-1;i++) {
2143 				int ni=i+1;
2144 				uint mend=exons[i]->end;
2145 				while (ni<exons.Count()) {
2146 					int dist=(int)(exons[ni]->start-mend-1); //<0 = overlap, 0 = adjacent, >0 = bases apart
2147 					if (dist>GFF_MIN_INTRON) break; //no merging with next segment
2148 					if (gfr!=NULL && gfr->gff_warns && dist!=0 && (exons[ni]->exontype!=exgffUTR && exons[i]->exontype!=exgffUTR)) {
2149 						GMessage("Warning: merging adjacent/overlapping segments (distance=%d) of %s on %s (%d-%d, %d-%d)\n",
2150 								dist, gffID, getGSeqName(), exons[i]->start, exons[i]->end,exons[ni]->start, exons[ni]->end);
2151 					}
2152 					mend=exons[ni]->end;
2153 					exons[i]->end=mend;
2154 					if (exons[ni]->attrs!=NULL && (exons[i]->attrs==NULL ||
2155 							exons[i]->attrs->Count()<exons[ni]->attrs->Count())) {
2156 						//use the other exon attributes, if it has more
2157 						delete(exons[i]->attrs);
2158 						exons[i]->attrs=exons[ni]->attrs;
2159 						exons[ni]->attrs=NULL;
2160 					}
2161 					exons.Delete(ni);
2162 				} //check for merge with next exon
2163 			} //for each exon
2164 		} //merge close exons
2165 		if (isCDSOnly() && exons.Count()!=cdss->Count())
2166 			isCDSOnly(false);
2167 	}
2168 	//-- check features vs their exons' span
2169 	if (isTranscript()) {
2170 	   if (exons.Count()>0) {
2171 		 if (gfr->gff_warns && (this->start!=exons.First()->start ||
2172 				 this->end!=exons.Last()->end) )
2173 			 GMessage("Warning: adjusted transcript %s boundaries according to terminal exons.\n",
2174 					 gffID);
2175 	     this->start=exons.First()->start;
2176 	     this->end=exons.Last()->end;
2177 	   }
2178 	}
2179 	else { //non-transcripts just have to be at least as wide as their sub-features
2180 	  if (exons.Count()>0) {
2181 		  bool adj=false;
2182 		  if (this->start>exons.First()->start) {
2183 			  this->start=exons.First()->start;
2184 			  adj=true;
2185 		  }
2186 		  if (this->end<exons.Last()->end) {
2187 			  this->end=exons.First()->end;
2188 			  adj=true;
2189 		  }
2190 		  if (gfr->gff_warns && adj)
2191 			  GMessage("Warning: adjusted %s %s boundaries according to terminal sub-features.\n",
2192 			  					 this->getFeatureName(), gffID);
2193 	  }
2194 	}
2195 	//-- update covlen
2196 	covlen=0;
2197 	for (int i=0;i<exons.Count();++i) covlen+=exons[i]->len();
2198 	//-- check if CDS segments are different from exons and thus worth keeping separately in cdss
2199 	if (cdss!=NULL && cdss->Count()>0) {
2200 		bool cds_exComp=true; //CDSs are exon-compatible (no need to keep them separately)
2201 		if (cdss->Count()==1) {
2202 			//check that the CDS segment is within a single exon
2203 			int start_eidx=-1;
2204 			int end_eidx=-1;
2205 			for (int i=0;i<exons.Count();i++) {
2206 				//GMessage("[DBG:] checking if CDS %d-%d is within exon %d-%d\n", CDstart, CDend, exons[i]->start,
2207 				//		exons[i]->end);
2208 				if (CDstart>=exons[i]->start && CDstart<=exons[i]->end) {
2209 					start_eidx=i;
2210 				}
2211 				if (CDend>=exons[i]->start || CDend<=exons[i]->end ) {
2212 					end_eidx=i;
2213 				}
2214 				if (start_eidx>=0 && end_eidx>=0) break;
2215 			}
2216 			cds_exComp=(start_eidx==end_eidx && start_eidx>=0);
2217 			if (!cds_exComp) GMessage("Warning: transcript %s has incorrect CDS segment definition (%d-%d)!\n",
2218 					gffID, CDstart, CDend);
2219 			cds_exComp=true; //just to free cdss, even though it's wrong
2220 		} else {
2221 			if (cdss->Count()>exons.Count()) {
2222 				cds_exComp=false;
2223 			} else { //2 or more CDS segments
2224 				//CDSs should be intron compatible with exons, and CDS ends should be within exons
2225 				int imax=exons.Count()-1;
2226 				int jmax=cdss->Count()-1;
2227 				int i=0;
2228 				int j=0;
2229 				//find which exon has CDstart
2230 				for (i=0;i<=imax;++i)
2231 					if (CDstart>=exons[i]->start
2232 							&& CDstart<=exons[i]->end) break;
2233 				if (i>imax) cds_exComp=false;
2234 				else { //check the introns now
2235 					while (i<imax && j<jmax) {
2236 						if (exons[i]->end!=(*cdss)[j]->end ||
2237 								exons[i+1]->start!=(*cdss)[j+1]->start) {
2238 							cds_exComp=false;
2239 							break;
2240 						}
2241 						++i;
2242 						++j;
2243 					}
2244 					//now j must be the last segment of cdss and CDend must be within exon[i]
2245 					if (cds_exComp)
2246 						if (j!=jmax || CDend>exons[i]->end || CDend<exons[i]->start)
2247 							cds_exComp=false;
2248 				}
2249 			}
2250 		} //multiple CDS segments
2251 		if (cds_exComp) {
2252 			if (isCDSOnly() && cdss->Count()==exons.Count())
2253 				for (int i=0;i<cdss->Count();i++)
2254 					exons[i]->phase=cdss->Get(i)->phase;
2255 			if (gfr->keep_Attrs && !gfr->noExonAttrs) {
2256 				int eidx=whichExon((*cdss)[0]->start, &exons);
2257 				if (eidx<0)
2258 					GError("Error finding CDS coordinate inside exons (?) for %s\n",
2259 						    gffID);
2260 				for (int i=0;i<cdss->Count();i++) {
2261 					if (isCDSOnly()) //eidx should be the same with i
2262 						exons[eidx]->phase=cdss->Get(i)->phase;
2263 					if ((*cdss)[i]->attrs!=NULL && (*cdss)[i]->attrs->Count()>0) {
2264 						if (exons[eidx]->attrs==NULL)
2265 							exons[eidx]->attrs=new GffAttrs();
2266 						exons[eidx]->attrs->copyAttrs((*cdss)[i]->attrs, true);
2267 						if (exons[eidx]->attrs->Count()==0) {
2268 							delete exons[eidx]->attrs;
2269 							exons[eidx]->attrs=NULL;
2270 						}
2271 					}
2272 					++eidx;
2273 				}
2274 			}
2275 			delete cdss;
2276 			cdss=NULL;
2277 			//this->isXCDS(false);
2278 		} else this->isXCDS(true);
2279 	}//cdss check
2280 
2281 	//--- collect stats for the reference genomic sequence
2282 	if (gfr->gseqtable.Count()<=gseq_id) {
2283 		gfr->gseqtable.setCount(gseq_id+1);
2284 	}
2285 	GSeqStat* gsd=gfr->gseqtable[gseq_id];
2286 	if (gsd==NULL) {
2287 		gsd=new GSeqStat(gseq_id,names->gseqs.getName(gseq_id));
2288 		//gfr->gseqtable.Put(gseq_id, gsd);
2289 		gfr->gseqtable[gseq_id]=gsd;
2290 		gfr->gseqStats.Add(gsd);
2291 	}
2292 	gsd->fcount++;
2293 	if (start<gsd->mincoord) gsd->mincoord=start;
2294 	if (end>gsd->maxcoord) gsd->maxcoord=end;
2295 	if (this->len()>gsd->maxfeat_len) {
2296 		gsd->maxfeat_len=this->len();
2297 		gsd->maxfeat=this;
2298 	}
2299 	uptr=NULL;
2300 	udata=0;
2301 	return this;
2302 }
2303 
printExonList(FILE * fout)2304 void GffObj::printExonList(FILE* fout) {
2305 	//print comma delimited list of exon intervals
2306 	for (int i=0;i<exons.Count();++i) {
2307 		if (i>0) fprintf(fout, ",");
2308 		fprintf(fout, "%d-%d",exons[i]->start, exons[i]->end);
2309 	}
2310 }
2311 
printCDSList(FILE * fout)2312 void GffObj::printCDSList(FILE* fout) {
2313 	//print comma delimited list of CDS intervals
2314 	if (!hasCDS()) return;
2315 	GVec<GffExon> cds;
2316 	this->getCDSegs(cds); //also uses/prepares the CDS phase for each CDS segment
2317 	for (int i=0;i<cds.Count();i++) {
2318 		if (i>0) fprintf(fout, ",");
2319 		fprintf(fout, "%d-%d", cds[i].start, cds[i].end);
2320 	}
2321 }
2322 
BED_addAttribute(FILE * fout,int & acc,const char * format,...)2323 void BED_addAttribute(FILE* fout, int& acc, const char* format,... ) {
2324 	++acc;
2325 	if (acc==1) fprintf(fout, "\t");
2326 	       else fprintf(fout, ";");
2327     va_list arguments;
2328     va_start(arguments,format);
2329     vfprintf(fout,format,arguments);
2330     va_end(arguments);
2331 }
2332 
printBED(FILE * fout,bool cvtChars,char * dbuf,int dbuf_len)2333 void GffObj::printBED(FILE* fout, bool cvtChars, char* dbuf, int dbuf_len) {
2334 //print a BED-12 line + GFF3 attributes in 13th field
2335  int cd_start=CDstart>0? CDstart-1 : start-1;
2336  int cd_end=CDend>0 ? CDend : end;
2337  char cdphase=(CDphase>0) ? CDphase : '0';
2338  fprintf(fout, "%s\t%d\t%d\t%s\t%d\t%c\t%d\t%d\t%c,0,0", getGSeqName(), start-1, end, getID(),
2339 		 100, strand, cd_start, cd_end, cdphase);
2340  if (exons.Count()>0) {
2341 	 int i;
2342 	 fprintf(fout, "\t%d\t", exons.Count());
2343 	 for (i=0;i<exons.Count();++i)
2344 		 fprintf(fout,"%d,",exons[i]->len());
2345 	 fprintf(fout, "\t");
2346 	 for (i=0;i<exons.Count();++i)
2347 		 fprintf(fout,"%d,",exons[i]->start-start);
2348  } else { //no-exon feature(!), shouldn't happen
2349 	 fprintf(fout, "\t1\t%d,\t0,", len());
2350  }
2351  //now add the GFF3 attributes for in the 13th field
2352  int numattrs=0;
2353  if (CDstart>0) BED_addAttribute(fout, numattrs,"CDS=%d:%d",CDstart-1, CDend);
2354  if (CDphase>0) BED_addAttribute(fout, numattrs,"CDSphase=%c", CDphase);
2355  if (geneID!=NULL)
2356 	 BED_addAttribute(fout, numattrs, "geneID=%s",geneID);
2357  if (gene_name!=NULL)
2358     fprintf(fout, ";gene_name=%s",gene_name);
2359  if (attrs!=NULL) {
2360     for (int i=0;i<attrs->Count();i++) {
2361       const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
2362       const char* attrval=attrs->Get(i)->attr_val;
2363       if (attrval==NULL || attrval[0]=='\0') {
2364     	  BED_addAttribute(fout, numattrs,"%s",attrname);
2365     	  continue;
2366       }
2367       if (cvtChars) {
2368     	  decodeHexChars(dbuf, attrval, dbuf_len-1);
2369     	  BED_addAttribute(fout, numattrs, "%s=%s", attrname, dbuf);
2370       }
2371       else
2372     	  BED_addAttribute(fout, numattrs,"%s=%s", attrname, attrs->Get(i)->attr_val);
2373     }
2374  }
2375  fprintf(fout, "\n");
2376 }
2377 
parseAttrs(GffAttrs * & atrlist,char * info,bool isExon,bool CDSsrc)2378 void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool isExon, bool CDSsrc) {
2379   if (names==NULL)
2380      GError(ERR_NULL_GFNAMES, "parseAttrs()");
2381   if (atrlist==NULL) {
2382       atrlist=new GffAttrs();
2383   }
2384   bool exon2transcript=(isExon && atrlist==this->attrs);
2385   char* endinfo=info+strlen(info);
2386   char* start=info;
2387   char* pch=start;
2388   while (start<endinfo) {
2389     while (*start==' ' && start<endinfo) start++;
2390     pch=strchr(start, ';');
2391     if (pch==NULL) pch=endinfo;
2392        else {  *pch='\0'; pch++; }
2393     char* ech=strchr(start,'=');
2394     if (ech!=NULL) { // attr=value format found
2395        *ech='\0';
2396        if (exon2transcript) { //we do NOT want these exon attributes at transcript level
2397           if (startsiWith(start, "exon_") || strcmp(start, "exon")==0)  {
2398         	  start=pch; continue;
2399           }
2400        }
2401        ech++;
2402        while (*ech==' ' && ech<endinfo) ech++;//skip extra spaces after the '='
2403        /*
2404        if (isExon && startsiWith(start, "protein")) {
2405          //--protein info should never be left at exon level
2406          // ( attribute policing much? :p )
2407          this->addAttr(start, ech);
2408          start=pch;
2409          continue;
2410        }
2411        */
2412        if (exon2transcript)
2413             atrlist->add_if_new(this->names, start, ech); //never override transcript attribute with exon's
2414        else atrlist->add_or_update(this->names, start, ech, CDSsrc); //overwrite previous attr with the same name
2415     }
2416     start=pch;
2417   } //while info characters
2418   if (atrlist->Count()==0) { delete atrlist; atrlist=NULL; }
2419 }
2420 
addAttr(const char * attrname,const char * attrvalue)2421 void GffObj::addAttr(const char* attrname, const char* attrvalue) {
2422   if (this->attrs==NULL)
2423       this->attrs=new GffAttrs();
2424   //this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue));
2425   this->attrs->add_or_update(names, attrname, attrvalue);
2426 }
2427 
copyAttrs(GffObj * from)2428 void GffObj::copyAttrs(GffObj* from) { //typically from is the parent gene, and this is a transcript
2429 	if (from==NULL || from->attrs==NULL || from->attrs->Count()==0) return;
2430 	if (this->attrs==NULL) {
2431 		this->attrs=new GffAttrs();
2432 	}
2433 	//special RefSeq case
2434 	int desc_attr_id=names->attrs.getId("description"); //from gene
2435 	int prod_attr_id=names->attrs.getId("product"); //from transcript (this)
2436 	char* prod = (prod_attr_id>=0) ? this->attrs->getAttr(prod_attr_id) : NULL;
2437 
2438 	for (int i=0;i<from->attrs->Count();++i) {
2439 		//this->attrs->add_no_update(names, from->attrs->Get(i)->attr_id, from->attrs->Get(i)->attr_val);
2440 		int aid=from->attrs->Get(i)->attr_id;
2441 		//special case for GenBank refseq genes vs transcripts:
2442 		if (prod && aid==desc_attr_id && strcmp(from->attrs->getAttr(desc_attr_id), prod)==0)
2443 			continue; //skip description if product already there and the same
2444 		bool haveit=false;
2445 		for (int ai=0;ai<this->attrs->Count();++ai) {
2446 			//do we have it already?
2447 			if (aid==this->attrs->Get(ai)->attr_id) {
2448 				haveit=true;
2449 				break; //skip this, don't replace
2450 			}
2451 		}
2452 		if (!haveit)
2453 			this->attrs->Add(new GffAttr(aid, from->attrs->Get(i)->attr_val));
2454 	}
2455 }
2456 
setFeatureName(const char * feature)2457 void GffObj::setFeatureName(const char* feature) {
2458  //change the feature name/type for a transcript
2459  int fid=names->feats.addName(feature);
2460  if (monoFeature() && exons.Count()>0)
2461     this->subftype_id=fid;
2462  this->ftype_id=fid;
2463 }
2464 
setRefName(const char * newname)2465 void GffObj::setRefName(const char* newname) {
2466  //change the feature name/type for a transcript
2467  int rid=names->gseqs.addName(newname);
2468  this->gseq_id=rid;
2469 }
2470 
2471 
2472 
removeAttr(const char * attrname,const char * attrval)2473 int GffObj::removeAttr(const char* attrname, const char* attrval) {
2474   if (this->attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
2475   int aid=this->names->attrs.getId(attrname);
2476   if (aid<0) return 0;
2477   int delcount=0;  //could be more than one ?
2478   for (int i=0;i<this->attrs->Count();i++) {
2479      if (aid==this->attrs->Get(i)->attr_id) {
2480        if (attrval==NULL ||
2481           strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
2482              delcount++;
2483              this->attrs->freeItem(i);
2484              }
2485        }
2486      }
2487   if (delcount>0) this->attrs->Pack();
2488   return delcount;
2489 }
2490 
removeAttr(int aid,const char * attrval)2491 int GffObj::removeAttr(int aid, const char* attrval) {
2492   if (this->attrs==NULL || aid<0) return 0;
2493   int delcount=0;  //could be more than one ?
2494   for (int i=0;i<this->attrs->Count();i++) {
2495      if (aid==this->attrs->Get(i)->attr_id) {
2496        if (attrval==NULL ||
2497           strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
2498              delcount++;
2499              this->attrs->freeItem(i);
2500              }
2501        }
2502      }
2503   if (delcount>0) this->attrs->Pack();
2504   return delcount;
2505 }
2506 
2507 
removeExonAttr(GffExon & exon,const char * attrname,const char * attrval)2508 int GffObj::removeExonAttr(GffExon& exon, const char* attrname, const char* attrval) {
2509   if (exon.attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
2510   int aid=this->names->attrs.getId(attrname);
2511   if (aid<0) return 0;
2512   int delcount=0;  //could be more than one
2513   for (int i=0;i<exon.attrs->Count();i++) {
2514      if (aid==exon.attrs->Get(i)->attr_id) {
2515        if (attrval==NULL ||
2516           strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
2517              delcount++;
2518              exon.attrs->freeItem(i);
2519              }
2520        }
2521      }
2522   if (delcount>0) exon.attrs->Pack();
2523   return delcount;
2524 }
2525 
removeExonAttr(GffExon & exon,int aid,const char * attrval)2526 int GffObj::removeExonAttr(GffExon& exon, int aid, const char* attrval) {
2527   if (exon.attrs==NULL || aid<0) return 0;
2528   int delcount=0;  //could be more than one
2529   for (int i=0;i<exon.attrs->Count();i++) {
2530      if (aid==exon.attrs->Get(i)->attr_id) {
2531        if (attrval==NULL ||
2532           strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
2533              delcount++;
2534              exon.attrs->freeItem(i);
2535              }
2536        }
2537      }
2538   if (delcount>0) exon.attrs->Pack();
2539   return delcount;
2540 }
2541 
2542 
getUnspliced(GFaSeqGet * faseq,int * rlen,GMapSegments * seglst)2543 char* GffObj::getUnspliced(GFaSeqGet* faseq, int* rlen, GMapSegments* seglst) {
2544 
2545     if (faseq==NULL) { GMessage("Warning: getUnspliced(NULL,.. ) called!\n");
2546         return NULL;
2547     }
2548     //restore normal coordinates:
2549     if (exons.Count()==0) return NULL;
2550     int fspan=end-start+1;
2551     const char* gsubseq=faseq->subseq(start, fspan);
2552     if (gsubseq==NULL) {
2553         GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
2554     }
2555     char* unspliced=NULL;
2556 
2557     int seqstart=exons.First()->start;
2558     int seqend=exons.Last()->end;
2559 
2560     int unsplicedlen = 0;
2561     if (seglst)
2562     	seglst->Clear(strand);
2563     unsplicedlen += seqend - seqstart + 1;
2564 
2565     GMALLOC(unspliced, unsplicedlen+1); //allocate more here
2566     //uint seqstart, seqend;
2567     int s = 0; //resulting nucleotide counter
2568     if (strand=='-')
2569     {
2570         if (seglst!=NULL)
2571             seglst->add(s+1,s+1+seqend-seqstart, seqstart, seqend);
2572         for (int i=seqend;i>=seqstart;i--)   {
2573             unspliced[s] = ntComplement(gsubseq[i-start]);
2574             s++;
2575         }//for each nt
2576     } // - strand
2577     else {
2578       // + strand
2579         if (seglst!=NULL)
2580             seglst->add(s+1,s+1+seqend-seqstart, seqstart, seqend);
2581         for (int i=seqstart;i<=seqend;i++) {
2582             unspliced[s]=gsubseq[i-start];
2583             s++;
2584         }//for each nt
2585     } // + strand
2586     //assert(s <= unsplicedlen);
2587     unspliced[s]=0;
2588     if (rlen!=NULL) *rlen=s;
2589     return unspliced;
2590 }
2591 
addPadding(int padLeft,int padRight)2592  void GffObj::addPadding(int padLeft, int padRight) {
2593 	 this->start-=padLeft;
2594 	 this->end+=padRight;
2595 	 if (exons.Count()>0) {
2596 		 exons[0]->start-=padLeft;
2597 		 exons.Last()->end+=padRight;
2598 	 }
2599 	 covlen+=padLeft+padRight;
2600  }
2601 
removePadding(int padLeft,int padRight)2602  void GffObj::removePadding(int padLeft, int padRight) {
2603 	 this->start+=padLeft;
2604 	 this->end-=padRight;
2605 	 if (exons.Count()>0) {
2606 		 exons[0]->start+=padLeft;
2607 		 exons.Last()->end-=padRight;
2608 	 }
2609 	 covlen-=padLeft+padRight;
2610  }
2611 
getSpliced(GFaSeqGet * faseq,bool CDSonly,int * rlen,uint * cds_start,uint * cds_end,GMapSegments * seglst,bool cds_open)2612 char* GffObj::getSpliced(GFaSeqGet* faseq, bool CDSonly, int* rlen, uint* cds_start, uint* cds_end,
2613           GMapSegments* seglst, bool cds_open) {
2614 	//cds_open only makes sense when CDSonly is true by overriding CDS 3'end such that the end of
2615 	//the sequence beyond the 3' CDS end is also returned (the 3' UTR is appended to the CDS)
2616   if (CDSonly && CDstart==0) {
2617 	  GMessage("Warning: getSpliced(CDSOnly) requested for transcript with no CDS (%s)!\n", gffID); //should never happen
2618 	  return NULL;
2619   }
2620   if (faseq==NULL) {
2621 	  GMessage("Warning: getSpliced() called with uninitialized GFaSeqGet object!\n"); //should never happen
2622       return NULL;
2623   }
2624   GList<GffExon>* xsegs=&exons;
2625   if (CDSonly && this->cdss!=NULL)
2626 	  xsegs=this->cdss;
2627   if (xsegs->Count()==0) return NULL;
2628   int fspan=end-start+1;
2629   const char* gsubseq=faseq->subseq(start, fspan);
2630   if (gsubseq==NULL) {
2631         GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
2632   }
2633   if (fspan<(int)(end-start+1)) {
2634 	  //special case: stop coordinate was extended past the gseq length, must adjust
2635      int endadj=end-start+1-fspan;
2636      uint prevend=end;
2637      end-=endadj;
2638      if (CDend>end) CDend=end;
2639      if (xsegs->Last()->end>end) {
2640          xsegs->Last()->end=end; //this could be trouble if exon start is also > end
2641          if (xsegs->Last()->start>xsegs->Last()->end) {
2642             GError("GffObj::getSpliced() error: improper genomic coordinate %d on %s for %s\n",
2643                   prevend,getGSeqName(), getID());
2644          }
2645          covlen-=endadj;
2646      }
2647   }
2648   char* spliced=NULL;
2649   GMALLOC(spliced, covlen+1); //IMPORTANT: covlen must be correct here!
2650   uint g_start=0, g_end=0;
2651   int cdsadj=0;
2652   if (CDphase=='1' || CDphase=='2') {
2653       cdsadj=CDphase-'0';
2654   }
2655   uint CDS_start=CDstart;
2656   uint CDS_stop=CDend;
2657   if (cdsadj>0) {
2658      if (strand=='-') CDS_stop-=cdsadj;
2659      else CDS_start+=cdsadj;
2660   }
2661   if (CDSonly) {
2662     g_start=CDS_start;
2663     g_end=CDS_stop;
2664     if (g_end-g_start<3)
2665     	 GMessage("Warning: CDS %d-%d too short for %s, check your data.\n",
2666     			 g_start, g_end, gffID);
2667   } else { //all exon content, not just CDS
2668     g_start=xsegs->First()->start;
2669     g_end=xsegs->Last()->end;
2670     cds_open=false; //override mistaken user request
2671   }
2672   if (seglst!=NULL) seglst->Clear(strand);
2673   int s=0; //resulting nucleotide counter
2674   if (strand=='-') {
2675     if (cds_open) {// appending 3'UTR
2676     	g_start=xsegs->First()->start;
2677     	//CDS_start=g_start;
2678     }
2679     for (int x=xsegs->Count()-1;x>=0;x--) {
2680        uint sgstart=xsegs->Get(x)->start;
2681        uint sgend=xsegs->Get(x)->end;
2682        if (g_end<sgstart || g_start>sgend) continue;
2683        if (g_start>=sgstart && g_start<=sgend)
2684           sgstart=g_start; //3' end within this segment
2685        if (g_end>=sgstart && g_end<=sgend)
2686           sgend=g_end; //5' end within this segment
2687        if (seglst!=NULL)
2688           seglst->add(s+1,s+1+sgend-sgstart,sgend,sgstart);
2689        for (uint i=sgend;i>=sgstart;i--) {
2690          spliced[s] = ntComplement(gsubseq[i-start]);
2691          s++;
2692        }//for each nt
2693        //--update local CDS start-end coordinates
2694        if (cds_start!=NULL && CDS_stop>=sgstart && CDS_stop<=sgend) {
2695          //CDS start in this segment
2696          *cds_start=s-(CDS_stop-sgstart);
2697        }
2698        if (cds_end!=NULL && CDS_start>=sgstart && CDS_start<=sgend) {
2699          //CDS stop in this segment
2700          *cds_end=s-(CDS_start-sgstart);
2701        }
2702       } //for each exon
2703     } // - strand
2704    else { // + strand
2705     if (cds_open) { // appending 3'UTR
2706       	g_end=xsegs->Last()->end;
2707       	//CDS_stop=g_end;
2708     }
2709     for (int x=0;x<xsegs->Count();x++) {
2710       uint sgstart=xsegs->Get(x)->start;
2711       uint sgend=xsegs->Get(x)->end;
2712       if (g_end<sgstart || g_start>sgend) continue;
2713       if (g_start>=sgstart && g_start<=sgend)
2714             sgstart=g_start; //seqstart within this segment
2715       if (g_end>=sgstart && g_end<=sgend)
2716             sgend=g_end; //seqend within this segment
2717       if (seglst!=NULL)
2718           seglst->add(s+1,s+1+sgend-sgstart, sgstart, sgend);
2719       for (uint i=sgstart;i<=sgend;i++) {
2720           spliced[s]=gsubseq[i-start];
2721           s++;
2722       }//for each nt
2723       //--update local CDS start-end coordinates
2724       if (cds_start!=NULL && CDS_start>=sgstart && CDS_start<=sgend) {
2725         //CDS start in this segment
2726         *cds_start=s-(sgend-CDS_start);
2727       }
2728       if (cds_end!=NULL && CDS_stop>=sgstart && CDS_stop<=sgend) {
2729         //CDS stop in this segment
2730         *cds_end=s-(sgend-CDS_stop);
2731       }
2732     } //for each exon
2733   } // + strand
2734   spliced[s]=0;
2735   if (rlen!=NULL) *rlen=s;
2736   return spliced;
2737 }
2738 
printSummary(FILE * fout)2739 void GffObj::printSummary(FILE* fout) {
2740  if (fout==NULL) fout=stdout;
2741  fprintf(fout, "%s\t%c\t%d\t%d\t", gffID,
2742           strand, start, end);
2743  gscore.print(fout);
2744  fprintf(fout, "\n");
2745 }
2746 //TODO we should also have an escapeChars function for some situations
2747 //when we want to write a GFF3 strictly compliant to the dang specification
decodeHexChars(char * dbuf,const char * s,int maxlen)2748 void GffObj::decodeHexChars(char* dbuf, const char* s, int maxlen) {
2749 	int dlen=0;
2750 	dbuf[0]=0;
2751 	if (s==NULL) return;
2752 	for (const char* p=s;(*p)!=0 && dlen<maxlen;++p) {
2753 		if (p[0]=='%' && isxdigit(p[1]) && isxdigit(p[2])) {
2754 			int a=*(++p);
2755 			if (a>'Z') a^=0x20; //toupper()
2756 			if (a>'9') a=10+(a-'A');
2757 			      else a-='0';
2758 			int b=*(++p);
2759 			if (b>'Z') b^=0x20;
2760 			if (b>'9') b=10+(b-'A');
2761 			      else b-='0';
2762 			char c=(char)((a<<4)+b);
2763 			if (c=='%') {
2764 				dbuf[dlen]='p';
2765 				++dlen;
2766 				dbuf[dlen]='r';
2767 				++dlen;
2768 				c='c';
2769 			}
2770 			else if (c==';') c='.';
2771 			else if (c<='\t') c=' ';
2772 			if (c>=' ') {
2773 				dbuf[dlen]=c;
2774 				++dlen;
2775 				continue;
2776 			}
2777 		}
2778 		dbuf[dlen]=*p;
2779 		++dlen;
2780 	}
2781 	dbuf[dlen]=0;
2782 }
2783 
printGTab(FILE * fout,char ** extraAttrs)2784 void GffObj::printGTab(FILE* fout, char** extraAttrs) {
2785 	fprintf(fout, "%s\t%c\t%d\t%d\t%s\t", this->getGSeqName(), this->strand,
2786 			this->start, this->end, this->getID());
2787 	if (exons.Count()) printExonList(fout);
2788 	else fprintf(fout, ".");
2789 	if (extraAttrs!=NULL) {
2790 		//print a list of "attr=value;" pairs here as the last column
2791 		//for requested attributes
2792 		bool t1=true;
2793 		for (int i=0;extraAttrs[i]!=NULL;++i) {
2794 			const char* v=this->getAttr(extraAttrs[i]);
2795 			if (v==NULL) continue;
2796 			if (t1) { fprintf(fout, "\t"); t1=false; }
2797 			fprintf(fout, "%s=%s;", extraAttrs[i], v);
2798 		}
2799 	}
2800 	fprintf(fout,"\n");
2801 }
2802 
printGxfExon(FILE * fout,const char * tlabel,const char * gseqname,bool iscds,GffExon * exon,bool gff3,bool cvtChars,char * dbuf,int dbuf_len)2803 void GffObj::printGxfExon(FILE* fout, const char* tlabel, const char* gseqname, bool iscds,
2804                              GffExon* exon, bool gff3, bool cvtChars,
2805 							 char* dbuf, int dbuf_len) {
2806   //strcpy(dbuf,".");
2807   //if (exon->score>0) sprintf(dbuf,"%.2f", exon->score);
2808   exon->score.sprint(dbuf);
2809   if (exon->phase==0 || !iscds) exon->phase='.';
2810   const char* ftype=iscds ? "CDS" : getSubfName();
2811   const char* attrname=NULL;
2812   const char* attrval=NULL;
2813   if (gff3) {
2814     fprintf(fout,
2815       "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\tParent=%s",
2816       gseqname, tlabel, ftype, exon->start, exon->end, dbuf, strand,
2817 	  exon->phase, gffID);
2818     if (exon->attrs!=NULL) {
2819       for (int i=0;i<exon->attrs->Count();i++) {
2820         if (exon->attrs->Get(i)->cds!=iscds) continue;
2821         attrname=names->attrs.getName(exon->attrs->Get(i)->attr_id);
2822         if (cvtChars) {
2823           decodeHexChars(dbuf, exon->attrs->Get(i)->attr_val, dbuf_len-1);
2824           fprintf(fout,";%s=%s", attrname, dbuf);
2825         } else {
2826           fprintf(fout,";%s=%s", attrname, exon->attrs->Get(i)->attr_val);
2827         }
2828       }
2829     }
2830     fprintf(fout, "\n");
2831     } //GFF3
2832   else {//GTF
2833     fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\ttranscript_id \"%s\";",
2834            gseqname, tlabel, ftype, exon->start, exon->end, dbuf, strand, exon->phase, gffID);
2835     if (geneID)
2836       fprintf(fout," gene_id \"%s\";",geneID);
2837     if (gene_name!=NULL) {
2838        fprintf(fout," gene_name \"%s\";",gene_name);
2839     }
2840     if (exon->attrs!=NULL) {
2841        bool trId=false;
2842        bool gId=false;
2843        for (int i=0;i<exon->attrs->Count();i++) {
2844             if (exon->attrs->Get(i)->attr_val==NULL) continue;
2845             if (exon->attrs->Get(i)->cds!=iscds) continue;
2846             attrname=names->attrs.getName(exon->attrs->Get(i)->attr_id);
2847             if (strcmp(attrname, "transcriptID")==0) {
2848             	if (trId) continue;
2849             	trId=true;
2850             }
2851             if (strcmp(attrname, "transcript_id")==0 && !trId) {
2852             	attrname="transcriptID";
2853             	trId=true;
2854             }
2855             if (strcmp(attrname, "geneID")==0) {
2856             	if (gId) continue;
2857             	gId=true;
2858             }
2859             if (strcmp(attrname, "gene_id")==0 && !gId) {
2860             	attrname="geneID";
2861             	gId=true;
2862             }
2863             if (Gstricmp(attrname, "gene_name")==0 && gene_name!=NULL) {
2864             	continue;
2865             }
2866             fprintf(fout, " %s ",attrname);
2867             if (cvtChars) {
2868               decodeHexChars(dbuf, exon->attrs->Get(i)->attr_val, dbuf_len-1);
2869               attrval=dbuf;
2870             } else {
2871               attrval=exon->attrs->Get(i)->attr_val;
2872             }
2873 
2874             if (attrval[0]=='"') fprintf(fout, "%s;",attrval);
2875                            else fprintf(fout, "\"%s\";",attrval);
2876         }
2877     }
2878     //for GTF, also append the GffObj attributes to each exon line
2879     // - do not do this when the transcript line is also printed!
2880     /*
2881     if (attrs!=NULL) {
2882          for (int i=0;i<attrs->Count();i++) {
2883             if (attrs->Get(i)->attr_val==NULL) continue;
2884             attrname=names->attrs.getName(attrs->Get(i)->attr_id);
2885             fprintf(fout, " %s ",attrname);
2886             if (cvtChars) {
2887               decodeHexChars(dbuf, attrs->Get(i)->attr_val, dbuf_len-1);
2888               attrval=dbuf;
2889             } else {
2890               attrval=attrs->Get(i)->attr_val;
2891             }
2892             if (attrval[0]=='"') fprintf(fout, "%s;",attrval);
2893                            else fprintf(fout, "\"%s\";",attrval);
2894          }
2895     }
2896     */
2897     fprintf(fout, "\n");
2898  }//GTF
2899 }
2900 
printGxf(FILE * fout,GffPrintMode gffp,const char * tlabel,const char * gfparent,bool cvtChars)2901 void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
2902                    const char* tlabel, const char* gfparent, bool cvtChars) {
2903  const int DBUF_LEN=1024; //there should not be attribute values longer than 1K!
2904  char dbuf[DBUF_LEN];
2905  if (tlabel==NULL) {
2906     tlabel=track_id>=0 ? names->tracks.Get(track_id)->name :
2907          (char*)"gffobj" ;
2908     }
2909  if (gffp==pgffBED) {
2910 	 printBED(fout, cvtChars, dbuf, DBUF_LEN);
2911 	 return;
2912  }
2913  const char* gseqname=names->gseqs.Get(gseq_id)->name;
2914  bool gff3 = (gffp>=pgffAny && gffp<=pgffTLF);
2915  bool showCDS = (gffp==pgtfAny || gffp==pgtfCDS || gffp==pgffCDS || gffp==pgffAny || gffp==pgffBoth);
2916  bool showExon = (gffp<=pgtfExon || gffp==pgffAny || gffp==pgffExon || gffp==pgffBoth);
2917  //if (gscore>0.0) sprintf(dbuf,"%.2f", gscore);
2918  //       else strcpy(dbuf,".");
2919  gscore.sprint(dbuf);
2920  if (gffp<=pgtfCDS && gffp>=pgtfAny) { //GTF output
2921 	   fprintf(fout,
2922 	     "%s\t%s\ttranscript\t%d\t%d\t%s\t%c\t.\ttranscript_id \"%s\"",
2923 	     gseqname, tlabel, start, end, dbuf, strand, gffID);
2924 	   char* gid=NULL;
2925 	   if (geneID!=NULL) {
2926 	      gid=geneID;
2927 	   }
2928 	   else {
2929 		   gid=getAttr("gene_id");
2930 		   if (gid==NULL)
2931 			   gid=gffID; //last resort, write gid the same with gffID
2932 	   }
2933 	   if (gid!=NULL) fprintf(fout, "; gene_id \"%s\"",gid);
2934 	   if (gene_name!=NULL && getAttr("gene_name")==NULL && getAttr("GENE_NAME")==NULL)
2935 	      fprintf(fout, "; gene_name \"%s\"",gene_name);
2936 	   if (attrs!=NULL) {
2937 		    bool trId=false;
2938 		    //bool gId=false;
2939 		    for (int i=0;i<attrs->Count();i++) {
2940 		      const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
2941 		      const char* attrval=attrs->Get(i)->attr_val;
2942 		      if (attrval==NULL || attrval[0]=='\0') continue;
2943 		      if (strcmp(attrname, "transcriptID")==0) {
2944 	            	if (trId) continue;
2945 	            	trId=true;
2946 		      }
2947 		      if (strcmp(attrname, "transcript_id")==0 && !trId) {
2948 	            	attrname="transcriptID";
2949 	            	trId=true;
2950 		      }
2951 		      if (Gstrcmp(attrname, "geneID")==0 && gid!=NULL &&
2952 	            		strcmp(attrval, gid)==0) continue;
2953 		      if (strcmp(attrname, "gene_id")==0) continue;
2954 		      if (cvtChars) {
2955 		    	  decodeHexChars(dbuf, attrval, DBUF_LEN-1);
2956 		    	  fprintf(fout,"; %s \"%s\"", attrname, dbuf);
2957 		      }
2958 		      else
2959 		    	 fprintf(fout,"; %s \"%s\"", attrname, attrs->Get(i)->attr_val);
2960 		    }
2961 	   }
2962 	   fprintf(fout,";\n");
2963  }
2964  else if (gff3) {
2965    //print GFF3 transcript line:
2966    uint pstart, pend;
2967    if (gffp==pgffCDS) {
2968       pstart=CDstart;
2969       pend=CDend;
2970    }
2971    else { pstart=start;pend=end; }
2972    //const char* ftype=isTranscript() ? "mRNA" : getFeatureName();
2973    const char* ftype=getFeatureName();
2974    fprintf(fout,
2975      "%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tID=%s",
2976      gseqname, tlabel, ftype, pstart, pend, dbuf, strand, gffID);
2977    bool parentPrint=false;
2978    if (gfparent!=NULL && gffp!=pgffTLF) {
2979       //parent override - also prevents printing gene_name and gene_id
2980       fprintf(fout, ";Parent=%s",gfparent);
2981       parentPrint=true;
2982    }
2983    else if (parent!=NULL && !parent->isDiscarded() && gffp!=pgffTLF) {
2984            fprintf(fout, ";Parent=%s",parent->getID());
2985            if (parent->isGene()) parentPrint=true;
2986    }
2987    if (gffp==pgffTLF) {
2988 	   fprintf(fout, ";exonCount=%d",exons.Count());
2989 	   if (exons.Count()>0)
2990 		   fprintf(fout, ";exons=%d-%d", exons[0]->start, exons[0]->end);
2991 	   for (int i=1;i<exons.Count();++i) {
2992 		   fprintf(fout, ",%d-%d",exons[i]->start, exons[i]->end);
2993 	   }
2994    }
2995    if (CDstart>0 && (gffp==pgffTLF || !showCDS)) {
2996 	   if (cdss==NULL) fprintf(fout,";CDS=%d:%d",CDstart,CDend);
2997 	   else {
2998 		   fprintf(fout, ";CDS=");
2999 		   for (int i=0;i<cdss->Count();++i) {
3000 			   if (i>0) fprintf(fout, ",");
3001 			   fprintf(fout, "%d-%d", (*cdss)[i]->start, (*cdss)[i]->end);
3002 		   }
3003 	   }
3004    }
3005    if (CDphase>0 && (gffp==pgffTLF || !showCDS)) fprintf(fout,";CDSphase=%c", CDphase);
3006    char* g_id=NULL;
3007    if (geneID!=NULL && !parentPrint && getAttr("geneID")==NULL &&
3008 		   ((g_id=getAttr("gene_id"))==NULL || strcmp(g_id, geneID)!=0))
3009       fprintf(fout, ";geneID=%s",geneID);
3010    if (gene_name!=NULL && !parentPrint && getAttr("gene_name")==NULL && getAttr("GENE_NAME")==NULL)
3011       fprintf(fout, ";gene_name=%s",gene_name);
3012    if (attrs!=NULL) {
3013 	    for (int i=0;i<attrs->Count();i++) {
3014 	      const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
3015 	      const char* attrval=attrs->Get(i)->attr_val;
3016 	      if (attrval==NULL || attrval[0]=='\0') continue;
3017 	    	  //fprintf(fout,";%s",attrname);
3018 	      if (cvtChars) {
3019 	    	  decodeHexChars(dbuf, attrval, DBUF_LEN-1);
3020 	    	  fprintf(fout,";%s=%s", attrname, dbuf);
3021 	      }
3022 	      else
3023 	    	 fprintf(fout,";%s=%s", attrname, attrs->Get(i)->attr_val);
3024 	    }
3025    }
3026    fprintf(fout,"\n");
3027  }// gff3 transcript line
3028  if (gffp==pgffTLF) return;
3029  bool is_cds_only = (gffp==pgffBoth) ? false : isCDSOnly();
3030  if (showExon) {
3031     //print exons
3032     for (int i=0;i<exons.Count();i++) {
3033       printGxfExon(fout, tlabel, gseqname, is_cds_only, exons[i], gff3, cvtChars, dbuf, DBUF_LEN);
3034     }
3035  }//printing exons
3036  if (showCDS && !is_cds_only && CDstart>0) {
3037 	GVec<GffExon> cds;
3038 	getCDSegs(cds); //also uses/prepares the CDS phase for each CDS segment
3039 	for (int i=0;i<cds.Count();i++) {
3040 		printGxfExon(fout, tlabel, gseqname, true, &(cds[i]), gff3, cvtChars, dbuf, DBUF_LEN);
3041 	}
3042   } //showCDS
3043 }
3044 
updateCDSPhase(GList<GffExon> & segs)3045 void GffObj::updateCDSPhase(GList<GffExon>& segs) {
3046   int cdsacc=0;
3047   if (CDphase=='1' || CDphase=='2') {
3048       cdsacc+= 3-(CDphase-'0');
3049   }
3050   else CDphase='0';
3051   if (strand=='-') { //reverse strand
3052      for (int i=segs.Count()-1;i>=0;i--) {
3053          segs[i]->phase='0'+ (3-cdsacc%3)%3;
3054          cdsacc+=segs[i]->end-segs[i]->start+1;
3055      }
3056   }
3057     else { //forward strand
3058      for (int i=0;i<segs.Count();i++) {
3059          segs[i]->phase='0'+ (3-cdsacc%3)%3;
3060          cdsacc+=segs[i]->end-segs[i]->start+1;
3061      }
3062   }
3063 }
3064 
getCDSegs(GVec<GffExon> & cds)3065 void GffObj::getCDSegs(GVec<GffExon>& cds) {
3066   //like updateCDSPhase() above, also updates phase for each segment
3067   GffExon cdseg(true);
3068   cds.Clear();
3069   if (cdss!=NULL) {
3070 	//copy directly from cdss list
3071 	for (int i=0;i<cdss->Count();i++) {
3072 		cdseg=(*cdss->Get(i));
3073 		cdseg.sharedAttrs=true;
3074 		cds.Add(cdseg);
3075 	}
3076     return;
3077   }
3078   int cdsacc=0;
3079   if (CDphase=='1' || CDphase=='2') {
3080       cdsacc+= 3-(CDphase-'0');
3081   }
3082   if (strand=='-') {
3083      for (int x=exons.Count()-1;x>=0;x--) {
3084         uint sgstart=exons[x]->start;
3085         uint sgend=exons[x]->end;
3086         if (CDend<sgstart || CDstart>sgend) continue;
3087         if (CDstart>=sgstart && CDstart<=sgend)
3088               sgstart=CDstart; //cdstart within this segment
3089         if (CDend>=sgstart && CDend<=sgend)
3090               sgend=CDend; //cdend within this segment
3091         cdseg.start=sgstart;
3092         cdseg.end=sgend;
3093         //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0);
3094         cdseg.phase='0'+ (3-cdsacc%3)%3;
3095         cdsacc+=sgend-sgstart+1;
3096         cdseg.attrs=exons[x]->attrs;
3097         cdseg.sharedAttrs=true;
3098         cds.Add(cdseg);
3099        } //for each exon
3100      cds.Reverse();
3101      } // - strand
3102     else { // + strand
3103      for (int x=0;x<exons.Count();x++) {
3104        uint sgstart=exons[x]->start;
3105        uint sgend=exons[x]->end;
3106        if (CDend<sgstart || CDstart>sgend) continue;
3107        if (CDstart>=sgstart && CDstart<=sgend)
3108              sgstart=CDstart; //seqstart within this segment
3109        if (CDend>=sgstart && CDend<=sgend)
3110              sgend=CDend; //seqend within this segment
3111        cdseg.start=sgstart;
3112        cdseg.end=sgend;
3113        //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0);
3114        cdseg.phase='0' + (3-cdsacc%3)%3 ;
3115        cdsacc+=sgend-sgstart+1;
3116        cdseg.attrs=exons[x]->attrs;
3117        cdseg.sharedAttrs=true;
3118        cds.Add(cdseg);
3119        } //for each exon
3120    } // + strand
3121 
3122 }
3123 
3124 //-- transcript match/overlap classification functions
3125 
3126 
transcriptMatch(GffObj & a,GffObj & b,int & ovlen)3127 char transcriptMatch(GffObj& a, GffObj& b, int& ovlen) {
3128 	//return '=' if exact exon match, '~' if intron-chain match (or 80% overlap for single-exon)
3129 	// or 0 otherwise
3130 	int imax=a.exons.Count()-1;
3131 	int jmax=b.exons.Count()-1;
3132 	ovlen=0;
3133 	if (imax!=jmax) return false; //different number of exons, cannot match
3134 	if (imax==0) //single-exon mRNAs
3135 	    return (singleExonTMatch(a,b,ovlen));
3136 	if ( a.exons[imax]->start<b.exons[0]->end ||
3137 		b.exons[jmax]->start<a.exons[0]->end )
3138 		return 0; //intron chains do not overlap at all
3139 	//check intron overlaps
3140 	ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1;
3141 	ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start;
3142 	for (int i=1;i<=imax;i++) {
3143 		if (i<imax) ovlen+=a.exons[i]->len();
3144 		if ((a.exons[i-1]->end!=b.exons[i-1]->end) ||
3145 			(a.exons[i]->start!=b.exons[i]->start)) {
3146 			return 0; //intron mismatch
3147 		}
3148 	}
3149 	//--- full intron chain match:
3150 	if (a.exons[0]->start==b.exons[0]->start &&
3151 		a.exons.Last()->end==b.exons.Last()->end)
3152 		   return '=';
3153 	return '~';
3154 }
3155 
3156 
singleExonTMatch(GffObj & m,GffObj & r,int & ovlen)3157 char singleExonTMatch(GffObj& m, GffObj& r, int& ovlen) {
3158  //return '=' if exact match, '~' if the overlap is >=80% of the longer sequence length
3159  // return 0 if there is no overlap
3160  GSeg mseg(m.start, m.end);
3161  ovlen=mseg.overlapLen(r.start,r.end);
3162  if (ovlen<=0) return 0;
3163  // fuzzy matching for single-exon transcripts:
3164  // matching = overlap is at least 80% of the length of the longer transcript
3165  // *OR* in case of reverse containment (reference contained in m)
3166  //   it's also considered "matching" if the overlap is at least 80% of
3167  //   the reference len AND at least 70% of the query len
3168  if (m.start==r.start && m.end==r.end) return '=';
3169  if (m.covlen>r.covlen) {
3170    if ( (ovlen >= m.covlen*0.8) ||
3171 		   (ovlen >= r.covlen*0.8 && ovlen >= m.covlen* 0.7 ) )
3172 		   //allow also some fuzzy reverse containment
3173            return '~';
3174  } else {
3175    if (ovlen >= r.covlen*0.8) return '~';
3176  }
3177  return 0;
3178 }
3179 
3180 //formerly in gffcompare
getOvlCode(GffObj & m,GffObj & r,int & ovlen,bool strictMatch)3181 char getOvlCode(GffObj& m, GffObj& r, int& ovlen, bool strictMatch) {
3182 	ovlen=0; //total actual exonic overlap
3183 	if (!m.overlap(r.start,r.end)) return 0;
3184 	int jmax=r.exons.Count()-1;
3185 	//int iovlen=0; //total m.exons overlap with ref introns
3186 	char rcode=0;
3187 	if (m.exons.Count()==1) { //single-exon transfrag
3188 		GSeg mseg(m.start, m.end);
3189 		if (jmax==0) { //also single-exon ref
3190 			//ovlen=mseg.overlapLen(r.start,r.end);
3191 			char eqcode=0;
3192 			if ((eqcode=singleExonTMatch(m, r, ovlen))>0) {
3193 				if (strictMatch) return eqcode;
3194 				            else return '=';
3195 			}
3196 			if (m.covlen<r.covlen)
3197 			   { if (ovlen >= m.covlen*0.8) return 'c'; } // fuzzy containment
3198 			else
3199 				if (ovlen >= r.covlen*0.8 ) return 'k';   // fuzzy reverse containment
3200 			return 'o'; //just plain overlapping
3201 		}
3202 		//-- single-exon qry overlaping multi-exon ref
3203 		//check full pre-mRNA case (all introns retained): code 'm'
3204 
3205 		if (m.start<=r.exons[0]->end && m.end>=r.exons[jmax]->start)
3206 			return 'm';
3207 
3208 		for (int j=0;j<=jmax;j++) {
3209 			//check if it's ~contained by an exon
3210 			int exovlen=mseg.overlapLen(r.exons[j]);
3211 			if (exovlen>0) {
3212 				ovlen+=exovlen;
3213 				if (m.start>r.exons[j]->start-4 && m.end<r.exons[j]->end+4) {
3214 					return 'c'; //close enough to be considered contained in this exon
3215 				}
3216 			}
3217 			if (j==jmax) break; //last exon here, no intron to check
3218 			//check if it fully covers an intron (retained intron)
3219 			if (m.start<r.exons[j]->end && m.end>r.exons[j+1]->start)
3220 				return 'n';
3221 			//check if it's fully contained by an intron
3222 			if (m.end<r.exons[j+1]->start && m.start>r.exons[j]->end)
3223 				return 'i';
3224 			// check if it's a potential pre-mRNA transcript
3225 			// (if overlaps this intron at least 10 bases)
3226 			uint introvl=mseg.overlapLen(r.exons[j]->end+1, r.exons[j+1]->start-1);
3227 			//iovlen+=introvl;
3228 			if (introvl>=10 && mseg.len()>introvl+10) { rcode='e'; }
3229 		} //for each ref exon
3230 		if (rcode>0) return rcode;
3231 		return 'o'; //plain overlap, uncategorized
3232 	} //single-exon transfrag
3233 	//-- multi-exon transfrag --
3234 	int imax=m.exons.Count()-1;// imax>0 here
3235 	if (jmax==0) { //single-exon reference overlap
3236 		//any exon overlap?
3237 		GSeg rseg(r.start, r.end);
3238 		for (int i=0;i<=imax;i++) {
3239 			//check if it's ~contained by an exon
3240 			int exovlen=rseg.overlapLen(m.exons[i]);
3241 			if (exovlen>0) {
3242 				ovlen+=exovlen;
3243 				if (r.start>m.exons[i]->start-4 && r.end<m.exons[i]->end+4) {
3244 					return 'k'; //reference contained in this assembled exon
3245 				}
3246 			}
3247 			if (i==imax) break;
3248 			if (r.end<m.exons[i+1]->start && r.start>m.exons[i]->end)
3249 				return 'y'; //ref contained in this transfrag intron
3250 		}
3251 		return 'o';
3252 	}
3253 	// * check if transfrag contained by a ref intron
3254 	for (int j=0;j<jmax;j++) {
3255 		if (m.end<r.exons[j+1]->start && m.start>r.exons[j]->end)
3256 			return 'i';
3257 	}
3258 	if (m.exons[imax]->start<r.exons[0]->end) {
3259 		//qry intron chain ends before ref intron chain starts
3260 		//check if last qry exon plugs the 1st ref intron
3261 		if (m.exons[imax]->start<=r.exons[0]->end &&
3262 			m.exons[imax]->end>=r.exons[1]->start) return 'n';
3263 		return 'o'; //only terminal exons overlap
3264 	}
3265 	else if (r.exons[jmax]->start<m.exons[0]->end) {
3266 		//qry intron chain starts after ref intron chain ends
3267 		//check if first qry exon plugs the last ref intron
3268 		if (m.exons[0]->start<=r.exons[jmax-1]->end &&
3269 			m.exons[0]->end>=r.exons[jmax]->start) return 'n';
3270 		return 'o'; //only terminal exons overlap
3271 	}
3272 	//check intron chain overlap (match, containment, intron retention etc.)
3273 	int i=1; //index of exon to the right of current qry intron
3274 	int j=1; //index of exon to the right of current ref intron
3275 	bool intron_conflict=false; //used for checking for retained introns
3276 	//from here on we check all qry introns against ref introns
3277 	bool junct_match=false; //true if at least a junction match is found
3278 	bool ichain_match=true; //if there is intron (sub-)chain match, to be updated by any mismatch
3279 	bool intron_ovl=false; //if any intron overlap is found
3280 	bool intron_retention=false; //if any ref intron is covered by a qry exon
3281 	int imfirst=0; //index of first intron match in query (valid>0)
3282 	int jmfirst=0; //index of first intron match in reference (valid>0)
3283 	int imlast=0;  //index of first intron match in query
3284 	int jmlast=0;  //index of first intron match in reference
3285 	//check for intron matches
3286 	while (i<=imax && j<=jmax) {
3287 		uint mstart=m.exons[i-1]->end;
3288 		uint mend=m.exons[i]->start;
3289 		uint rstart=r.exons[j-1]->end;
3290 		uint rend=r.exons[j]->start;
3291 		if (rend<mstart) { //qry intron starts after ref intron ends
3292 			if (!intron_conflict && r.exons[j]->overlap(mstart+1, mend-1))
3293 				intron_conflict=true;
3294 			if (!intron_retention && rstart>=m.exons[i-1]->start)
3295 				intron_retention=true;
3296 			if (intron_ovl) ichain_match=false;
3297 			j++;
3298 			continue;
3299 		} //no intron overlap, skipping ref intron
3300 		if (rstart>mend) { //qry intron ends before ref intron starts
3301 			//if qry intron overlaps the exon on the left, we have an intron conflict
3302 			if (!intron_conflict && r.exons[j-1]->overlap(mstart+1, mend-1))
3303 				intron_conflict=true;
3304 			if (!intron_retention && rend<=m.exons[i]->end)
3305 				intron_retention=true;
3306 			if (intron_ovl) ichain_match=false;
3307 			i++;
3308 			continue;
3309 		} //no intron overlap, skipping qry intron
3310 		intron_ovl=true;
3311 		//overlapping introns, test junction matching
3312 		bool smatch=(mstart==rstart);
3313 		bool ematch=(mend==rend);
3314 		if (smatch || ematch) junct_match=true;
3315 		if (smatch && ematch) {
3316 			//perfect match for this intron
3317 			if (ichain_match) { //chain matching still possible
3318 			  if (jmfirst==0) jmfirst=j;
3319 			  if (imfirst==0) imfirst=i;
3320 			  imlast=i;
3321 			  jmlast=j;
3322 			}
3323 			i++; j++;
3324 			continue;
3325 		}
3326 		//intron overlapping but with at least a junction mismatch
3327 		intron_conflict=true;
3328 		ichain_match=false;
3329 		if (mend>rend) j++; else i++;
3330 	} //while checking intron overlaps
3331 	if (ichain_match) { //intron sub-chain match
3332 		if (imfirst==1 && imlast==imax) { // qry full intron chain match
3333 			if (jmfirst==1 && jmlast==jmax) {//identical intron chains
3334 				if (strictMatch) return (r.exons[0]->start==m.exons[0]->start &&
3335 						              r.exons.Last()->end && m.exons.Last()->end) ? '=' : '~';
3336 				else return '=';
3337 			}
3338 			// -- qry intron chain is shorter than ref intron chain --
3339 			int l_iovh=0;   // overhang of leftmost q exon left boundary beyond the end of ref intron to the left
3340 			int r_iovh=0;   // same type of overhang through the ref intron on the right
3341 			if (jmfirst>1 && r.exons[jmfirst-1]->start>m.start)
3342 				l_iovh = r.exons[jmfirst-1]->start - m.start;
3343 			if (jmlast<jmax && m.end > r.exons[jmlast]->end)
3344 				r_iovh = m.end - r.exons[jmlast]->end;
3345 			if (l_iovh<4 && r_iovh<4) return 'c';
3346 		} else if ((jmfirst==1 && jmlast==jmax)) {//ref full intron chain match
3347 			//check if the reference i-chain is contained in qry i-chain
3348 			int l_jovh=0;   // overhang of leftmost q exon left boundary beyond the end of ref intron to the left
3349 			int r_jovh=0;   // same type of overhang through the ref intron on the right
3350 			if (imfirst>1 && m.exons[imfirst-1]->start>r.start)
3351 				l_jovh = m.exons[imfirst-1]->start - r.start;
3352 			if (imlast<imax && r.end > m.exons[imlast]->end)
3353 				r_jovh = r.end - m.exons[imlast]->end;
3354 			if (l_jovh<4 && r_jovh<4) return 'k'; //reverse containment
3355 		}
3356 	}
3357 	//'=', 'c' and 'k' were checked and assigned, check for 'm' and 'n' before falling back to 'j'
3358 	if (!intron_conflict && (m.start<=r.exons[0]->end && m.end>=r.exons[jmax]->start)) {
3359 			return 'm';
3360 	}
3361 	if (intron_retention) return 'n';
3362 	if (junct_match) return 'j';
3363 	//we could have 'o' or 'y' here
3364 	//any real exon overlaps?
3365 	ovlen=m.exonOverlapLen(r);
3366 	if (ovlen>4) return 'o';
3367 	return 'y'; //all reference exons are within transfrag introns!
3368 }
3369