1 #include "gff.h"
2
3 GffNames* GffObj::names=NULL;
4 //global set of feature names, attribute names etc.
5 // -- common for all GffObjs in current application!
6
7 const uint GFF_MAX_LOCUS = 7000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M
8 const uint GFF_MAX_EXON = 30000; //longest known exon in human is ~11K
9 const uint GFF_MAX_INTRON= 6000000; //Ensembl shows a >5MB mouse intron
10 const int GFF_MIN_INTRON = 4; //for mergeCloseExons option
11 //bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings()
12 int gff_fid_mRNA=0; //mRNA (has CDS)
13 int gff_fid_transcript=1; // generic "transcript" feature
14 int gff_fid_exon=2; // generic "exon"-like feature (exon,CDS,UTR,start/stop codon)
15 int gff_fid_CDS=3; // CDS feature (CDS, start/stop codon)
16 const char* exonTypes[]={ "None", "StartCodon", "StopCodon",
17 "CDS", "UTR", "CDS+UTR", "exon" };
18
19 const GffScore GFFSCORE_NONE;
20 //const uint gfo_flag_LEVEL_MSK = 0x00FF0000;
21 //const byte gfo_flagShift_LEVEL = 16;
22
gffnames_ref(GffNames * & n)23 void gffnames_ref(GffNames* &n) {
24 if (n==NULL) n=new GffNames();
25 n->numrefs++;
26 }
27
gffnames_unref(GffNames * & n)28 void gffnames_unref(GffNames* &n) {
29 if (n==NULL) GError("Error: attempt to remove reference to null GffNames object!\n");
30 n->numrefs--;
31 if (n->numrefs==0) { delete n; n=NULL; }
32 }
33
34 const int CLASSCODE_OVL_RANK = 15;
classcode_rank(char c)35 int classcode_rank(char c) {
36 switch (c) {
37 case '=': return 0; //intron chain match or full exon chain match if strict matching is enabled
38 case '~': return 1; //intron chain match when strict matching is enabled
39 case 'c': return 2; //containment, perfect partial match (transfrag < reference)
40 case 'k': return 6; // reverse containment (reference < transfrag)
41 case 'm': return 6; // full span overlap with all reference introns either matching or retained
42 case 'n': return 6; // partial overlap transfrag with at least one intron retention
43 case 'j': return 6; // multi-exon transfrag with at least one junction match
44 case 'e': return 12; // single exon transfrag partially overlapping an intron of reference (possible pre-mRNA fragment)
45 case 'o': return 14; // other generic exon overlap
46 //**** >15 = no-overlaps (not on the same strand) from here on *****
47 case 's': return 16; //"shadow" - an intron overlaps with a ref intron on the opposite strand (wrong strand mapping?)
48 case 'x': return 18; // generic overlap on opposite strand (usually wrong strand mapping)
49 case 'i': return 20; // intra-intron (transfrag fully contained within a reference intron)
50 case 'y': return 30; // no exon overlap: ref exons fall within transfrag introns!
51 case 'p': return 90; //polymerase run
52 case 'r': return 92; //repeats
53 case 'u': return 94; //intergenic
54 case 0 : return 100;
55 default: return 96;
56 }
57 }
58
strExonType(char xtype)59 const char* strExonType(char xtype) {
60 static const char* extbl[7]={"None", "start_codon", "stop_codon", "CDS", "UTR", "CDS_UTR", "exon"};
61 if (xtype>0 && xtype<7)
62 return extbl[(int)xtype];
63 else return "NULL";
64 }
65
gfo_cmpByLoc(const pointer p1,const pointer p2)66 int gfo_cmpByLoc(const pointer p1, const pointer p2) {
67 GffObj& g1=*((GffObj*)p1);
68 GffObj& g2=*((GffObj*)p2);
69 if (g1.gseq_id==g2.gseq_id) {
70 if (g1.start!=g2.start)
71 return (int)(g1.start-g2.start);
72 else if (g1.getLevel()!=g2.getLevel())
73 return (int)(g1.getLevel()-g2.getLevel());
74 else
75 if (g1.end!=g2.end)
76 return (int)(g1.end-g2.end);
77 else return strcmp(g1.getID(), g2.getID());
78 }
79 else //return (int)(g1.gseq_id-g2.gseq_id); // input order !
80 return strcmp(g1.getGSeqName(), g2.getGSeqName()); //lexicographic !
81 }
82
83 //comparator for ordering by reference sequence (chromosome) index
gfo_cmpRefByID(const pointer p1,const pointer p2)84 int gfo_cmpRefByID(const pointer p1, const pointer p2) {
85 GffObj& g1=*((GffObj*)p1);
86 GffObj& g2=*((GffObj*)p2);
87 if (g1.gseq_id==g2.gseq_id) {
88 if (g1.start!=g2.start)
89 return (int)(g1.start-g2.start);
90 else if (g1.getLevel()!=g2.getLevel())
91 return (int)(g1.getLevel()-g2.getLevel());
92 else
93 if (g1.end!=g2.end)
94 return (int)(g1.end-g2.end);
95 else return strcmp(g1.getID(), g2.getID());
96 }
97 else return (g1.gseq_id-g2.gseq_id); // sort refs by their id# order
98 }
99
extractGFFAttr(char * & infostr,const char * oline,const char * attr,bool caseStrict,bool enforce_GTF2,int * rlen,bool deleteAttr)100 char* GffLine::extractGFFAttr(char* & infostr, const char* oline, const char* attr, bool caseStrict,
101 bool enforce_GTF2, int* rlen, bool deleteAttr) {
102 //parse a key attribute and remove it from the info string
103 //(only works for attributes that have values following them after ' ' or '=')
104 static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required for GTF) at line:\n%s\n";
105 int attrlen=strlen(attr);
106 char cend=attr[attrlen-1];
107 //char* pos = (caseStrict) ? strstr(info, attr) : strifind(info, attr);
108 //must make sure attr is not found in quoted text
109 char* pos=infostr;
110 char prevch=0;
111 bool in_str=false;
112 bool notfound=true;
113 int (*strcmpfn)(const char*, const char*, int) = caseStrict ? Gstrcmp : Gstricmp;
114 while (notfound && *pos) {
115 char ch=*pos;
116 if (ch=='"') {
117 in_str=!in_str;
118 pos++;
119 prevch=ch;
120 continue;
121 }
122 if (!in_str && (prevch==0 || prevch==' ' || prevch == ';')
123 && strcmpfn(attr, pos, attrlen)==0) {
124 //attr match found
125 //check for word boundary on right
126 char* epos=pos+attrlen;
127 if (cend=='=' || cend==' ' || *epos==0 || *epos==' ') {
128 notfound=false;
129 break;
130 }
131 //not a perfect match, move on
132 pos=epos;
133 prevch=*(pos-1);
134 continue;
135 }
136 //not a match or in_str
137 prevch=ch;
138 pos++;
139 }
140 if (notfound) return NULL;
141 char* vp=pos+attrlen;
142 while (*vp==' ') vp++;
143 if (*vp==';' || *vp==0) {
144 GMessage("Warning: cannot parse value of GFF attribute \"%s\" at line:\n%s\n", attr, oline);
145 return NULL;
146 }
147 bool dq_enclosed=false; //value string enclosed by double quotes
148 if (*vp=='"') {
149 dq_enclosed=true;
150 vp++;
151 }
152 if (enforce_GTF2 && !dq_enclosed)
153 GError(GTF2_ERR, attr, oline);
154 char* vend=vp;
155 if (dq_enclosed) {
156 while (*vend!='"' && *vend!=';' && *vend!=0) vend++;
157 }
158 else {
159 while (*vend!=';' && *vend!=0) vend++;
160 }
161 if (enforce_GTF2 && *vend!='"')
162 GError(GTF2_ERR, attr, oline);
163 char *r=Gstrdup(vp, vend-1);
164 if (rlen) *rlen = vend-vp;
165 if (deleteAttr) {//-- remove this attribute from infostr
166 while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++;
167 if (*vend==0) vend--;
168 for (char *src=vend, *dest=pos;;src++,dest++) {
169 *dest=*src; //shift the rest of infostr (copy over)
170 if (*src==0) break;
171 }
172 }
173 return r;
174 }
BEDLine(GffReader * reader,const char * l)175 BEDLine::BEDLine(GffReader* reader, const char* l): skip(true), dupline(NULL), line(NULL), llen(0),
176 gseqname(NULL), fstart(0), fend(0), strand(0), ID(NULL), info(NULL),
177 cds_start(0), cds_end(0), cds_phase(0), exons(1) {
178 if (reader==NULL || l==NULL) return;
179 llen=strlen(l);
180 GMALLOC(line,llen+1);
181 memcpy(line, l, llen+1);
182 GMALLOC(dupline, llen+1);
183 memcpy(dupline, l, llen+1);
184 char* t[14];
185 int i=0;
186 int tidx=1;
187 t[0]=line;
188 if (startsWith(line, "browser ") || startsWith(line, "track "))
189 return;
190 while (line[i]!=0) {
191 if (line[i]=='\t') {
192 line[i]=0;
193 t[tidx]=line+i+1;
194 tidx++;
195 //our custom BED-13+ format, with GFF3 attributes in 13th column
196 if (tidx>12) { info=t[12]; break; }
197 }
198 i++;
199 }
200 /* if (tidx<6) { // require BED-6+ lines
201 GMessage("Warning: 6+ BED columns expected, instead found:\n%s\n", l);
202 return;
203 }
204 */
205 gseqname=t[0];
206 char* p=t[1];
207 if (!parseUInt(p,fstart)) {
208 GMessage("Warning: invalid BED start coordinate at line:\n%s\n",l);
209 return;
210 }
211 ++fstart; //BED start is 0 based
212 p=t[2];
213 if (!parseUInt(p,fend)) {
214 GMessage("Warning: invalid BED end coordinate at line:\n%s\n",l);
215 return;
216 }
217 if (fend<fstart) Gswap(fend,fstart); //make sure fstart<=fend, always
218 if (tidx>5) {
219 strand=*t[5];
220 if (strand!='-' && strand !='.' && strand !='+') {
221 GMessage("Warning: unrecognized BED strand at line:\n%s\n",l);
222 return;
223 }
224 }
225 else strand='.';
226 //if (tidx>12) ID=t[12];
227 // else ID=t[3];
228 ID=t[3];
229 //now parse the exons, if any
230 if (tidx>11) {
231 int numexons=0;
232 p=t[9];
233 if (!parseInt(p, numexons) || numexons<=0) {
234 GMessage("Warning: invalid BED block count at line:\n%s\n",l);
235 return;
236 }
237 char** blen;
238 char** bstart;
239 GMALLOC(blen, numexons * sizeof(char*));
240 GMALLOC(bstart, numexons * sizeof(char*));
241 i=0;
242 int b=1;
243 blen[0]=t[10];
244 while (t[10][i]!=0 && b<=numexons) {
245 if (t[10][i]==',') {
246 t[10][i]=0;
247 if (b<numexons)
248 blen[b]=t[10]+i+1;
249 b++;
250 }
251 i++;
252 }
253 b=1;i=0;
254 bstart[0]=t[11];
255 while (t[11][i]!=0 && b<=numexons) {
256 if (t[11][i]==',') {
257 t[11][i]=0;
258 if (b<numexons)
259 bstart[b]=t[11]+i+1;
260 b++;
261 }
262 i++;
263 }
264 GSeg ex;
265 for (i=0;i<numexons;++i) {
266 int exonlen;
267 if (!strToInt(blen[i], exonlen) || exonlen<=0) {
268 GMessage("Warning: invalid BED block size %s at line:\n%s\n",blen[i], l);
269 return;
270 }
271 int exonstart;
272 if (!strToInt(bstart[i], exonstart) || exonstart<0) {
273 GMessage("Warning: invalid BED block start %s at line:\n%s\n",bstart[i], l);
274 return;
275 }
276 if (i==0 && exonstart!=0) {
277 GMessage("Warning: first BED block start is %d>0 at line:\n%s\n",exonstart, l);
278 return;
279 }
280 exonstart+=fstart;
281 uint exonend=exonstart+exonlen-1;
282 if ((uint)exonstart>fend || exonend>fend) {
283 GMessage("Warning: BED exon %d-%d is outside record boundary at line:\n%s\n",exonstart,exonend, l);
284 return;
285 }
286 ex.start=exonstart;ex.end=exonend;
287 exons.Add(ex);
288 }
289 GFREE(blen);
290 GFREE(bstart);
291 }
292 else { //take it as single-exon transcript
293 GSeg v(fstart, fend);
294 exons.Add(v);
295 }
296 if (info!=NULL) {
297 char* cdstr=GffLine::extractGFFAttr(info, dupline, "CDS=");
298 if (cdstr) {
299 char* p=strchr(cdstr, ':');
300 if (p!=NULL) {
301 *p='\0'; ++p;
302 }
303 if (strToUInt(cdstr, cds_start) && cds_start>=fstart-1) {
304 ++cds_start;
305 if (!strToUInt(p, cds_end) || cds_end>fend) {
306 GMessage("Warning: invalid CDS (%d-%d) discarded for line:\n%s\n",
307 cds_start, cds_end, dupline);
308 cds_start=0;
309 cds_end=0; //invalid CDS coordinates
310 }
311 }
312 char* cdstr_phase=NULL;
313 if (cds_start>0 && (cdstr_phase=GffLine::extractGFFAttr(info, dupline, "CDSphase="))!=NULL) {
314 cds_phase=cdstr_phase[0];
315 GFREE(cdstr_phase);
316 }
317 GFREE(cdstr);
318 }
319 }
320 if (cds_start==0 && cds_end==0 && tidx>7) {
321 //check if columns 7,8 can be reasonably assumed to be CDS start-end coordinates
322 if (strToUInt(t[6], cds_start) && strToUInt(t[7], cds_end) && cds_end>cds_start) {
323 if (cds_start>=fstart-1 && cds_end<=fend)
324 cds_start++;
325 else { cds_start=0; cds_end=0; }
326 }
327 }
328 skip=false;
329 }
330
parseSegmentList(GVec<GSeg> & segs,char * str)331 bool GffLine::parseSegmentList(GVec<GSeg>& segs, char* str) {
332 bool segs_valid=false;
333 char* p=strchr(str, '-');
334 if (p!=NULL && p>str) {
335 GDynArray<char*> ss;
336 strsplit(str, ss, ',');
337 GSeg seg;
338 segs_valid=true;
339 for (uint i=0;i<ss.Count();++i) {
340 char* p=strchr(ss[i], '-');
341 if (p==NULL) {
342 segs_valid=false;
343 break;
344 }
345 *p='\0'; ++p;
346 int xstart=0, xend=0;
347 if (!strToInt(ss[i], xstart) || xstart<(int)fstart || xstart>(int)fend){
348 segs_valid=false;
349 break;
350 }
351 if (!strToInt(p, xend) || xend<(int)fstart || xend>(int)fend) {
352 segs_valid=false;
353 break;
354 }
355 if (xstart>xend) { seg.start=(uint)xend;seg.end=(uint)xstart; }
356 else { seg.start=(uint)xstart;seg.end=(uint)xend; }
357 segs.Add(seg);
358 } //parse all CDS segments
359 if (segs_valid) {
360 if (segs.Count()>1) segs.Sort();
361 } else
362 segs.Clear();
363 }
364 return segs_valid;
365 }
366
GffLine(GffReader * reader,const char * l)367 GffLine::GffLine(GffReader* reader, const char* l): _parents(NULL), _parents_len(0),
368 dupline(NULL), line(NULL), llen(0), gseqname(NULL), track(NULL),
369 ftype(NULL), ftype_id(-1), info(NULL), fstart(0), fend(0), //qstart(0), qend(0), qlen(0),
370 score(0), score_decimals(-1), strand(0), flags(0), exontype(exgffNone), phase(0), cds_start(0), cds_end(0),
371 exons(), cdss(), gene_name(NULL), gene_id(NULL), parents(NULL), num_parents(0), ID(NULL) {
372 llen=strlen(l);
373 GMALLOC(line,llen+1);
374 memcpy(line, l, llen+1);
375 GMALLOC(dupline, llen+1);
376 memcpy(dupline, l, llen+1);
377 skipLine=true; //clear only if we make it to the end of this function
378 char* t[9];
379 int i=0;
380 int tidx=1;
381 t[0]=line;
382 char fnamelc[128];
383 while (line[i]!=0) {
384 if (line[i]=='\t') {
385 line[i]=0;
386 t[tidx]=line+i+1;
387 tidx++;
388 if (tidx>8) break;
389 }
390 i++;
391 }
392 if (tidx<8) { // ignore non-GFF lines
393 return;
394 }
395 gffWarnings=reader->gff_warns;
396 gseqname=t[0];
397 track=t[1];
398 ftype=t[2];
399 info=t[8];
400 char* p=t[3];
401 if (!parseUInt(p,fstart)) {
402 //chromosome_band entries in Flybase
403 GMessage("Warning: invalid start coordinate at line:\n%s\n",l);
404 return;
405 }
406 p=t[4];
407 if (!parseUInt(p,fend)) {
408 GMessage("Warning: invalid end coordinate at line:\n%s\n",l);
409 return;
410 }
411 if (fend<fstart) {
412 GMessage("Error: invalid feature coordinates (end<start!) at line:\n%s\n",l);
413 Gswap(fend,fstart); //make sure fstart<=fend, always
414 //return;
415 }
416 p=t[5];
417 if (p[0]=='.' && p[1]==0) {
418 score=0;
419 score_decimals=-1;
420 }
421 else {
422 score_decimals=0;
423 //count decimals
424 char* pd=strchr(p,'.');
425 if (pd) {
426 ++pd;
427 char* pde=pd;
428 while ((*pde)!=0) ++pde;
429 score_decimals=pde-pd;
430 }
431 if (!parseFloat(p, score))
432 GError("Error parsing feature score from GFF line:\n%s\n",l);
433
434 }
435 strand=*t[6];
436 if (strand!='+' && strand!='-' && strand!='.')
437 GError("Error parsing strand (%c) from GFF line:\n%s\n",strand,l);
438 phase=*t[7]; // must be '.', '0', '1' or '2'
439 // exon/CDS/mrna filter
440 strncpy(fnamelc, ftype, 127);
441 fnamelc[127]=0;
442 strlower(fnamelc); //convert to lower case
443 if (endsWith(fnamelc, "match")) {
444 //TODO: do not discard generic cDNA_match/protein_match features, convert them internally
445 // (when a hit chain has multiple _match features with the same ID, e.g. multiple HSPs with the same subject)
446 // and set GffObj::flag_DISCONTINUOUS
447 return;
448 }
449 bool is_t_data=false;
450 bool someRNA=false;
451 if (strstr(fnamelc, "utr")!=NULL) {
452 exontype=exgffUTR;
453 is_exon=true;
454 is_t_data=true;
455 }
456 else if (endsWith(fnamelc, "exon")) {
457 exontype=exgffExon;
458 is_exon=true;
459 is_t_data=true;
460 }
461 else if (strstr(fnamelc, "stop")!=NULL &&
462 (strstr(fnamelc, "codon")!=NULL || strstr(fnamelc, "cds")!=NULL) &&
463 strstr(fnamelc, "redefined")==NULL && strstr(fnamelc, "selenocysteine")==NULL) {
464 exontype=exgffStopCodon;
465 is_exon=true;
466 is_cds=true; //though some place it outside the last CDS segment
467 is_t_data=true;
468 }
469 else if (strstr(fnamelc, "start") &&
470 ((strstr(fnamelc, "codon")!=NULL) || strstr(fnamelc, "cds")!=NULL)){
471 exontype=exgffStartCodon;
472 is_exon=true;
473 is_cds=true;
474 is_t_data=true;
475 }
476 else if (strcmp(fnamelc, "cds")==0) {
477 exontype=exgffCDS;
478 is_exon=true;
479 is_cds=true;
480 is_t_data=true;
481 }
482 else if (startsWith(fnamelc, "intron") || endsWith(fnamelc, "intron")) {
483 exontype=exgffIntron;
484 }
485 else if ((someRNA=endsWith(fnamelc,"rna")) || endsWith(fnamelc,"transcript")) { // || startsWith(fnamelc+1, "rna")) {
486 is_transcript=true;
487 is_t_data=true;
488 if (someRNA) ftype_id=GffObj::names->feats.addName(ftype);
489 }
490 else if (endsWith(fnamelc, "_gene_segment")) {
491 is_transcript=true;
492 is_t_data=true;
493 is_gene_segment=true;
494 }
495 else if (endsWith(fnamelc, "gene") || startsWith(fnamelc, "gene")) {
496 is_gene=true;
497 is_t_data=true; //because its name will be attached to parented transcripts
498 }
499 char* Parent=NULL;
500 /*
501 Rejecting non-transcript lines early if only transcripts are requested ?!
502 It would be faster to do this here but there are GFF cases when we reject an
503 unusual parent feature here (e.g. protein with CDS children) and then
504 their exon/CDS children show up and get assigned to an implicit parent mRNA
505 The solution is to still load this parent as GffObj for now and BAN it later
506 so its children get dismissed/discarded as well.
507 */
508 if (reader->ignoreLocus) {
509 if (strcmp(ftype, "locus")==0) return;
510 if (is_transcript || is_gene) {
511 char* locus=NULL;
512 if (reader->is_gff3 || reader->gff_type==0)
513 locus=extractAttr("locus=");
514 else locus=extractAttr("locus");
515 if (locus!=NULL) { GFREE(locus); }
516 }
517 }
518 char *gtf_tid=NULL;
519 char *gtf_gid=NULL;
520 if (reader->is_gff3 || reader->gff_type==0) {
521 ID=extractAttr("ID=",true);
522 Parent=extractAttr("Parent=",true);
523 if (reader->gff_type==0) {
524 if (ID!=NULL || Parent!=NULL) reader->is_gff3=true;
525 else { //check if it looks like a GTF
526 gtf_tid=extractAttr("transcript_id", true, true);
527 if (gtf_tid==NULL) {
528 gtf_gid=extractAttr("gene_id", true, true);
529 if (gtf_gid==NULL) return; //cannot determine file type yet
530 }
531 reader->is_gtf=true;
532 }
533 }
534 }
535
536 if (reader->is_gff3) {
537 //parse as GFF3
538 //if (ID==NULL && Parent==NULL) return; //silently ignore unidentified/unlinked features
539 if (ID!=NULL) {
540 //has ID attr so it's likely to be a parent feature
541
542 //look for explicit gene name
543 gene_name=getAttrValue("gene_name=");
544 if (gene_name==NULL) {
545 gene_name=getAttrValue("geneName=");
546 if (gene_name==NULL) {
547 gene_name=getAttrValue("gene_sym=");
548 if (gene_name==NULL) {
549 gene_name=getAttrValue("gene=");
550 }
551 }
552 }
553 gene_id=getAttrValue("geneID=");
554 if (gene_id==NULL) {
555 gene_id=getAttrValue("gene_id=");
556 }
557 /*
558 if (is_gene) { //--WARNING: this might be mislabeled (e.g. TAIR: "mRNA_TE_gene")
559 //---special case: keep the Name and ID attributes of the gene feature
560 //if (gene_name==NULL)
561 // gene_name=extractAttr("Name=");
562 if (gene_id==NULL) //the ID is also gene_id in this case
563 gene_id=Gstrdup(ID);
564 //skip=false;
565 //return;
566 //-- we don't care about gene parents.. unless it's a mislabeled "gene" feature
567 } //gene feature (probably)
568 */
569 //--parse exons for TLF
570 char* segstr=extractAttr("exons=");
571 bool exons_valid=false;
572 if (segstr) {
573 exons_valid=parseSegmentList(exons, segstr);
574 char* exoncountstr=extractAttr("exonCount=");
575 if (exoncountstr) {
576 int exoncount=0;
577 if (!strToInt(exoncountstr, exoncount) || exoncount!=(int)exons.Count())
578 GMessage("Warning: exonCount attribute value doesn't match the exons attribute!\n");
579 GFREE(exoncountstr);
580 }
581 GFREE(segstr);
582 }
583 if (exons_valid) {
584 bool validCDS=false;
585 segstr=extractAttr("CDS=");
586 if (segstr) {
587 char* p=strchr(segstr, ':');
588 if (p!=NULL) { // CDS=start:end format
589 *p='\0'; ++p;
590 validCDS=true;
591 if (validCDS && strToUInt(segstr, cds_start) && cds_start>=fstart) {
592 if (!strToUInt(p, cds_end) || cds_end>fend) {
593 validCDS=false;
594 }
595 }
596 if (!validCDS || (int)cds_start<=0 || (int)cds_end<=0) {
597 GMessage("Warning: invalid CDS (%d-%d) discarded for line:\n%s\n",
598 cds_start, cds_end, dupline);
599 cds_start=0;
600 cds_end=0;
601 }
602 } //CDS=start:end format
603 else { //CDS = list of start-end segments, just like the exons
604 validCDS=parseSegmentList(cdss, segstr);
605 if (validCDS && cdss.Count()>0) {
606 if (cds_start==0) cds_start=cdss.First().start;
607 if (cds_end==0) cds_end=cdss.Last().end;
608 }
609 }
610 GFREE(segstr);
611 }
612 if (validCDS) {
613 char* cds_phase=NULL;
614 if ((cds_phase=extractAttr("CDSphase="))!=NULL) {
615 phase=cds_phase[0];
616 GFREE(cds_phase);
617 }
618 } //CDS found
619 }//has valid exons
620 }// has GFF3 ID
621 if (Parent!=NULL) {
622 //keep Parent attr
623 //parse multiple parents
624 num_parents=1;
625 p=Parent;
626 int last_delim_pos=-1;
627 while (*p!=';' && *p!=0) {
628 if (*p==',' && *(p+1)!=0 && *(p+1)!=';') {
629 num_parents++;
630 last_delim_pos=(p-Parent);
631 }
632 p++;
633 }
634 _parents_len=p-Parent+1;
635 _parents=Parent;
636 GMALLOC(parents, num_parents*sizeof(char*));
637 parents[0]=_parents;
638 int i=1;
639 if (last_delim_pos>0) {
640 for (p=_parents+1;p<=_parents+last_delim_pos;p++) {
641 if (*p==',') {
642 char* ep=p-1;
643 while (*ep==' ' && ep>_parents) ep--;
644 *(ep+1)=0; //end the string there
645 parents[i]=p+1;
646 i++;
647 }
648 }
649 }
650 } //has Parent field
651 //special case for gene_id: for genes, this is the ID
652 if (is_gene && gene_id==NULL && ID!=NULL) {
653 gene_id=Gstrdup(ID);
654 }
655 //parse other potentially useful GFF3 attributes
656 /*
657 if ((p=strstr(info,"Target="))!=NULL) { //has Target attr
658 p+=7;
659 while (*p!=';' && *p!=0 && *p!=' ') p++;
660 if (*p!=' ') {
661 GError("Error parsing target coordinates from GFF line:\n%s\n",l);
662 }
663 if (!parseUInt(p,qstart))
664 GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
665 if (*p!=' ') {
666 GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
667 }
668 p++;
669 if (!parseUInt(p,qend))
670 GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
671 }
672 if ((p=strifind(info,"Qreg="))!=NULL) { //has Qreg attr
673 p+=5;
674 if (!parseUInt(p,qstart))
675 GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
676 if (*p!='-') {
677 GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
678 }
679 p++;
680 if (!parseUInt(p,qend))
681 GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
682 if (*p=='|' || *p==':') {
683 p++;
684 if (!parseUInt(p,qlen))
685 GError("Error parsing target length from GFF Qreg|: \n%s\n",l);
686 }
687 }//has Qreg attr
688 if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) {
689 p+=5;
690 if (!parseUInt(p,qlen))
691 GError("Error parsing target length from GFF Qlen:\n%s\n",l);
692 }
693 */
694 } //GFF3
695 else { // ----------------- GTF syntax ------------------
696 if (reader->transcripts_Only && !is_t_data) {
697 return; //alwasys skip unrecognized non-transcript features in GTF
698 }
699 if (is_gene) {
700 reader->gtf_gene=true;
701 ID = (gtf_tid!=NULL) ? gtf_tid : extractAttr("transcript_id", true, true); //Ensemble GTF might lack this
702 gene_id = (gtf_gid!=NULL) ? gtf_gid : extractAttr("gene_id", true, true);
703 if (ID==NULL) {
704 //no transcript_id -- this should not be valid GTF2 format, but Ensembl (Gencode?)
705 //has being known to add "gene" features with only gene_id in their GTF
706 if (gene_id!=NULL) { //likely a gene feature line (Ensembl!)
707 ID=Gstrdup(gene_id); //take over as ID (for defective GTF lacking transcript_id)
708 }
709 }
710 // else if (strcmp(gene_id, ID)==0) //GENCODE v20 gene feature ?
711 }
712 else if (is_transcript) {
713 ID = (gtf_tid!=NULL) ? gtf_tid : extractAttr("transcript_id", true, true);
714 //gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID
715 if (ID==NULL) {
716 //something is wrong here, cannot parse the GTF ID
717 GMessage("Warning: invalid GTF record, transcript_id not found:\n%s\n", l);
718 return;
719 }
720 gene_id = (gtf_gid!=NULL) ? gtf_gid : extractAttr("gene_id", true, true);
721 if (gene_id!=NULL)
722 Parent=Gstrdup(gene_id);
723 reader->gtf_transcript=true;
724 is_gtf_transcript=1;
725 } else { //must be an exon type
726 Parent = (gtf_tid!=NULL) ? gtf_tid : extractAttr("transcript_id", true, true);
727 gene_id = (gtf_gid!=NULL) ? gtf_gid : extractAttr("gene_id", true, true); // for GTF this is the only attribute accepted as geneID
728 //old pre-GTF2 formats like Jigsaw's (legacy support)
729 if (Parent==NULL && exontype==exgffExon) {
730 if (startsWith(track,"jigsaw")) {
731 is_cds=true;
732 strcpy(track,"jigsaw");
733 p=strchr(info,';');
734 if (p==NULL) { Parent=Gstrdup(info); info=NULL; }
735 else { Parent=Gstrdup(info,p-1);
736 info=p+1;
737 }
738 }
739 }
740 if (Parent==NULL) {
741 //something is wrong here couldn't parse the transcript ID for this feature
742 GMessage("Warning: invalid GTF record, transcript_id not found:\n%s\n", l);
743 return;
744 }
745 }
746 //more GTF attribute parsing
747 if (is_gene && gene_id==NULL && ID!=NULL)
748 gene_id=Gstrdup(ID);
749 gene_name=getAttrValue("gene_name");
750 if (gene_name==NULL) {
751 gene_name=getAttrValue("gene_sym");
752 if (gene_name==NULL) {
753 gene_name=getAttrValue("gene");
754 if (gene_name==NULL)
755 gene_name=getAttrValue("genesymbol");
756 }
757 }
758 //*** IMPORTANT: prepare GTF for easy parseAttr by adding '=' character after the attribute name
759 // for ALL attributes
760 p=info;
761 bool noed=true; //not edited after the last delim
762 bool nsp=false; //non-space found after last delim
763 while (*p!=0) {
764 if (*p==' ') {
765 if (nsp && noed) {
766 *p='=';
767 noed=false;
768 p++;
769 continue;
770 }
771 }
772 else nsp=true; //non-space
773 if (*p==';') { noed=true; nsp=false; }
774 p++;
775 }
776 //-- GTF prepare parents[] if Parent found
777 if (Parent!=NULL) { //GTF transcript_id found as a parent
778 _parents=Parent;
779 num_parents=1;
780 _parents_len=strlen(Parent)+1;
781 GMALLOC(parents, sizeof(char*));
782 parents[0]=_parents;
783 }
784 } //GTF
785
786
787 if (ID==NULL && parents==NULL) {
788 if (gffWarnings)
789 GMessage("Warning: discarding unrecognized feature (no ID or Parent):\n%s\n",dupline);
790 return; //skip
791 }
792 skipLine=false;
793 }
794
795 //FIXME - this should only be used AFTER finalize() was called, and must have cdss=NULL of course
setCDS(uint cd_start,uint cd_end,char phase)796 void GffObj::setCDS(uint cd_start, uint cd_end, char phase) {
797 if (cd_start<this->start) {
798 GMessage("Warning: setCDS() called for %s with an out of bounds CDS start %d!\n",
799 gffID, cd_start);
800 return;
801 }
802 if (cd_end>this->end) {
803 GMessage("Warning: setCDS() called for %s with an out of bounds CDS end %d!\n",
804 gffID, cd_end);
805 return;
806 }
807 this->CDstart=cd_start;
808 this->CDend=cd_end;
809 this->CDphase=phase;
810 isTranscript(true);
811 subftype_id=gff_fid_exon;
812 if (monoFeature()) {
813 if (exons.Count()==0) addExon(this->start, this->end, exgffExon);
814 else exons[0]->exontype=exgffExon;
815 }
816 }
817
setCDS(GffObj * t)818 void GffObj::setCDS(GffObj* t) {
819 //copy the cdss as well
820 uint cd_start=t->CDstart;
821 uint cd_end=t->CDend;
822 uint phase=t->CDphase;
823 if (cd_start<this->start) {
824 GMessage("Warning: setCDS() called for %s with an out of bounds CDS start %d!\n",
825 gffID, cd_start);
826 return;
827 }
828 if (cd_end>this->end) {
829 GMessage("Warning: setCDS() called for %s with an out of bounds CDS end %d!\n",
830 gffID, cd_end);
831 return;
832 }
833 this->CDstart=cd_start;
834 this->CDend=cd_end;
835 this->CDphase=phase;
836 isTranscript(true);
837 subftype_id=gff_fid_exon;
838 if (monoFeature()) {
839 if (exons.Count()==0) addExon(this->start, this->end, exgffExon);
840 else exons[0]->exontype=exgffExon;
841 }
842 if (t->cdss!=NULL) {
843 if (this->cdss!=NULL) delete cdss;
844 cdss=new GList<GffExon>(true, true, false);
845 for (int i=0;i<t->cdss->Count();i++) {
846 cdss->Add(new GffExon(*(t->cdss->Get(i))));
847 }
848 }
849 }
850
readExon(GffReader & reader,GffLine & gl)851 int GffObj::readExon(GffReader& reader, GffLine& gl) {
852 // -- this should only be called before ::finalize()!
853 //should make sure to get the right subftype_id!
854 if (!isTranscript() && gl.exontype>exgffNone) {
855 //subfeature recognized as exon-like, so this should be considered a transcript!
856 isTranscript(true);
857 }
858 if (isTranscript()) {
859 if (subftype_id<0) {//exon_ftype_id=gff_fid_exon;
860 if (gl.exontype>exgffNone) subftype_id=gff_fid_exon;
861 else subftype_id=names->feats.addName(gl.ftype);
862 }
863 //any recognized exon-like segment gets the generic "exon" type (also applies to CDS)
864 if (gl.exontype==exgffNone && !gl.is_transcript) {
865 //extraneous mRNA feature, discard
866 if (reader.gff_warns)
867 GMessage("Warning: discarding unrecognized transcript subfeature '%s' of %s\n",
868 gl.ftype, gffID);
869 return -1;
870 }
871 }
872 else { //non-mRNA parent feature, check this subf type
873 int subf_id=names->feats.addName(gl.ftype);
874 if (subftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. first exon being added)
875 subftype_id=subf_id;
876 else {
877 if (subftype_id!=subf_id) {
878 if (subftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) {
879 //the existing exon was just a dummy one created by default, discard it?
880 exons.Clear();
881 covlen=0;
882 subftype_id=subf_id; //allow the new subfeature to completely takeover
883 }
884 else { //multiple subfeatures, prefer those exon-like
885 if (reader.gff_warns)
886 GMessage("Warning: multiple subfeatures (%s and %s) found for %s, discarding ",
887 names->feats.getName(subf_id), names->feats.getName(subftype_id),gffID);
888 if (gl.exontype>exgffNone) { //new feature is an exon, discard previously parsed subfeatures
889 if (reader.gff_warns) GMessage("%s.\n", names->feats.getName(subftype_id));
890 subftype_id=subf_id;
891 exons.Clear();
892 covlen=0;
893 }
894 else { //discard new feature
895 if (reader.gff_warns) GMessage("Warning: skipping subfeature %s.\n", names->feats.getName(subf_id));
896 return -1; //skip this 2nd subfeature type for this parent!
897 }
898 }
899 } //incoming subfeature is of different type
900 } //new subfeature type
901 } //non-mRNA parent
902 int eidx=-1;
903 GList<GffExon>* segs=NULL; //either cds or &exons
904 if (gl.is_cds) {
905 if (cdss==NULL)
906 cdss=new GList<GffExon>(true, true, false);
907 segs=cdss;
908 } else {
909 segs=&exons;
910 }
911 eidx=addExon(*segs, gl);
912 if (eidx<0) {
913 GMessage("Warning: addExon() failed for GFF line:\n%s\n",gl.dupline);
914 return eidx; //this should never happen!
915 }
916 if (reader.keep_Attrs) {
917 if (reader.noExonAttrs) {
918 parseAttrs(attrs, gl.info, true);
919 }
920 else { //need all exon-level attributes
921 parseAttrs((*segs)[eidx]->attrs, gl.info, true, gl.is_cds);
922 }
923 }
924 return eidx;
925 }
926
addExon(GList<GffExon> & segs,GffLine & gl,int8_t exontype_override)927 int GffObj::addExon(GList<GffExon>& segs, GffLine& gl, int8_t exontype_override) {
928 int ex_type=(exontype_override!=exgffNone) ? exontype_override : gl.exontype;
929 GffScore exon_score(gl.score, gl.score_decimals);
930 int eidx=addExon(gl.fstart, gl.fend, ex_type, gl.phase, exon_score, &segs);
931 if (&segs==cdss && isGene() && gl.ID!=NULL && eidx>=0) {
932 //special NCBI cases where CDS can be treated as discontiguous features, grouped by their ID
933 //-- used for genes with X_gene_segment features
934 //char* cds_id=Gstrdup(gl.ID);
935 //segs[eidx]->uptr=cds_id;
936 segs[eidx]->uptr=gl.ID;
937 gl.ID=NULL;
938 }
939 return eidx;
940 }
941
exonOverlapIdx(GList<GffExon> & segs,uint s,uint e,int * ovlen,int start_idx)942 int GffObj::exonOverlapIdx(GList<GffExon>& segs, uint s, uint e, int* ovlen, int start_idx) {
943 //return the exons' index for the overlapping OR ADJACENT exon
944 //ovlen, if given, will return the overlap length
945 //if (s>e) Gswap(s,e);
946 for (int i=start_idx;i<segs.Count();i++) {
947 if (segs[i]->start>e+1) break;
948 if (s-1>segs[i]->end) continue;
949 //-- overlap/adjacent if we are here:
950 if (ovlen!=NULL) {
951 int ovlend= (segs[i]->end>e) ? e : segs[i]->end;
952 *ovlen= ovlend - ((s>segs[i]->start)? s : segs[i]->start)+1;
953 }
954 return i;
955 } //for each exon
956 *ovlen=0;
957 return -1;
958 }
959
transferCDS(GffExon * cds)960 void GffObj::transferCDS(GffExon* cds) {
961 //direct adding of a cds to the cdss pointer, without checking
962 if (cdss==NULL) cdss=new GList<GffExon>(true, true, false);
963 cdss->Add(cds); //now the caller must forget this exon!
964 if (CDstart==0 || CDstart>cds->start) CDstart=cds->start;
965 }
966
addExon(uint segstart,uint segend,int8_t exontype,char phase,GffScore exon_score,GList<GffExon> * segs)967 int GffObj::addExon(uint segstart, uint segend, int8_t exontype, char phase, GffScore exon_score, GList<GffExon>* segs) {
968 if (segstart>segend) { Gswap(segstart, segend); }
969 if (segs==NULL) segs=&exons;
970 if (exontype!=exgffNone) { //check for overlaps between exon/CDS-type segments
971 //addExonSegment(gl.fstart, gl.fend, gl.score, gl.phase, gl.is_cds, exontype_override);
972 int ovlen=0;
973 int oi=-1;
974 while ((oi=exonOverlapIdx(*segs, segstart, segend, &ovlen, oi+1))>=0) {
975 //note: ovlen==0 for adjacent segments
976 if ((*segs)[oi]->exontype>exgffNone &&
977 (*segs)[oi]->start<=segstart && (*segs)[oi]->end>=segend) {
978 //existing feature contains this segment, so we do NOT need to add it
979 //-- unless its the annoying NCBI exception: gene with multiple alternate
980 // _gene_segment CDS features!
981 if (!(this->isGene() && exontype==exgffCDS &&
982 (*segs)[oi]->exontype==exgffCDS ))
983 return oi;
984 }
985 if (ovlen==0 || !(exontype==exgffCDS && (*segs)[oi]->exontype==exgffCDS)) {
986 //always merge adjacent features
987 //but NEVER merge two overlapping CDS (CDS programmed ribosomal shift aware)
988 int8_t segtype=((*segs)[oi]->exontype==exgffCDS || exontype==exgffCDS) ? exgffCDS : exgffExon;
989 //if expanded upward, may overlap the segment(s) above
990 expandSegment(*segs, oi, segstart, segend, segtype);
991 return oi;
992 }
993 }
994 } //exon overlap/adjacent check
995 //new exon/CDS, not merged in a previous one
996 GffExon* enew=new GffExon(segstart, segend, exontype, phase, exon_score.score, exon_score.precision);
997 int eidx=segs->Add(enew);
998 if (eidx<0) {
999 //this would actually be possible if the object is a "Gene" and "exons" are in fact isoforms
1000 delete enew;
1001 hasErrors(true);
1002 return -1;
1003 }
1004 if (start>segs->First()->start) start=segs->First()->start;
1005 if (end<segs->Last()->end) end=segs->Last()->end;
1006 if (isFinalized() && segs==&exons) {
1007 covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1;
1008 }
1009 return eidx;
1010 }
1011
expandSegment(GList<GffExon> & segs,int oi,uint segstart,uint segend,int8_t exontype)1012 void GffObj::expandSegment(GList<GffExon>& segs, int oi, uint segstart, uint segend, int8_t exontype) {
1013 //oi is the index of the *first* overlapping segment found that must be enlarged
1014 covlen-=segs[oi]->len();
1015 if (segstart<segs[oi]->start)
1016 segs[oi]->start=segstart;
1017 //if (qs && qs<exons[oi]->qstart) exons[oi]->qstart=qs;
1018 if (segend>segs[oi]->end)
1019 segs[oi]->end=segend;
1020 //if (qe && qe>exons[oi]->qend) exons[oi]->qend=qe;
1021 //warning: score cannot be properly adjusted! e.g. if it's a p-value it's just going to get worse
1022 //if (sc!=0) segs[oi]->score=sc;
1023 //covlen+=exons[oi]->len();
1024 //if (exons[oi]->exontype< exontype) -- always true
1025 segs[oi]->exontype = exontype;
1026 //if (exontype==exgffCDS) exons[oi]->phase=fr;
1027 //we must check if any more exons are also overlapping this
1028 int ni=oi+1; //next exon index after oi
1029 while (ni<segs.Count() && segs[ni]->start<=segend+1) { // next segment overlaps OR adjacent to newly enlarged segment
1030 if (segs[ni]->exontype>0 &&
1031 (segs[ni]->start==segend+1 || segs[ni]->exontype!=exgffCDS || exontype!=exgffCDS)) {
1032 if (segs[ni]->start<segs[oi]->start) {
1033 segs[oi]->start=segs[ni]->start;
1034 if (strand=='+') segs[oi]->phase=segs[ni]->phase;
1035 }
1036 if (segs[ni]->end>segs[oi]->end) {
1037 segs[oi]->end=segs[ni]->end;
1038 if (strand=='-') segs[oi]->phase=segs[ni]->phase;
1039 }
1040
1041 segs.Delete(ni);
1042 } else ++ni;
1043 } //until no more overlapping/adjacent segments found
1044 // -- make sure any other related boundaries are updated:
1045 if (isFinalized()) {
1046 if (&segs==&exons) {
1047 start=exons.First()->start;
1048 end=exons.Last()->end;
1049 //recalculate covlen
1050 covlen=0;
1051 for (int i=0;i<exons.Count();++i) covlen+=exons[i]->len();
1052 }
1053 }
1054 else {
1055 if (start>segs.First()->start) start=segs.First()->start;
1056 if (end<segs.Last()->end) end=segs.Last()->end;
1057 }
1058 }
1059
removeExon(int idx)1060 void GffObj::removeExon(int idx) {
1061 if (idx<0 || idx>=exons.Count()) return;
1062 int segstart=exons[idx]->start;
1063 int segend=exons[idx]->end;
1064 exons.Delete(idx);
1065 if (isFinalized()) {
1066 covlen -= (int)(segend-segstart)+1;
1067 start=exons.First()->start;
1068 end=exons.Last()->end;
1069 if (isCDSOnly()) { CDstart=start; CDend=end; }
1070 }
1071 }
1072
removeExon(GffExon * p)1073 void GffObj::removeExon(GffExon* p) {
1074 for (int idx=0;idx<exons.Count();idx++) {
1075 if (exons[idx]==p) {
1076 int segstart=exons[idx]->start;
1077 int segend=exons[idx]->end;
1078 exons.Delete(idx);
1079 covlen -= (int)(segend-segstart)+1;
1080
1081 if (exons.Count() > 0) {
1082 start=exons.First()->start;
1083 end=exons.Last()->end;
1084 if (isCDSOnly()) { CDstart=start; CDend=end; }
1085 }
1086 return;
1087 }
1088 }
1089 }
1090
GffObj(GffReader & gfrd,BEDLine & bedline)1091 GffObj::GffObj(GffReader& gfrd, BEDLine& bedline):GSeg(0,0),
1092 exons(true,true,false), cdss(NULL), gscore() {
1093 uptr=NULL;
1094 ulink=NULL;
1095 parent=NULL;
1096 udata=0;
1097 flags=0;
1098 CDstart=0;
1099 CDend=0;
1100 CDphase=0;
1101 attrs=NULL;
1102 gffID=NULL;
1103 track_id=-1;
1104 gseq_id=-1;
1105 //ftype_id=-1;
1106 //subftype_id=-1;
1107 strand='.';
1108 gffnames_ref(names);
1109 //qlen=0;qstart=0;qend=0;
1110 covlen=0;
1111 geneID=NULL;
1112 gene_name=NULL;
1113 ftype_id=gff_fid_transcript;
1114 subftype_id=gff_fid_exon;
1115 start=bedline.fstart;
1116 end=bedline.fend;
1117 gseq_id=names->gseqs.addName(bedline.gseqname);
1118 track_id=names->tracks.addName("BED");
1119 strand=bedline.strand;
1120 //setup flags from gffline
1121 isGene(false);
1122 isTranscript(true);
1123 gffID=Gstrdup(bedline.ID);
1124 for (int i=0;i<bedline.exons.Count();++i) {
1125 int eidx=this->addExon(bedline.exons[i].start, bedline.exons[i].end, exgffExon);
1126 if (eidx<0 && gfrd.showWarnings())
1127 GMessage("Warning: failed adding segment %d-%d for %s (discarded)!\n",
1128 bedline.exons[i].start, bedline.exons[i].end, gffID);
1129
1130 }
1131 if (bedline.cds_start>0) {
1132 CDstart=bedline.cds_start;
1133 CDend=bedline.cds_end;
1134 if (CDstart>0 && bedline.cds_phase)
1135 CDphase=bedline.cds_phase;
1136 }
1137 if (gfrd.keep_Attrs && bedline.info!=NULL) this->parseAttrs(attrs, bedline.info);
1138 }
1139
GffObj(GffReader & gfrd,GffLine & gffline)1140 GffObj::GffObj(GffReader &gfrd, GffLine& gffline):
1141 GSeg(0,0), exons(true,true,false), cdss(NULL), children(1,false), gscore() {
1142 uptr=NULL;
1143 ulink=NULL;
1144 parent=NULL;
1145 udata=0;
1146 flags=0;
1147 CDstart=0;
1148 CDend=0;
1149 CDphase=0;
1150 geneID=NULL;
1151 gene_name=NULL;
1152 attrs=NULL;
1153 gffID=NULL;
1154 track_id=-1;
1155 gseq_id=-1;
1156 //ftype_id=-1;
1157 subftype_id=-1;
1158 strand='.';
1159 gffnames_ref(names);
1160 //qlen=0;qstart=0;qend=0;
1161 covlen=0;
1162 ftype_id=gffline.ftype_id;
1163 start=gffline.fstart;
1164 end=gffline.fend;
1165 gseq_id=names->gseqs.addName(gffline.gseqname);
1166 track_id=names->tracks.addName(gffline.track);
1167 strand=gffline.strand;
1168 /*
1169 qcov=0;
1170 qlen=gffline.qlen;
1171 qstart=gffline.qstart;
1172 qend=gffline.qend;
1173 */
1174 //setup flags from gffline
1175 isCDSOnly(gffline.is_cds); //for now
1176 isGene(gffline.is_gene);
1177 isTranscript(gffline.is_transcript || gffline.exontype!=exgffNone);
1178 //fromGff3(gffline.is_gff3);
1179 isGeneSegment(gffline.is_gene_segment);
1180 if (gffline.parents!=NULL && !gffline.is_transcript) {
1181 //GTF style -- create a GffObj directly by subfeature
1182 //(also possible orphan GFF3 exon line, or an exon given before its parent (chado))
1183 if (gffline.exontype!=exgffNone) { //recognized exon-like feature
1184 ftype_id=gff_fid_transcript; //so this is some sort of transcript
1185 subftype_id=gff_fid_exon; //subfeatures MUST be exons
1186 //typical GTF2 without "transcript" line
1187 // or malformed GFF3 with orphan or premature exon feature (found before the transcript line)
1188 gffID=Gstrdup(gffline.parents[0]);
1189 this->createdByExon(true);
1190 if (gfrd.is_gff3 && gfrd.showWarnings())
1191 GMessage("Warning: exon feature found before transcript ID %s\n",gffID);
1192 //this is the first exon/segment of the transcript
1193 readExon(gfrd, gffline);
1194 }
1195 else {//unrecognized (non-exon) subfeature
1196 //make this GffObj of the same feature type
1197 ftype_id=names->feats.addName(gffline.ftype);
1198 if (gffline.ID!=NULL) { //unrecognized non-exon feature ? use the ID instead
1199 this->hasGffID(true);
1200 gffID=Gstrdup(gffline.ID);
1201 if (gfrd.keep_Attrs) this->parseAttrs(attrs, gffline.info);
1202 }
1203 else { //no ID, just Parent
1204 GMessage("Warning: unrecognized parented feature without ID found before its parent:\n%s\n", gffline.dupline);
1205 gffID=Gstrdup(gffline.parents[0]);
1206 this->createdByExon(true);
1207 readExon(gfrd, gffline);
1208 }
1209 } //unrecognized (non-exon) feature
1210 } //non-transcript parented subfeature given directly
1211 else {
1212 //non-parented feature OR a recognizable transcript
1213 //create a parent feature in its own right
1214 gscore.score=gffline.score;
1215 gscore.precision=gffline.score_decimals;
1216 if (gffline.ID==NULL || gffline.ID[0]==0)
1217 GError("Error: no valid ID found for GFF record\n");
1218 this->hasGffID(true);
1219 gffID=Gstrdup(gffline.ID); //there must be an ID here
1220 //if (gffline.is_transcript) ftype_id=gff_fid_mRNA;
1221 //else
1222 if (gffline.is_transcript) {
1223 subftype_id=gff_fid_exon;
1224 if (ftype_id<0)
1225 ftype_id=names->feats.addName(gffline.ftype);
1226 if (gfrd.is_gff3) {
1227 if (gffline.exons.Count()>0) {
1228 //for compact GFF-like transcript line format (TLF), exons were already found as attributes
1229 for (int i=0;i<gffline.exons.Count();++i) {
1230 int eidx=this->addExon(gffline.exons[i].start, gffline.exons[i].end, exgffExon, '.', gscore);
1231 if (eidx<0 && gfrd.showWarnings())
1232 GMessage("Warning: failed adding exon %d-%d for %s (discarded)!\n",
1233 gffline.exons[i].start, gffline.exons[i].end, gffID);
1234
1235 }
1236 }
1237 if (gffline.cds_start>0) {
1238 CDstart=gffline.cds_start;
1239 CDend=gffline.cds_end;
1240 }
1241 if (gffline.phase!=0) CDphase=gffline.phase;
1242 if (gffline.cdss.Count()>0) {
1243 //for compact GFF-like transcript line format (TLF), CDS might be already found as attributes
1244 if (cdss==NULL) cdss=new GList<GffExon>(true, true, false);
1245 for (int i=0;i<gffline.cdss.Count();++i) {
1246 int eidx=this->addExon(gffline.cdss[i].start, gffline.cdss[i].end, exgffCDS, 0, GFFSCORE_NONE, cdss);
1247 if (eidx<0 && gfrd.showWarnings())
1248 GMessage("Warning: failed adding CDS segment %d-%d for %s (discarded)!\n",
1249 gffline.cdss[i].start, gffline.cdss[i].end, gffID);
1250
1251 }
1252 }
1253 }
1254 } //is_transcript
1255 if (gfrd.keep_Attrs) this->parseAttrs(attrs, gffline.info);
1256 if (gfrd.is_gff3 && gffline.parents==NULL && gffline.exontype!=exgffNone) {
1257 //special case with bacterial genes just given as a CDS/exon, without parent!
1258 this->createdByExon(true);
1259 if (ftype_id<0) ftype_id=gff_fid_mRNA;
1260 readExon(gfrd, gffline);
1261 }
1262 if (ftype_id<0)
1263 ftype_id=names->feats.addName(gffline.ftype);
1264 }//no parent OR recognizable transcript
1265
1266 if (gffline.gene_name!=NULL) {
1267 gene_name=Gstrdup(gffline.gene_name);
1268 }
1269 if (gffline.gene_id) { //only for gene features or GTF2 gene_id attribute
1270 geneID=Gstrdup(gffline.gene_id);
1271 }
1272 /*//we cannot assume parents[0] is a gene! for NCBI miRNA, parent can be a primary_transcript feature!
1273 else if (gffline.is_transcript && gffline.parents!=NULL) {
1274 geneID=Gstrdup(gffline.parents[0]);
1275 }
1276 */
1277 }
1278
nextBEDLine()1279 BEDLine* GffReader::nextBEDLine() {
1280 if (bedline!=NULL) return bedline; //caller should free gffline after processing
1281 while (bedline==NULL) {
1282 int llen=0;
1283 buflen=GFF_LINELEN-1;
1284 char* l=fgetline(linebuf, buflen, fh, &fpos, &llen);
1285 if (l==NULL) return NULL;
1286 int ns=0; //first nonspace position
1287 while (l[ns]!=0 && isspace(l[ns])) ns++;
1288 if (l[ns]=='#' || llen<7) continue;
1289 bedline=new BEDLine(this, l);
1290 if (bedline->skip) {
1291 delete bedline;
1292 bedline=NULL;
1293 continue;
1294 }
1295 }
1296 return bedline;
1297 }
1298
1299
nextGffLine()1300 GffLine* GffReader::nextGffLine() {
1301 if (gffline!=NULL) return gffline; //caller should free gffline after processing
1302 while (gffline==NULL) {
1303 int llen=0;
1304 buflen=GFF_LINELEN-1;
1305 char* l=fgetline(linebuf, buflen, fh, &fpos, &llen);
1306 if (l==NULL) {
1307 return NULL; //end of file
1308 }
1309 #ifdef CUFFLINKS
1310 _crc_result.process_bytes( linebuf, llen );
1311 #endif
1312 int ns=0; //first nonspace position
1313 bool commentLine=false;
1314 while (l[ns]!=0 && isspace(l[ns])) ns++;
1315 if (l[ns]=='#') {
1316 commentLine=true;
1317 if (llen<10) {
1318 if (commentParser!=NULL) (*commentParser)(l, &gflst);
1319 continue;
1320 }
1321 }
1322 gffline=new GffLine(this, l);
1323 if (gffline->skipLine) {
1324 if (commentLine && commentParser!=NULL) (*commentParser)(gffline->dupline, &gflst);
1325 delete gffline;
1326 gffline=NULL;
1327 continue;
1328 }
1329 if (gffline->ID==NULL && gffline->parents==NULL) { //it must have an ID
1330 //this might not be needed, already checked in the GffLine constructor
1331 if (gff_warns)
1332 GMessage("Warning: malformed GFF line, no parent or record Id (kipping\n");
1333 delete gffline;
1334 gffline=NULL;
1335 //continue;
1336 }
1337 }
1338 return gffline;
1339 }
1340
1341
gfoBuildId(const char * id,const char * ctg)1342 char* GffReader::gfoBuildId(const char* id, const char* ctg) {
1343 //caller must free the returned pointer
1344 char* buf=NULL;
1345 int idlen=strlen(id);
1346 GMALLOC(buf, idlen+strlen(ctg)+2);
1347 strcpy(buf, id);
1348 buf[idlen]='~';
1349 strcpy(buf+idlen+1, ctg);
1350 return buf;
1351 }
1352
gfoAdd(GffObj * gfo)1353 GffObj* GffReader::gfoAdd(GffObj* gfo) {
1354 GPVec<GffObj>* glst=phash.Find(gfo->gffID);
1355 if (glst==NULL)
1356 glst=new GPVec<GffObj>(false);
1357 int i=glst->Add(gfo);
1358 phash.Add(gfo->gffID, glst);
1359 return glst->Get(i);
1360 }
1361
gfoAdd(GPVec<GffObj> & glst,GffObj * gfo)1362 GffObj* GffReader::gfoAdd(GPVec<GffObj>& glst, GffObj* gfo) {
1363 int i=glst.Add(gfo);
1364 return glst[i];
1365 }
1366
gfoReplace(GPVec<GffObj> & glst,GffObj * gfo,GffObj * toreplace)1367 GffObj* GffReader::gfoReplace(GPVec<GffObj>& glst, GffObj* gfo, GffObj* toreplace) {
1368 for (int i=0;i<glst.Count();++i) {
1369 if (glst[i]==toreplace) {
1370 //glst.Put(i,gfo);
1371 glst[i]=gfo;
1372 break;
1373 }
1374 }
1375 return gfo;
1376 }
1377
pFind(const char * id,GPVec<GffObj> * & glst)1378 bool GffReader::pFind(const char* id, GPVec<GffObj>*& glst) {
1379 glst = phash.Find(id);
1380 return (glst!=NULL);
1381 }
1382
gfoFind(const char * id,GPVec<GffObj> * & glst,const char * ctg,char strand,uint start,uint end)1383 GffObj* GffReader::gfoFind(const char* id, GPVec<GffObj>*& glst,
1384 const char* ctg, char strand, uint start, uint end) {
1385 GPVec<GffObj>* gl=NULL;
1386 if (glst) {
1387 gl=glst;
1388 } else {
1389 gl = phash.Find(id);
1390 }
1391 GffObj* gh=NULL;
1392 if (gl) {
1393 for (int i=0;i<gl->Count();i++) {
1394 GffObj& gfo = *(gl->Get(i));
1395 if (ctg!=NULL && strcmp(ctg, gfo.getGSeqName())!=0)
1396 continue;
1397 if (strand && gfo.strand!='.' && strand != gfo.strand)
1398 continue;
1399 if (start>0) {
1400 if (abs((int)start-(int)gfo.start)> (int)GFF_MAX_LOCUS)
1401 continue;
1402 if (end>0 && (gfo.start>end || gfo.end<start))
1403 continue;
1404 }
1405 //must be the same transcript, according to given comparison criteria
1406 gh=&gfo;
1407 break;
1408 }
1409 }
1410 if (!glst) glst=gl;
1411 return gh;
1412 }
1413
updateParent(GffObj * newgfo,GffObj * parent)1414 GffObj* GffReader::updateParent(GffObj* newgfo, GffObj* parent) {
1415 //assert(parent);
1416 //assert(newgfo);
1417 parent->children.Add(newgfo);
1418 if (newgfo->parent==NULL) newgfo->parent=parent;
1419 newgfo->setLevel(parent->getLevel()+1);
1420 //if (parent->isGene()) {
1421 if (parent->gene_name!=NULL && newgfo->gene_name==NULL)
1422 newgfo->gene_name=Gstrdup(parent->gene_name);
1423 if (parent->geneID!=NULL && newgfo->geneID==NULL)
1424 newgfo->geneID=Gstrdup(parent->geneID);
1425 //}
1426
1427 return newgfo;
1428 }
1429
newGffRec(GffLine * gffline,GffObj * parent,GffExon * pexon,GPVec<GffObj> * glst,bool replace_parent)1430 GffObj* GffReader::newGffRec(GffLine* gffline, GffObj* parent, GffExon* pexon, GPVec<GffObj>* glst, bool replace_parent) {
1431 GffObj* newgfo=new GffObj(*this, *gffline);
1432 GffObj* r=NULL;
1433 gflst.Add(newgfo);
1434 //tag non-transcripts to be discarded later
1435 if (this->transcripts_Only && this->is_gff3 && gffline->ID!=NULL &&
1436 gffline->exontype==exgffNone && !gffline->is_gene && !gffline->is_transcript) {
1437 //unrecognized non-exon entity, should be discarded
1438 newgfo->isDiscarded(true);
1439 this->discarded_ids.Add(gffline->ID, new int(1));
1440 }
1441 if (replace_parent && glst) {
1442 r=gfoReplace(*glst, newgfo, parent);
1443 updateParent(r, parent);
1444 } else { //regular case of new GffObj creation
1445 r=(glst) ? gfoAdd(*glst, newgfo) : gfoAdd(newgfo);
1446 if (parent!=NULL) {
1447 updateParent(r, parent);
1448 if (pexon!=NULL) parent->removeExon(pexon);
1449 }
1450 }
1451 return r;
1452 }
1453
newGffRec(BEDLine * bedline,GPVec<GffObj> * glst)1454 GffObj* GffReader::newGffRec(BEDLine* bedline, GPVec<GffObj>* glst) {
1455 GffObj* newgfo=new GffObj(*this, *bedline);
1456 GffObj* r=NULL;
1457 gflst.Add(newgfo);
1458 r=(glst) ? gfoAdd(*glst, newgfo) : gfoAdd(newgfo);
1459 return r;
1460 }
1461
updateGffRec(GffObj * prevgfo,GffLine * gffline)1462 GffObj* GffReader::updateGffRec(GffObj* prevgfo, GffLine* gffline) {
1463 if (prevgfo==NULL) return NULL;
1464 //prevgfo->gffobj->createdByExon(false);
1465 if (gffline->ftype_id>=0)
1466 prevgfo->ftype_id=gffline->ftype_id;
1467 else
1468 prevgfo->ftype_id=prevgfo->names->feats.addName(gffline->ftype);
1469 prevgfo->start=gffline->fstart;
1470 prevgfo->end=gffline->fend;
1471 prevgfo->isGene(gffline->is_gene);
1472 prevgfo->isTranscript(gffline->is_transcript || gffline->exontype!=exgffNone);
1473 prevgfo->hasGffID(gffline->ID!=NULL);
1474 if (keep_Attrs) {
1475 if (prevgfo->attrs!=NULL) prevgfo->attrs->Clear();
1476 prevgfo->parseAttrs(prevgfo->attrs, gffline->info);
1477 }
1478 return prevgfo;
1479 }
1480
1481
readExonFeature(GffObj * prevgfo,GffLine * gffline,GHash<CNonExon> * pex)1482 bool GffReader::readExonFeature(GffObj* prevgfo, GffLine* gffline, GHash<CNonExon>* pex) {
1483 //this should only be called before prevgfo->finalize()!
1484 bool r=true;
1485 if (gffline->strand!=prevgfo->strand) {
1486 if (prevgfo->strand=='.') {
1487 prevgfo->strand=gffline->strand;
1488 }
1489 else {
1490 GMessage("Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n",
1491 prevgfo->gffID, prevgfo->strand, gffline->fstart, gffline->fend, gffline->strand,
1492 prevgfo->getGSeqName());
1493 return true;
1494 }
1495 }
1496 int gdist=(gffline->fstart>prevgfo->end) ? gffline->fstart-prevgfo->end :
1497 ((gffline->fend<prevgfo->start)? prevgfo->start-gffline->fend :
1498 0 );
1499 if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID
1500 GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffID);
1501 //validation_errors = true;
1502 r=false;
1503 if (!gff_warns) exit(1);
1504 }
1505 int eidx=prevgfo->readExon(*this, *gffline);
1506 if (pex!=NULL && eidx>=0) {
1507 //if (eidx==0 && gffline->exontype>0) prevgfo->isTranscript(true);
1508 if (gffline->ID!=NULL && gffline->exontype==exgffNone)
1509 subfPoolAdd(*pex, prevgfo);
1510 }
1511 return r;
1512 }
1513
subfPoolCheck(GffLine * gffline,GHash<CNonExon> & pex,char * & subp_name)1514 CNonExon* GffReader::subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name) {
1515 CNonExon* subp=NULL;
1516 subp_name=NULL;
1517 for (int i=0;i<gffline->num_parents;i++) {
1518 if (transcripts_Only && discarded_ids.Find(gffline->parents[i])!=NULL)
1519 continue;
1520 subp_name=gfoBuildId(gffline->parents[i], gffline->gseqname); //e.g. mRNA name
1521 subp=pex.Find(subp_name);
1522 if (subp!=NULL)
1523 return subp;
1524 GFREE(subp_name);
1525 }
1526 return NULL;
1527 }
1528
subfPoolAdd(GHash<CNonExon> & pex,GffObj * newgfo)1529 void GffReader::subfPoolAdd(GHash<CNonExon>& pex, GffObj* newgfo) {
1530 //this might become a parent feature later
1531 if (newgfo->exons.Count()>0) {
1532 char* xbuf=gfoBuildId(gffline->ID, gffline->gseqname);
1533 pex.Add(xbuf, new CNonExon(newgfo, newgfo->exons[0], *gffline));
1534 GFREE(xbuf);
1535 }
1536 }
1537
promoteFeature(CNonExon * subp,char * & subp_name,GHash<CNonExon> & pex)1538 GffObj* GffReader::promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex) {
1539 GffObj* prevp=subp->parent; //grandparent of gffline (e.g. gene)
1540 //if (prevp!=gflst[subp->idx])
1541 // GError("Error promoting subfeature %s, gflst index mismatch?!\n", subp->gffline->ID);
1542 subp->gffline->discardParent();
1543 GffObj* gfoh=newGffRec(subp->gffline, prevp, subp->exon);
1544 pex.Remove(subp_name); //no longer a potential parent, moved it to phash already
1545 prevp->promotedChildren(true);
1546 return gfoh; //returns the holder of newly promoted feature
1547 }
1548
1549 //In the rare cases where the GFF/GTF stream is properly formatted
1550 // i.e. when all sub-features are grouped with (and preceded by) their parent!
readNext()1551 GffObj* GffReader::readNext() { //user must free the returned GffObj*
1552 GffObj* gfo=NULL;
1553 //GSeg tseg(0,0); //transcript boundaries
1554 char* lastID=NULL;
1555 if (is_BED) {
1556 if (nextBEDLine()) {
1557 gfo=new GffObj(*this, *bedline);
1558 //tseg.start=gfo->start;
1559 //tseg.end=gfo->end;
1560 delete bedline;
1561 bedline=NULL;
1562 } //else return NULL;
1563 } else { //GFF parsing
1564 while (nextGffLine()!=NULL) {
1565 char* tid=gffline->ID;
1566 if (gffline->is_exon) tid=gffline->parents[0];
1567 else //not an exon
1568 if (!(gffline->is_transcript || gffline->is_gene))
1569 tid=NULL; //WARNING: only parsing transcript && gene records here
1570 //if (tid==NULL || gffline->num_parents>1) {
1571 if (tid==NULL) { //not a suitable transcript ID found, skip this line
1572 delete gffline;
1573 gffline=NULL;
1574 continue;
1575 }
1576 bool sameID=(lastID!=NULL && strcmp(lastID, tid)==0);
1577 if (sameID) {
1578 if (gfo==NULL) GError("Error: same transcript ID but GffObj not initialized?!(%s)\n", tid);
1579 //TODO: if gffline->is_transcript: trans-splicing!
1580 if (!gffline->is_exon) {
1581 GMessage("Warning: skipping unexpected non-exon record with previously seen ID:\n%s\n", gffline->dupline);
1582 delete gffline;
1583 gffline=NULL;
1584 continue;
1585 }
1586 readExonFeature(gfo, gffline); //also takes care of adding CDS segments
1587 } else { //new transcript
1588 if (gfo==NULL) {
1589 //start gathering this transcript's data now
1590 gfo=new GffObj(*this, *gffline);
1591 //GFREE(lastID);
1592 lastID=Gstrdup(tid);
1593 /*if (gffline->is_transcript) {
1594 tseg.start=gffline->fstart;
1595 tseg.end=gffline->fend;
1596 }*/
1597 }
1598 else {
1599 //this gffline is for the next transcript!
1600 //return what we've got so far
1601 //return gfo;
1602 break;
1603 }
1604 } //transcript ID change
1605 //gffline processed, move on
1606 delete gffline;
1607 gffline=NULL;
1608 } //while nextgffline()
1609 } //GFF records
1610 GFREE(lastID);
1611 //gfo populated with all its sub-features (or eof reached)
1612 if (gfo!=NULL) {
1613 gfo->finalize(this);
1614 }
1615 return gfo;
1616 }
1617
1618 //Usually we have to parse the whole file because exons and other subfeatures can be scattered, unordered in the input
1619 // (thanks to the annoyingly loose GFF3 specs)
1620 //Trans-splicing and fusions shall only be accepted in proper GFF3 format, i.e. multiple transcript features
1621 //with the same ID but NOT overlapping/continuous
1622 // *** BUT (exception): proximal xRNA features with the same ID, on the same strand, will be merged
1623 // and the segments will be treated like exons (e.g. TRNAR15 (rna1940) in RefSeq)
readAll()1624 void GffReader::readAll() {
1625 bool validation_errors = false;
1626 if (is_BED) {
1627 while (nextBEDLine()) {
1628 GPVec<GffObj>* prevgflst=NULL;
1629 GffObj* prevseen=gfoFind(bedline->ID, prevgflst, bedline->gseqname, bedline->strand, bedline->fstart);
1630 if (prevseen) {
1631 //duplicate ID -- but this could also be a discontinuous feature according to GFF3 specs
1632 //e.g. a trans-spliced transcript - but segments should not overlap
1633 if (prevseen->overlap(bedline->fstart, bedline->fend)) {
1634 //overlapping feature with same ID is going too far
1635 GMessage("Error: overlapping duplicate BED feature (ID=%s)\n", bedline->ID);
1636 //validation_errors = true;
1637 if (gff_warns) { //validation intent: just skip the feature, allow the user to see other errors
1638 delete bedline;
1639 bedline=NULL;
1640 continue;
1641 }
1642 else exit(1);
1643 }
1644 //create a separate entry (true discontinuous feature?)
1645 prevseen=newGffRec(bedline, prevgflst);
1646 if (gff_warns) {
1647 GMessage("Warning: duplicate BED feature ID %s (%d-%d) (discontinuous feature?)\n",
1648 bedline->ID, bedline->fstart, bedline->fend);
1649 }
1650 }
1651 else {
1652 newGffRec(bedline, prevgflst);
1653 }
1654 delete bedline;
1655 bedline=NULL;
1656 }
1657 }
1658 else { //regular GFF/GTF or perhaps TLF?
1659 //loc_debug=false;
1660 GHash<CNonExon> pex; //keep track of any parented (i.e. exon-like) features that have an ID
1661 //and thus could become promoted to parent features
1662 while (nextGffLine()!=NULL) {
1663 GffObj* prevseen=NULL;
1664 GPVec<GffObj>* prevgflst=NULL;
1665 if (gffline->ID && gffline->exontype==exgffNone) {
1666 //parent-like feature ID (mRNA, gene, etc.) not recognized as an exon feature
1667 //check if this ID was previously seen on the same chromosome/strand within GFF_MAX_LOCUS distance
1668 prevseen=gfoFind(gffline->ID, prevgflst, gffline->gseqname, gffline->strand, gffline->fstart);
1669 if (prevseen) {
1670 //same ID seen in the same locus/region
1671 if (prevseen->createdByExon()) {
1672 if (gff_warns && (prevseen->start<gffline->fstart ||
1673 prevseen->end>gffline->fend))
1674 GMessage("Warning: invalid coordinates for %s parent feature (ID=%s)\n", gffline->ftype, gffline->ID);
1675 //an exon of this ID was given before
1676 //this line has the main attributes for this ID
1677 updateGffRec(prevseen, gffline);
1678 }
1679 else { //possibly a duplicate ID -- but this could also be a discontinuous feature according to GFF3 specs
1680 //e.g. a trans-spliced transcript - though segments should not overlap!
1681 bool gtf_gene_dupID=(prevseen->isGene() && gffline->is_gtf_transcript);
1682 if (prevseen->overlap(gffline->fstart, gffline->fend) && !gtf_gene_dupID) {
1683 //in some GTFs a gene ID may actually be the same with the parented transcript ID (thanks)
1684 //overlapping feature with same ID is going too far
1685 GMessage("Error: discarding overlapping duplicate %s feature (%d-%d) with ID=%s\n", gffline->ftype,
1686 gffline->fstart, gffline->fend, gffline->ID);
1687 //validation_errors = true;
1688 if (gff_warns) { //validation intent: just skip the feature, allow the user to see other errors
1689 delete gffline;
1690 gffline=NULL;
1691 continue;
1692 }
1693 //else exit(1);
1694 }
1695 if (gtf_gene_dupID) {
1696 //special GTF case where parent gene_id matches transcript_id (sigh)
1697 prevseen=newGffRec(gffline, prevseen, NULL, prevgflst, true);
1698 }
1699 else {
1700 //create a separate entry (true discontinuous feature)
1701 prevseen=newGffRec(gffline, prevseen->parent, NULL, prevgflst);
1702 if (gff_warns) {
1703 GMessage("Warning: duplicate feature ID %s (%d-%d) (discontinuous feature?)\n",
1704 gffline->ID, gffline->fstart, gffline->fend);
1705 }
1706 }
1707 } //duplicate ID in the same locus
1708 } //ID seen previously in the same locus
1709 } //parent-like ID feature (non-exon)
1710 if (gffline->parents==NULL) {
1711 //top level feature (transcript, gene), no parents (or parents can be ignored)
1712 if (!prevseen) newGffRec(gffline, NULL, NULL, prevgflst);
1713 }
1714 else { //--- it's a child feature (exon/CDS or even a mRNA with a gene as parent)
1715 //updates all the declared parents with this child
1716 bool found_parent=false;
1717 if (gffline->is_gtf_transcript && prevseen && prevseen->parent) {
1718 found_parent=true; //parent already found in special GTF case
1719 }
1720 else {
1721 GffObj* newgfo=prevseen;
1722 GPVec<GffObj>* newgflst=NULL;
1723 GVec<int> kparents; //kept parents (non-discarded)
1724 GVec< GPVec<GffObj>* > kgflst(false);
1725 GPVec<GffObj>* gflst0=NULL;
1726 for (int i=0;i<gffline->num_parents;i++) {
1727 newgflst=NULL;
1728 //if (transcriptsOnly && (
1729 if (discarded_ids.Find(gffline->parents[i])!=NULL) continue;
1730 if (!pFind(gffline->parents[i], newgflst))
1731 continue; //skipping discarded parent feature
1732 kparents.Add(i);
1733 if (i==0) gflst0=newgflst;
1734 kgflst.Add(newgflst);
1735 }
1736 if (gffline->num_parents>0 && kparents.Count()==0) {
1737 kparents.cAdd(0);
1738 kgflst.Add(gflst0);
1739 }
1740 for (int k=0;k<kparents.Count();k++) {
1741 int i=kparents[k];
1742 newgflst=kgflst[k];
1743 GffObj* parentgfo=NULL;
1744 if (gffline->is_transcript || gffline->exontype==exgffNone) {//likely a transcript
1745 //parentgfo=gfoFind(gffline->parents[i], newgflst, gffline->gseqname,
1746 // gffline->strand, gffline->fstart, gffline->fend);
1747 if (newgflst!=NULL && newgflst->Count()>0)
1748 parentgfo = newgflst->Get(0);
1749 }
1750 else {
1751 //for exon-like entities we only need a parent to be in locus distance,
1752 //on the same strand
1753 parentgfo=gfoFind(gffline->parents[i], newgflst, gffline->gseqname,
1754 gffline->strand, gffline->fstart);
1755 }
1756 if (parentgfo!=NULL) { //parent GffObj parsed earlier
1757 found_parent=true;
1758 if ((parentgfo->isGene() || parentgfo->isTranscript()) && (gffline->is_transcript ||
1759 gffline->exontype==exgffNone)) {
1760 //not an exon, but could be a transcript parented by a gene
1761 // *or* by another transcript (! miRNA -> primary_transcript)
1762 if (newgfo) {
1763 updateParent(newgfo, parentgfo);
1764 }
1765 else {
1766 newgfo=newGffRec(gffline, parentgfo);
1767 }
1768 }
1769 else { //potential exon subfeature?
1770 bool addingExon=false;
1771 if (transcripts_Only) {
1772 if (gffline->exontype>0) addingExon=true;
1773 }
1774 else { //always discard silly "intron" features
1775 if (! (gffline->exontype==exgffIntron && (parentgfo->isTranscript() || parentgfo->exons.Count()>0)))
1776 addingExon=true;
1777 }
1778 if (addingExon)
1779 if (!readExonFeature(parentgfo, gffline, &pex))
1780 validation_errors=true;
1781
1782 }
1783 } //overlapping parent feature found
1784 } //for each parsed parent Id
1785 if (!found_parent) { //new GTF-like record starting directly here as a subfeature
1786 //or it could be some chado GFF3 barf with exons coming BEFORE their parent :(
1787 //or it could also be a stray transcript without a parent gene defined previously
1788 //check if this feature isn't parented by a previously stored "child" subfeature
1789 char* subp_name=NULL;
1790 CNonExon* subp=NULL;
1791 if (!gffline->is_transcript) { //don't bother with this check for obvious transcripts
1792 if (pex.Count()>0) subp=subfPoolCheck(gffline, pex, subp_name);
1793 if (subp!=NULL) { //found a subfeature that is the parent of this (!)
1794 //promote that subfeature to a full GffObj
1795 GffObj* gfoh=promoteFeature(subp, subp_name, pex);
1796 //add current gffline as an exon of the newly promoted subfeature
1797 if (!readExonFeature(gfoh, gffline, &pex))
1798 validation_errors=true;
1799 }
1800 }
1801 if (subp==NULL) { //no parent subfeature seen before
1802 //loc_debug=true;
1803 GffObj* ngfo=prevseen;
1804 if (ngfo==NULL) {
1805 //if it's an exon type, create directly the parent with this exon
1806 //but if it's recognized as a transcript, the object itself is created
1807 ngfo=newGffRec(gffline, NULL, NULL, newgflst);
1808 }
1809 if (!ngfo->isTranscript() &&
1810 gffline->ID!=NULL && gffline->exontype==0)
1811 subfPoolAdd(pex, ngfo);
1812 //even those with errors will be added here!
1813 }
1814 GFREE(subp_name);
1815 } //no previous parent found
1816 }
1817 } //parented feature
1818 //--
1819 delete gffline;
1820 gffline=NULL;
1821 }//while gff lines
1822 }
1823 if (gflst.Count()>0) {
1824 gflst.finalize(this); //force sorting by locus if so constructed
1825 }
1826 // all gff records are now loaded in GList gflst
1827 // so we can free the hash
1828 phash.Clear();
1829 //tids.Clear();
1830 if (validation_errors) {
1831 exit(1);
1832 }
1833 }
1834
finalize(GffReader * gfr)1835 void GfList::finalize(GffReader* gfr) { //if set, enforce sort by locus
1836 GList<GffObj> discarded(false,true,false);
1837 for (int i=0;i<Count();i++) {
1838 //finalize the parsing of each GffObj
1839 fList[i]->finalize(gfr);
1840 if (fList[i]->isDiscarded()) {
1841 discarded.Add(fList[i]);
1842 //inform parent that thiis child is removed
1843 if (fList[i]->parent!=NULL) {
1844 GPVec<GffObj>& pchildren=fList[i]->parent->children;
1845 for (int c=0;c<pchildren.Count();c++) {
1846 if (pchildren[c]==fList[i]) {
1847 pchildren.Delete(c);
1848 break;
1849 }
1850 }
1851 }
1852 if (fList[i]->children.Count()>0) { //inform children that the parent was removed
1853 for (int c=0;c<fList[i]->children.Count();c++) {
1854 fList[i]->children[c]->parent=NULL;
1855 if (gfr->keep_Attrs)
1856 //inherit the attributes of discarded parent (e.g. pseudo=true; )
1857 fList[i]->children[c]->copyAttrs(fList[i]);
1858 }
1859 }
1860 this->Forget(i);
1861 }
1862 }
1863 if (discarded.Count()>0) {
1864 this->Pack();
1865 }
1866 if (gfr->sortByLoc) {
1867 this->setSorted(false);
1868 if (gfr->refAlphaSort)
1869 this->setSorted((GCompareProc*)gfo_cmpByLoc);
1870 else
1871 this->setSorted((GCompareProc*)gfo_cmpRefByID);
1872 }
1873 }
1874
reduceExonAttrs(GList<GffExon> & segs)1875 bool GffObj::reduceExonAttrs(GList<GffExon>& segs) {
1876 bool attrs_discarded=false;
1877 for (int a=0;a<segs[0]->attrs->Count();a++) {
1878 int attr_id=segs[0]->attrs->Get(a)->attr_id;
1879 char* attr_name=names->attrs.getName(attr_id);
1880 char* attr_val =segs[0]->attrs->Get(a)->attr_val;
1881 bool sameExonAttr=true;
1882 bool discardAll=(GstrEq("exon_id", attr_name) || GstrEq("exon_number", attr_name));
1883 if (!discardAll)
1884 for (int i=1;i<segs.Count();i++) {
1885 char* ov=segs[i]->getAttr(attr_id);
1886 if (ov==NULL || (strcmp(ov,attr_val)!=0)) {
1887 sameExonAttr=false;
1888 break;
1889 }
1890 }
1891 if (sameExonAttr) {
1892 //delete this attribute from exon level
1893 attrs_discarded=true;
1894 if (!discardAll) {
1895 //add the attribute to transcript level
1896 //rename it if it exists and is different for the transcript!
1897 char* t_val=NULL;
1898 bool same_aval=false;
1899 if (this->attrs!=NULL &&
1900 (t_val=this->attrs->getAttr(attr_id))!=NULL) {
1901 //same attribute name already exists for the transcript!
1902 //write it using CDS_ or exon_ prefix
1903 same_aval=(strcmp(attr_val, t_val)==0);
1904 if (!same_aval) {
1905 //add renamed attribute
1906 const char* prefix = (&segs==cdss) ? "CDS_" : "exon_";
1907 char* new_attr_name=NULL;
1908 GMALLOC(new_attr_name, strlen(prefix)+strlen(attr_name)+1);
1909 new_attr_name[0]=0;
1910 strcat(new_attr_name, prefix);
1911 strcat(new_attr_name, attr_name);
1912 this->attrs->add_or_update(names, new_attr_name, attr_val);
1913 GFREE(new_attr_name);
1914 }
1915 }
1916 else { //no such attribute exists for the transcript, copy it from the exon
1917 this->addAttr(attr_name, attr_val);
1918 }
1919 }
1920 for (int i=1;i<segs.Count();i++) {
1921 removeExonAttr(*(segs[i]), attr_id);
1922 }
1923 segs[0]->attrs->freeItem(a);
1924 } //sameExonAttr
1925 }
1926 if (attrs_discarded) segs[0]->attrs->Pack();
1927 return attrs_discarded;
1928 }
1929 //return the segs index of segment containing coord:
whichExon(uint coord,GList<GffExon> * segs)1930 int GffObj::whichExon(uint coord, GList<GffExon>* segs) {
1931 //segs MUST be sorted by GSeg order (start coordinate)
1932 if (segs==NULL) segs=&exons;
1933 if (segs->Count()==0) return -1;
1934 if (coord<segs->First()->start || coord>segs->Last()->end)
1935 return -1;
1936 if (segs->Count()<6) {
1937 //simple scan
1938 for (int i=0;i<segs->Count();i++)
1939 if ((*segs)[i]->overlap(coord))
1940 return i;
1941 return -1;
1942 }
1943 else { //use quick search
1944 int i=0;
1945 int l=0; //lower boundary
1946 int h=segs->Count()-1; //higher boundary
1947 while (l<=h) {
1948 i = (l+h) >> 1; //range midpoint
1949 if (coord > segs->Get(i)->end)
1950 l=i+1;
1951 else { //coord <= segs->Get(i)->end
1952 if (coord >= segs->Get(i)->start) {
1953 return i;
1954 }
1955 //here: coord < segs->Get(i)->start
1956 h = i-1;
1957 }
1958 }
1959 }
1960 return -1;
1961 }
1962
processGeneSegments(GffReader * gfr)1963 bool GffObj::processGeneSegments(GffReader* gfr) {
1964 /* procedure:
1965 1)store the info about any X_gene_segment entries in a GVec<int>
1966 (just storing their index in gene->children[] list)
1967 2)for each CDS, group them by ID in GHash<GeneCDSChain> (and a GPVec<GeneCDSChain> for storage)
1968 3)for each GeneCDSChain, collect _gene_segments having a containment-relationship and rank them by lowest noncov
1969 4)for each GeneCDSChain, pick best _gene_segment match (if any) and transfer CDSs to it
1970 */
1971 GVec<int> geneSegs; //X_gene_segment features (children transcripts of this gene)
1972 GHash<GeneCDSChain> cdsChainById(false); // hash of CDS chains: CDS feature grouped by ID
1973 GPVec<GeneCDSChain> cdsChains; // CDS chains storage
1974 if (cdss==NULL || cdss->Count()==0 || children.Count()==0)
1975 return false; //we shouldn't be here
1976 //check if we have any _gene_segment children for this gene
1977 for (int i=0;i<children.Count();i++)
1978 if (children[i]->flag_GENE_SEGMENT) {
1979 if (children[i]->hasCDS() || children[i]->cdss!=NULL) {
1980 GMessage("Warning: will not transfer CDS from %s to gene_segment %s which already has its own\n",
1981 gffID, children[i]->gffID);
1982 continue;
1983 }
1984 geneSegs.Add(i);
1985 }
1986 if (geneSegs.Count()==0) {
1987 if (gfr->gff_warns)
1988 GMessage("Warning: gene %s has CDS and transcripts but no suitable _gene_segment features\n",gffID);
1989 return false; //nothing to do
1990 }
1991 //group CDSs into CDS chains by their ID:
1992 for (int i=0;i<cdss->Count();i++) {
1993 char* id=(char*)(cdss->Get(i)->uptr);
1994 if (id==NULL) continue; //should never happen
1995 GeneCDSChain *gcc=cdsChainById.Find(id);
1996 if (gcc!=NULL)
1997 gcc->addCDS(i, cdss->Get(i)->start, cdss->Get(i)->end);
1998 else { //new CDS chain:
1999 gcc=new GeneCDSChain(i, cdss->Get(i)->start, cdss->Get(i)->end);
2000 cdsChains.Add(gcc);
2001 cdsChainById.shkAdd(id, gcc);
2002 }
2003 }
2004 for (int i=0;i<cdss->Count();i++) {
2005 GFREE(cdss->Get(i)->uptr); //no CDS ID no longer needed
2006 }
2007
2008 //collect _gene_segment containers for each CDS chain
2009 int cds_moved=0;
2010 for (int i=0;i<cdsChains.Count();i++) {
2011 GeneCDSChain &gc=*(cdsChains[i]);
2012 for (int si=0;si<geneSegs.Count();si++) {
2013 GffObj& t=*(children[geneSegs[si]]);
2014 //if (t.hasCDS() || t.cdss!=NULL) continue; //already transferred
2015 int novl=-1;
2016 if (gc.containedBy(t, novl))
2017 gc.addMatch(geneSegs[si], novl, si);
2018 }
2019 //-- assess the collected matches for this CDS chain:
2020 if (gc.mxs.Count()==0) {
2021 GMessage("Warning: could not find the corresponding gene segment for CDS chain %d-%d of gene %s\n",
2022 gc.start, gc.end, gffID);
2023 continue;
2024 }
2025 GffObj* t=children[gc.mxs.First().child_idx];
2026 for (int c=0;c<gc.cdsList.Count();c++) {
2027 t->transferCDS(cdss->Get(gc.cdsList[c].idx));
2028 cdss->Forget(gc.cdsList[c].idx);
2029 cds_moved++;
2030 }
2031 // also remove it from the list of gene_segments to be mapped
2032 geneSegs.Delete(gc.mxs.First().gsegidx); //assigned, should no longer be checked against other CDS chains
2033 if (t->isFinalized()) t->finalize(gfr);
2034
2035 }
2036 if (cds_moved>0) cdss->Pack();
2037 if (cdss->Count()==0) {
2038 delete cdss;
2039 cdss=NULL;
2040 if (exons.Count()==0) isTranscript(false);
2041 }
2042 return true;
2043 }
2044
finalize(GffReader * gfr)2045 GffObj* GffObj::finalize(GffReader* gfr) {
2046 if (this->createdByExon() && this->end-this->start<10 && this->exons.Count()<=1) {
2047 //? misleading exon-like feature parented by an exon or CDS mistakenly
2048 // interpreted as a standalone transcript
2049 // example: GENCODE gff3 feature "stop_codon_redefined_as_selenocysteine" which is
2050 // parented by a CDS !
2051 if (cdss==NULL || cdss->Count()<=1) {
2052 if (gfr->showWarnings()) {
2053 GMessage("Warning: discarding suspicious '%s' record (ID=%s)\n",this->getFeatureName(),gffID);
2054 }
2055 isDiscarded(true);
2056 }
2057 }
2058 if (!isDiscarded()) {
2059 bool noExons=(exons.Count()==0 && (cdss==NULL || cdss->Count()==0));
2060 if (noExons) {
2061 if (isTranscript() || (isGene() && children.Count()==0 && gfr->gene2exon)) {
2062 //add exon feature to an exonless transcript/gene
2063 addExon(this->start, this->end, exgffExon);
2064 //effectively this becomes a transcript (even childless genes if gene2exon)
2065 isTranscript(true);
2066 }
2067 }
2068 else { //it has exons or CDSs
2069 if (cdss!=NULL && isGene() && children.Count()>0) {
2070 //check for X_gene_segment processing
2071 processGeneSegments(gfr);//distribute the cdss to children _gene_segments
2072 } // _gene_segment processing
2073 }
2074 }
2075 if (cdss!=NULL && isGene()) //in case we stored IDs for gene_segment features
2076 for (int i=0;i<cdss->Count();i++) {
2077 GFREE(cdss->Get(i)->uptr);
2078 }
2079 if (gfr->transcripts_Only && !isTranscript() &&
2080 !(gfr->keep_Genes && isGene())) {
2081 //discard non-transcripts, unless it's a gene and keepGenes was specified
2082 isDiscarded(true);
2083 }
2084 isFinalized(true);
2085
2086 if (isDiscarded()) {
2087 //just in case we have cds with uptr in use (X_gene_segment), free them
2088 uptr=NULL;
2089 udata=0;
2090 return this;
2091 }
2092 if (isTranscript()) {
2093 isCDSOnly(cdss!=NULL && exons.Count()==0 && cdss->Count()>0);
2094 subftype_id=isCDSOnly() ? gff_fid_CDS : gff_fid_exon;
2095 }
2096 if (cdss!=NULL && cdss->Count()>0) {
2097 CDstart=cdss->First()->start;
2098 CDend=cdss->Last()->end;
2099 CDphase=(strand=='-')? cdss->Last()->phase : cdss->First()->phase;
2100 bool updatePhase=(CDphase=='.' || CDphase==0);
2101 if (!updatePhase)
2102 for (int i=0;i<cdss->Count();++i)
2103 if ((*cdss)[i]->phase<'0') {
2104 updatePhase=true;
2105 break;
2106 }
2107 if (updatePhase) updateCDSPhase(*cdss);
2108 //there are GFFs out there which only provide UTR and CDS records instead of full exons
2109 //so make sure we add all CDS segments to exons, if they are not already there
2110 for (int i=0;i<cdss->Count();++i) {
2111 int eidx=addExon((*cdss)[i]->start, (*cdss)[i]->end, exgffExon, 0, (*cdss)[i]->score);
2112 if (eidx<0) GError("Error: could not reconcile CDS %d-%d with exons of transcript %s\n",
2113 (*cdss)[i]->start, (*cdss)[i]->end, gffID);
2114 }
2115 }
2116 else if (CDstart==0) {//no CDS, no phase
2117 CDphase=0;
2118 CDend=0;
2119 }
2120 //-- attribute reduction for some records which
2121 // repeat the exact same attr=value for every exon
2122 bool reduceAttributes=(gfr->keep_Attrs && !gfr->noExonAttrs &&
2123 !gfr->keep_AllExonAttrs && exons.Count()>0 && exons[0]->attrs!=NULL);
2124 if (reduceAttributes) {
2125 //for each attribute of the 1st exon, if it has the
2126 //same value for all other exons, move it to transcript level
2127 //bool reduced=reduceExonAttrs(exons);
2128 reduceExonAttrs(exons);
2129 //if (gfr->showWarnings() && reduced)
2130 // GMessage("Info: duplicate exon attributes reduced for %s\n", gffID);
2131 //do the same for CDS segments, if any
2132 if (cdss!=NULL && cdss->Count()>0 && (*cdss)[0]->attrs!=NULL) {
2133 //reduced=
2134 reduceExonAttrs(*cdss);
2135 //if (gfr->showWarnings() && reduced)
2136 // GMessage("Info: duplicate CDS attributes reduced for %s\n", gffID);
2137 }
2138 }
2139 //merge close exons if requested
2140 if (exons.Count()>0 && isTranscript()) {
2141 if (gfr->merge_CloseExons) {
2142 for (int i=0;i<exons.Count()-1;i++) {
2143 int ni=i+1;
2144 uint mend=exons[i]->end;
2145 while (ni<exons.Count()) {
2146 int dist=(int)(exons[ni]->start-mend-1); //<0 = overlap, 0 = adjacent, >0 = bases apart
2147 if (dist>GFF_MIN_INTRON) break; //no merging with next segment
2148 if (gfr!=NULL && gfr->gff_warns && dist!=0 && (exons[ni]->exontype!=exgffUTR && exons[i]->exontype!=exgffUTR)) {
2149 GMessage("Warning: merging adjacent/overlapping segments (distance=%d) of %s on %s (%d-%d, %d-%d)\n",
2150 dist, gffID, getGSeqName(), exons[i]->start, exons[i]->end,exons[ni]->start, exons[ni]->end);
2151 }
2152 mend=exons[ni]->end;
2153 exons[i]->end=mend;
2154 if (exons[ni]->attrs!=NULL && (exons[i]->attrs==NULL ||
2155 exons[i]->attrs->Count()<exons[ni]->attrs->Count())) {
2156 //use the other exon attributes, if it has more
2157 delete(exons[i]->attrs);
2158 exons[i]->attrs=exons[ni]->attrs;
2159 exons[ni]->attrs=NULL;
2160 }
2161 exons.Delete(ni);
2162 } //check for merge with next exon
2163 } //for each exon
2164 } //merge close exons
2165 if (isCDSOnly() && exons.Count()!=cdss->Count())
2166 isCDSOnly(false);
2167 }
2168 //-- check features vs their exons' span
2169 if (isTranscript()) {
2170 if (exons.Count()>0) {
2171 if (gfr->gff_warns && (this->start!=exons.First()->start ||
2172 this->end!=exons.Last()->end) )
2173 GMessage("Warning: adjusted transcript %s boundaries according to terminal exons.\n",
2174 gffID);
2175 this->start=exons.First()->start;
2176 this->end=exons.Last()->end;
2177 }
2178 }
2179 else { //non-transcripts just have to be at least as wide as their sub-features
2180 if (exons.Count()>0) {
2181 bool adj=false;
2182 if (this->start>exons.First()->start) {
2183 this->start=exons.First()->start;
2184 adj=true;
2185 }
2186 if (this->end<exons.Last()->end) {
2187 this->end=exons.First()->end;
2188 adj=true;
2189 }
2190 if (gfr->gff_warns && adj)
2191 GMessage("Warning: adjusted %s %s boundaries according to terminal sub-features.\n",
2192 this->getFeatureName(), gffID);
2193 }
2194 }
2195 //-- update covlen
2196 covlen=0;
2197 for (int i=0;i<exons.Count();++i) covlen+=exons[i]->len();
2198 //-- check if CDS segments are different from exons and thus worth keeping separately in cdss
2199 if (cdss!=NULL && cdss->Count()>0) {
2200 bool cds_exComp=true; //CDSs are exon-compatible (no need to keep them separately)
2201 if (cdss->Count()==1) {
2202 //check that the CDS segment is within a single exon
2203 int start_eidx=-1;
2204 int end_eidx=-1;
2205 for (int i=0;i<exons.Count();i++) {
2206 //GMessage("[DBG:] checking if CDS %d-%d is within exon %d-%d\n", CDstart, CDend, exons[i]->start,
2207 // exons[i]->end);
2208 if (CDstart>=exons[i]->start && CDstart<=exons[i]->end) {
2209 start_eidx=i;
2210 }
2211 if (CDend>=exons[i]->start || CDend<=exons[i]->end ) {
2212 end_eidx=i;
2213 }
2214 if (start_eidx>=0 && end_eidx>=0) break;
2215 }
2216 cds_exComp=(start_eidx==end_eidx && start_eidx>=0);
2217 if (!cds_exComp) GMessage("Warning: transcript %s has incorrect CDS segment definition (%d-%d)!\n",
2218 gffID, CDstart, CDend);
2219 cds_exComp=true; //just to free cdss, even though it's wrong
2220 } else {
2221 if (cdss->Count()>exons.Count()) {
2222 cds_exComp=false;
2223 } else { //2 or more CDS segments
2224 //CDSs should be intron compatible with exons, and CDS ends should be within exons
2225 int imax=exons.Count()-1;
2226 int jmax=cdss->Count()-1;
2227 int i=0;
2228 int j=0;
2229 //find which exon has CDstart
2230 for (i=0;i<=imax;++i)
2231 if (CDstart>=exons[i]->start
2232 && CDstart<=exons[i]->end) break;
2233 if (i>imax) cds_exComp=false;
2234 else { //check the introns now
2235 while (i<imax && j<jmax) {
2236 if (exons[i]->end!=(*cdss)[j]->end ||
2237 exons[i+1]->start!=(*cdss)[j+1]->start) {
2238 cds_exComp=false;
2239 break;
2240 }
2241 ++i;
2242 ++j;
2243 }
2244 //now j must be the last segment of cdss and CDend must be within exon[i]
2245 if (cds_exComp)
2246 if (j!=jmax || CDend>exons[i]->end || CDend<exons[i]->start)
2247 cds_exComp=false;
2248 }
2249 }
2250 } //multiple CDS segments
2251 if (cds_exComp) {
2252 if (isCDSOnly() && cdss->Count()==exons.Count())
2253 for (int i=0;i<cdss->Count();i++)
2254 exons[i]->phase=cdss->Get(i)->phase;
2255 if (gfr->keep_Attrs && !gfr->noExonAttrs) {
2256 int eidx=whichExon((*cdss)[0]->start, &exons);
2257 if (eidx<0)
2258 GError("Error finding CDS coordinate inside exons (?) for %s\n",
2259 gffID);
2260 for (int i=0;i<cdss->Count();i++) {
2261 if (isCDSOnly()) //eidx should be the same with i
2262 exons[eidx]->phase=cdss->Get(i)->phase;
2263 if ((*cdss)[i]->attrs!=NULL && (*cdss)[i]->attrs->Count()>0) {
2264 if (exons[eidx]->attrs==NULL)
2265 exons[eidx]->attrs=new GffAttrs();
2266 exons[eidx]->attrs->copyAttrs((*cdss)[i]->attrs, true);
2267 if (exons[eidx]->attrs->Count()==0) {
2268 delete exons[eidx]->attrs;
2269 exons[eidx]->attrs=NULL;
2270 }
2271 }
2272 ++eidx;
2273 }
2274 }
2275 delete cdss;
2276 cdss=NULL;
2277 //this->isXCDS(false);
2278 } else this->isXCDS(true);
2279 }//cdss check
2280
2281 //--- collect stats for the reference genomic sequence
2282 if (gfr->gseqtable.Count()<=gseq_id) {
2283 gfr->gseqtable.setCount(gseq_id+1);
2284 }
2285 GSeqStat* gsd=gfr->gseqtable[gseq_id];
2286 if (gsd==NULL) {
2287 gsd=new GSeqStat(gseq_id,names->gseqs.getName(gseq_id));
2288 //gfr->gseqtable.Put(gseq_id, gsd);
2289 gfr->gseqtable[gseq_id]=gsd;
2290 gfr->gseqStats.Add(gsd);
2291 }
2292 gsd->fcount++;
2293 if (start<gsd->mincoord) gsd->mincoord=start;
2294 if (end>gsd->maxcoord) gsd->maxcoord=end;
2295 if (this->len()>gsd->maxfeat_len) {
2296 gsd->maxfeat_len=this->len();
2297 gsd->maxfeat=this;
2298 }
2299 uptr=NULL;
2300 udata=0;
2301 return this;
2302 }
2303
printExonList(FILE * fout)2304 void GffObj::printExonList(FILE* fout) {
2305 //print comma delimited list of exon intervals
2306 for (int i=0;i<exons.Count();++i) {
2307 if (i>0) fprintf(fout, ",");
2308 fprintf(fout, "%d-%d",exons[i]->start, exons[i]->end);
2309 }
2310 }
2311
printCDSList(FILE * fout)2312 void GffObj::printCDSList(FILE* fout) {
2313 //print comma delimited list of CDS intervals
2314 if (!hasCDS()) return;
2315 GVec<GffExon> cds;
2316 this->getCDSegs(cds); //also uses/prepares the CDS phase for each CDS segment
2317 for (int i=0;i<cds.Count();i++) {
2318 if (i>0) fprintf(fout, ",");
2319 fprintf(fout, "%d-%d", cds[i].start, cds[i].end);
2320 }
2321 }
2322
BED_addAttribute(FILE * fout,int & acc,const char * format,...)2323 void BED_addAttribute(FILE* fout, int& acc, const char* format,... ) {
2324 ++acc;
2325 if (acc==1) fprintf(fout, "\t");
2326 else fprintf(fout, ";");
2327 va_list arguments;
2328 va_start(arguments,format);
2329 vfprintf(fout,format,arguments);
2330 va_end(arguments);
2331 }
2332
printBED(FILE * fout,bool cvtChars,char * dbuf,int dbuf_len)2333 void GffObj::printBED(FILE* fout, bool cvtChars, char* dbuf, int dbuf_len) {
2334 //print a BED-12 line + GFF3 attributes in 13th field
2335 int cd_start=CDstart>0? CDstart-1 : start-1;
2336 int cd_end=CDend>0 ? CDend : end;
2337 char cdphase=(CDphase>0) ? CDphase : '0';
2338 fprintf(fout, "%s\t%d\t%d\t%s\t%d\t%c\t%d\t%d\t%c,0,0", getGSeqName(), start-1, end, getID(),
2339 100, strand, cd_start, cd_end, cdphase);
2340 if (exons.Count()>0) {
2341 int i;
2342 fprintf(fout, "\t%d\t", exons.Count());
2343 for (i=0;i<exons.Count();++i)
2344 fprintf(fout,"%d,",exons[i]->len());
2345 fprintf(fout, "\t");
2346 for (i=0;i<exons.Count();++i)
2347 fprintf(fout,"%d,",exons[i]->start-start);
2348 } else { //no-exon feature(!), shouldn't happen
2349 fprintf(fout, "\t1\t%d,\t0,", len());
2350 }
2351 //now add the GFF3 attributes for in the 13th field
2352 int numattrs=0;
2353 if (CDstart>0) BED_addAttribute(fout, numattrs,"CDS=%d:%d",CDstart-1, CDend);
2354 if (CDphase>0) BED_addAttribute(fout, numattrs,"CDSphase=%c", CDphase);
2355 if (geneID!=NULL)
2356 BED_addAttribute(fout, numattrs, "geneID=%s",geneID);
2357 if (gene_name!=NULL)
2358 fprintf(fout, ";gene_name=%s",gene_name);
2359 if (attrs!=NULL) {
2360 for (int i=0;i<attrs->Count();i++) {
2361 const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
2362 const char* attrval=attrs->Get(i)->attr_val;
2363 if (attrval==NULL || attrval[0]=='\0') {
2364 BED_addAttribute(fout, numattrs,"%s",attrname);
2365 continue;
2366 }
2367 if (cvtChars) {
2368 decodeHexChars(dbuf, attrval, dbuf_len-1);
2369 BED_addAttribute(fout, numattrs, "%s=%s", attrname, dbuf);
2370 }
2371 else
2372 BED_addAttribute(fout, numattrs,"%s=%s", attrname, attrs->Get(i)->attr_val);
2373 }
2374 }
2375 fprintf(fout, "\n");
2376 }
2377
parseAttrs(GffAttrs * & atrlist,char * info,bool isExon,bool CDSsrc)2378 void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool isExon, bool CDSsrc) {
2379 if (names==NULL)
2380 GError(ERR_NULL_GFNAMES, "parseAttrs()");
2381 if (atrlist==NULL) {
2382 atrlist=new GffAttrs();
2383 }
2384 bool exon2transcript=(isExon && atrlist==this->attrs);
2385 char* endinfo=info+strlen(info);
2386 char* start=info;
2387 char* pch=start;
2388 while (start<endinfo) {
2389 while (*start==' ' && start<endinfo) start++;
2390 pch=strchr(start, ';');
2391 if (pch==NULL) pch=endinfo;
2392 else { *pch='\0'; pch++; }
2393 char* ech=strchr(start,'=');
2394 if (ech!=NULL) { // attr=value format found
2395 *ech='\0';
2396 if (exon2transcript) { //we do NOT want these exon attributes at transcript level
2397 if (startsiWith(start, "exon_") || strcmp(start, "exon")==0) {
2398 start=pch; continue;
2399 }
2400 }
2401 ech++;
2402 while (*ech==' ' && ech<endinfo) ech++;//skip extra spaces after the '='
2403 /*
2404 if (isExon && startsiWith(start, "protein")) {
2405 //--protein info should never be left at exon level
2406 // ( attribute policing much? :p )
2407 this->addAttr(start, ech);
2408 start=pch;
2409 continue;
2410 }
2411 */
2412 if (exon2transcript)
2413 atrlist->add_if_new(this->names, start, ech); //never override transcript attribute with exon's
2414 else atrlist->add_or_update(this->names, start, ech, CDSsrc); //overwrite previous attr with the same name
2415 }
2416 start=pch;
2417 } //while info characters
2418 if (atrlist->Count()==0) { delete atrlist; atrlist=NULL; }
2419 }
2420
addAttr(const char * attrname,const char * attrvalue)2421 void GffObj::addAttr(const char* attrname, const char* attrvalue) {
2422 if (this->attrs==NULL)
2423 this->attrs=new GffAttrs();
2424 //this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue));
2425 this->attrs->add_or_update(names, attrname, attrvalue);
2426 }
2427
copyAttrs(GffObj * from)2428 void GffObj::copyAttrs(GffObj* from) { //typically from is the parent gene, and this is a transcript
2429 if (from==NULL || from->attrs==NULL || from->attrs->Count()==0) return;
2430 if (this->attrs==NULL) {
2431 this->attrs=new GffAttrs();
2432 }
2433 //special RefSeq case
2434 int desc_attr_id=names->attrs.getId("description"); //from gene
2435 int prod_attr_id=names->attrs.getId("product"); //from transcript (this)
2436 char* prod = (prod_attr_id>=0) ? this->attrs->getAttr(prod_attr_id) : NULL;
2437
2438 for (int i=0;i<from->attrs->Count();++i) {
2439 //this->attrs->add_no_update(names, from->attrs->Get(i)->attr_id, from->attrs->Get(i)->attr_val);
2440 int aid=from->attrs->Get(i)->attr_id;
2441 //special case for GenBank refseq genes vs transcripts:
2442 if (prod && aid==desc_attr_id && strcmp(from->attrs->getAttr(desc_attr_id), prod)==0)
2443 continue; //skip description if product already there and the same
2444 bool haveit=false;
2445 for (int ai=0;ai<this->attrs->Count();++ai) {
2446 //do we have it already?
2447 if (aid==this->attrs->Get(ai)->attr_id) {
2448 haveit=true;
2449 break; //skip this, don't replace
2450 }
2451 }
2452 if (!haveit)
2453 this->attrs->Add(new GffAttr(aid, from->attrs->Get(i)->attr_val));
2454 }
2455 }
2456
setFeatureName(const char * feature)2457 void GffObj::setFeatureName(const char* feature) {
2458 //change the feature name/type for a transcript
2459 int fid=names->feats.addName(feature);
2460 if (monoFeature() && exons.Count()>0)
2461 this->subftype_id=fid;
2462 this->ftype_id=fid;
2463 }
2464
setRefName(const char * newname)2465 void GffObj::setRefName(const char* newname) {
2466 //change the feature name/type for a transcript
2467 int rid=names->gseqs.addName(newname);
2468 this->gseq_id=rid;
2469 }
2470
2471
2472
removeAttr(const char * attrname,const char * attrval)2473 int GffObj::removeAttr(const char* attrname, const char* attrval) {
2474 if (this->attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
2475 int aid=this->names->attrs.getId(attrname);
2476 if (aid<0) return 0;
2477 int delcount=0; //could be more than one ?
2478 for (int i=0;i<this->attrs->Count();i++) {
2479 if (aid==this->attrs->Get(i)->attr_id) {
2480 if (attrval==NULL ||
2481 strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
2482 delcount++;
2483 this->attrs->freeItem(i);
2484 }
2485 }
2486 }
2487 if (delcount>0) this->attrs->Pack();
2488 return delcount;
2489 }
2490
removeAttr(int aid,const char * attrval)2491 int GffObj::removeAttr(int aid, const char* attrval) {
2492 if (this->attrs==NULL || aid<0) return 0;
2493 int delcount=0; //could be more than one ?
2494 for (int i=0;i<this->attrs->Count();i++) {
2495 if (aid==this->attrs->Get(i)->attr_id) {
2496 if (attrval==NULL ||
2497 strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
2498 delcount++;
2499 this->attrs->freeItem(i);
2500 }
2501 }
2502 }
2503 if (delcount>0) this->attrs->Pack();
2504 return delcount;
2505 }
2506
2507
removeExonAttr(GffExon & exon,const char * attrname,const char * attrval)2508 int GffObj::removeExonAttr(GffExon& exon, const char* attrname, const char* attrval) {
2509 if (exon.attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
2510 int aid=this->names->attrs.getId(attrname);
2511 if (aid<0) return 0;
2512 int delcount=0; //could be more than one
2513 for (int i=0;i<exon.attrs->Count();i++) {
2514 if (aid==exon.attrs->Get(i)->attr_id) {
2515 if (attrval==NULL ||
2516 strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
2517 delcount++;
2518 exon.attrs->freeItem(i);
2519 }
2520 }
2521 }
2522 if (delcount>0) exon.attrs->Pack();
2523 return delcount;
2524 }
2525
removeExonAttr(GffExon & exon,int aid,const char * attrval)2526 int GffObj::removeExonAttr(GffExon& exon, int aid, const char* attrval) {
2527 if (exon.attrs==NULL || aid<0) return 0;
2528 int delcount=0; //could be more than one
2529 for (int i=0;i<exon.attrs->Count();i++) {
2530 if (aid==exon.attrs->Get(i)->attr_id) {
2531 if (attrval==NULL ||
2532 strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
2533 delcount++;
2534 exon.attrs->freeItem(i);
2535 }
2536 }
2537 }
2538 if (delcount>0) exon.attrs->Pack();
2539 return delcount;
2540 }
2541
2542
getUnspliced(GFaSeqGet * faseq,int * rlen,GMapSegments * seglst)2543 char* GffObj::getUnspliced(GFaSeqGet* faseq, int* rlen, GMapSegments* seglst) {
2544
2545 if (faseq==NULL) { GMessage("Warning: getUnspliced(NULL,.. ) called!\n");
2546 return NULL;
2547 }
2548 //restore normal coordinates:
2549 if (exons.Count()==0) return NULL;
2550 int fspan=end-start+1;
2551 const char* gsubseq=faseq->subseq(start, fspan);
2552 if (gsubseq==NULL) {
2553 GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
2554 }
2555 char* unspliced=NULL;
2556
2557 int seqstart=exons.First()->start;
2558 int seqend=exons.Last()->end;
2559
2560 int unsplicedlen = 0;
2561 if (seglst)
2562 seglst->Clear(strand);
2563 unsplicedlen += seqend - seqstart + 1;
2564
2565 GMALLOC(unspliced, unsplicedlen+1); //allocate more here
2566 //uint seqstart, seqend;
2567 int s = 0; //resulting nucleotide counter
2568 if (strand=='-')
2569 {
2570 if (seglst!=NULL)
2571 seglst->add(s+1,s+1+seqend-seqstart, seqstart, seqend);
2572 for (int i=seqend;i>=seqstart;i--) {
2573 unspliced[s] = ntComplement(gsubseq[i-start]);
2574 s++;
2575 }//for each nt
2576 } // - strand
2577 else {
2578 // + strand
2579 if (seglst!=NULL)
2580 seglst->add(s+1,s+1+seqend-seqstart, seqstart, seqend);
2581 for (int i=seqstart;i<=seqend;i++) {
2582 unspliced[s]=gsubseq[i-start];
2583 s++;
2584 }//for each nt
2585 } // + strand
2586 //assert(s <= unsplicedlen);
2587 unspliced[s]=0;
2588 if (rlen!=NULL) *rlen=s;
2589 return unspliced;
2590 }
2591
addPadding(int padLeft,int padRight)2592 void GffObj::addPadding(int padLeft, int padRight) {
2593 this->start-=padLeft;
2594 this->end+=padRight;
2595 if (exons.Count()>0) {
2596 exons[0]->start-=padLeft;
2597 exons.Last()->end+=padRight;
2598 }
2599 covlen+=padLeft+padRight;
2600 }
2601
removePadding(int padLeft,int padRight)2602 void GffObj::removePadding(int padLeft, int padRight) {
2603 this->start+=padLeft;
2604 this->end-=padRight;
2605 if (exons.Count()>0) {
2606 exons[0]->start+=padLeft;
2607 exons.Last()->end-=padRight;
2608 }
2609 covlen-=padLeft+padRight;
2610 }
2611
getSpliced(GFaSeqGet * faseq,bool CDSonly,int * rlen,uint * cds_start,uint * cds_end,GMapSegments * seglst,bool cds_open)2612 char* GffObj::getSpliced(GFaSeqGet* faseq, bool CDSonly, int* rlen, uint* cds_start, uint* cds_end,
2613 GMapSegments* seglst, bool cds_open) {
2614 //cds_open only makes sense when CDSonly is true by overriding CDS 3'end such that the end of
2615 //the sequence beyond the 3' CDS end is also returned (the 3' UTR is appended to the CDS)
2616 if (CDSonly && CDstart==0) {
2617 GMessage("Warning: getSpliced(CDSOnly) requested for transcript with no CDS (%s)!\n", gffID); //should never happen
2618 return NULL;
2619 }
2620 if (faseq==NULL) {
2621 GMessage("Warning: getSpliced() called with uninitialized GFaSeqGet object!\n"); //should never happen
2622 return NULL;
2623 }
2624 GList<GffExon>* xsegs=&exons;
2625 if (CDSonly && this->cdss!=NULL)
2626 xsegs=this->cdss;
2627 if (xsegs->Count()==0) return NULL;
2628 int fspan=end-start+1;
2629 const char* gsubseq=faseq->subseq(start, fspan);
2630 if (gsubseq==NULL) {
2631 GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
2632 }
2633 if (fspan<(int)(end-start+1)) {
2634 //special case: stop coordinate was extended past the gseq length, must adjust
2635 int endadj=end-start+1-fspan;
2636 uint prevend=end;
2637 end-=endadj;
2638 if (CDend>end) CDend=end;
2639 if (xsegs->Last()->end>end) {
2640 xsegs->Last()->end=end; //this could be trouble if exon start is also > end
2641 if (xsegs->Last()->start>xsegs->Last()->end) {
2642 GError("GffObj::getSpliced() error: improper genomic coordinate %d on %s for %s\n",
2643 prevend,getGSeqName(), getID());
2644 }
2645 covlen-=endadj;
2646 }
2647 }
2648 char* spliced=NULL;
2649 GMALLOC(spliced, covlen+1); //IMPORTANT: covlen must be correct here!
2650 uint g_start=0, g_end=0;
2651 int cdsadj=0;
2652 if (CDphase=='1' || CDphase=='2') {
2653 cdsadj=CDphase-'0';
2654 }
2655 uint CDS_start=CDstart;
2656 uint CDS_stop=CDend;
2657 if (cdsadj>0) {
2658 if (strand=='-') CDS_stop-=cdsadj;
2659 else CDS_start+=cdsadj;
2660 }
2661 if (CDSonly) {
2662 g_start=CDS_start;
2663 g_end=CDS_stop;
2664 if (g_end-g_start<3)
2665 GMessage("Warning: CDS %d-%d too short for %s, check your data.\n",
2666 g_start, g_end, gffID);
2667 } else { //all exon content, not just CDS
2668 g_start=xsegs->First()->start;
2669 g_end=xsegs->Last()->end;
2670 cds_open=false; //override mistaken user request
2671 }
2672 if (seglst!=NULL) seglst->Clear(strand);
2673 int s=0; //resulting nucleotide counter
2674 if (strand=='-') {
2675 if (cds_open) {// appending 3'UTR
2676 g_start=xsegs->First()->start;
2677 //CDS_start=g_start;
2678 }
2679 for (int x=xsegs->Count()-1;x>=0;x--) {
2680 uint sgstart=xsegs->Get(x)->start;
2681 uint sgend=xsegs->Get(x)->end;
2682 if (g_end<sgstart || g_start>sgend) continue;
2683 if (g_start>=sgstart && g_start<=sgend)
2684 sgstart=g_start; //3' end within this segment
2685 if (g_end>=sgstart && g_end<=sgend)
2686 sgend=g_end; //5' end within this segment
2687 if (seglst!=NULL)
2688 seglst->add(s+1,s+1+sgend-sgstart,sgend,sgstart);
2689 for (uint i=sgend;i>=sgstart;i--) {
2690 spliced[s] = ntComplement(gsubseq[i-start]);
2691 s++;
2692 }//for each nt
2693 //--update local CDS start-end coordinates
2694 if (cds_start!=NULL && CDS_stop>=sgstart && CDS_stop<=sgend) {
2695 //CDS start in this segment
2696 *cds_start=s-(CDS_stop-sgstart);
2697 }
2698 if (cds_end!=NULL && CDS_start>=sgstart && CDS_start<=sgend) {
2699 //CDS stop in this segment
2700 *cds_end=s-(CDS_start-sgstart);
2701 }
2702 } //for each exon
2703 } // - strand
2704 else { // + strand
2705 if (cds_open) { // appending 3'UTR
2706 g_end=xsegs->Last()->end;
2707 //CDS_stop=g_end;
2708 }
2709 for (int x=0;x<xsegs->Count();x++) {
2710 uint sgstart=xsegs->Get(x)->start;
2711 uint sgend=xsegs->Get(x)->end;
2712 if (g_end<sgstart || g_start>sgend) continue;
2713 if (g_start>=sgstart && g_start<=sgend)
2714 sgstart=g_start; //seqstart within this segment
2715 if (g_end>=sgstart && g_end<=sgend)
2716 sgend=g_end; //seqend within this segment
2717 if (seglst!=NULL)
2718 seglst->add(s+1,s+1+sgend-sgstart, sgstart, sgend);
2719 for (uint i=sgstart;i<=sgend;i++) {
2720 spliced[s]=gsubseq[i-start];
2721 s++;
2722 }//for each nt
2723 //--update local CDS start-end coordinates
2724 if (cds_start!=NULL && CDS_start>=sgstart && CDS_start<=sgend) {
2725 //CDS start in this segment
2726 *cds_start=s-(sgend-CDS_start);
2727 }
2728 if (cds_end!=NULL && CDS_stop>=sgstart && CDS_stop<=sgend) {
2729 //CDS stop in this segment
2730 *cds_end=s-(sgend-CDS_stop);
2731 }
2732 } //for each exon
2733 } // + strand
2734 spliced[s]=0;
2735 if (rlen!=NULL) *rlen=s;
2736 return spliced;
2737 }
2738
printSummary(FILE * fout)2739 void GffObj::printSummary(FILE* fout) {
2740 if (fout==NULL) fout=stdout;
2741 fprintf(fout, "%s\t%c\t%d\t%d\t", gffID,
2742 strand, start, end);
2743 gscore.print(fout);
2744 fprintf(fout, "\n");
2745 }
2746 //TODO we should also have an escapeChars function for some situations
2747 //when we want to write a GFF3 strictly compliant to the dang specification
decodeHexChars(char * dbuf,const char * s,int maxlen)2748 void GffObj::decodeHexChars(char* dbuf, const char* s, int maxlen) {
2749 int dlen=0;
2750 dbuf[0]=0;
2751 if (s==NULL) return;
2752 for (const char* p=s;(*p)!=0 && dlen<maxlen;++p) {
2753 if (p[0]=='%' && isxdigit(p[1]) && isxdigit(p[2])) {
2754 int a=*(++p);
2755 if (a>'Z') a^=0x20; //toupper()
2756 if (a>'9') a=10+(a-'A');
2757 else a-='0';
2758 int b=*(++p);
2759 if (b>'Z') b^=0x20;
2760 if (b>'9') b=10+(b-'A');
2761 else b-='0';
2762 char c=(char)((a<<4)+b);
2763 if (c=='%') {
2764 dbuf[dlen]='p';
2765 ++dlen;
2766 dbuf[dlen]='r';
2767 ++dlen;
2768 c='c';
2769 }
2770 else if (c==';') c='.';
2771 else if (c<='\t') c=' ';
2772 if (c>=' ') {
2773 dbuf[dlen]=c;
2774 ++dlen;
2775 continue;
2776 }
2777 }
2778 dbuf[dlen]=*p;
2779 ++dlen;
2780 }
2781 dbuf[dlen]=0;
2782 }
2783
printGTab(FILE * fout,char ** extraAttrs)2784 void GffObj::printGTab(FILE* fout, char** extraAttrs) {
2785 fprintf(fout, "%s\t%c\t%d\t%d\t%s\t", this->getGSeqName(), this->strand,
2786 this->start, this->end, this->getID());
2787 if (exons.Count()) printExonList(fout);
2788 else fprintf(fout, ".");
2789 if (extraAttrs!=NULL) {
2790 //print a list of "attr=value;" pairs here as the last column
2791 //for requested attributes
2792 bool t1=true;
2793 for (int i=0;extraAttrs[i]!=NULL;++i) {
2794 const char* v=this->getAttr(extraAttrs[i]);
2795 if (v==NULL) continue;
2796 if (t1) { fprintf(fout, "\t"); t1=false; }
2797 fprintf(fout, "%s=%s;", extraAttrs[i], v);
2798 }
2799 }
2800 fprintf(fout,"\n");
2801 }
2802
printGxfExon(FILE * fout,const char * tlabel,const char * gseqname,bool iscds,GffExon * exon,bool gff3,bool cvtChars,char * dbuf,int dbuf_len)2803 void GffObj::printGxfExon(FILE* fout, const char* tlabel, const char* gseqname, bool iscds,
2804 GffExon* exon, bool gff3, bool cvtChars,
2805 char* dbuf, int dbuf_len) {
2806 //strcpy(dbuf,".");
2807 //if (exon->score>0) sprintf(dbuf,"%.2f", exon->score);
2808 exon->score.sprint(dbuf);
2809 if (exon->phase==0 || !iscds) exon->phase='.';
2810 const char* ftype=iscds ? "CDS" : getSubfName();
2811 const char* attrname=NULL;
2812 const char* attrval=NULL;
2813 if (gff3) {
2814 fprintf(fout,
2815 "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\tParent=%s",
2816 gseqname, tlabel, ftype, exon->start, exon->end, dbuf, strand,
2817 exon->phase, gffID);
2818 if (exon->attrs!=NULL) {
2819 for (int i=0;i<exon->attrs->Count();i++) {
2820 if (exon->attrs->Get(i)->cds!=iscds) continue;
2821 attrname=names->attrs.getName(exon->attrs->Get(i)->attr_id);
2822 if (cvtChars) {
2823 decodeHexChars(dbuf, exon->attrs->Get(i)->attr_val, dbuf_len-1);
2824 fprintf(fout,";%s=%s", attrname, dbuf);
2825 } else {
2826 fprintf(fout,";%s=%s", attrname, exon->attrs->Get(i)->attr_val);
2827 }
2828 }
2829 }
2830 fprintf(fout, "\n");
2831 } //GFF3
2832 else {//GTF
2833 fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\ttranscript_id \"%s\";",
2834 gseqname, tlabel, ftype, exon->start, exon->end, dbuf, strand, exon->phase, gffID);
2835 if (geneID)
2836 fprintf(fout," gene_id \"%s\";",geneID);
2837 if (gene_name!=NULL) {
2838 fprintf(fout," gene_name \"%s\";",gene_name);
2839 }
2840 if (exon->attrs!=NULL) {
2841 bool trId=false;
2842 bool gId=false;
2843 for (int i=0;i<exon->attrs->Count();i++) {
2844 if (exon->attrs->Get(i)->attr_val==NULL) continue;
2845 if (exon->attrs->Get(i)->cds!=iscds) continue;
2846 attrname=names->attrs.getName(exon->attrs->Get(i)->attr_id);
2847 if (strcmp(attrname, "transcriptID")==0) {
2848 if (trId) continue;
2849 trId=true;
2850 }
2851 if (strcmp(attrname, "transcript_id")==0 && !trId) {
2852 attrname="transcriptID";
2853 trId=true;
2854 }
2855 if (strcmp(attrname, "geneID")==0) {
2856 if (gId) continue;
2857 gId=true;
2858 }
2859 if (strcmp(attrname, "gene_id")==0 && !gId) {
2860 attrname="geneID";
2861 gId=true;
2862 }
2863 if (Gstricmp(attrname, "gene_name")==0 && gene_name!=NULL) {
2864 continue;
2865 }
2866 fprintf(fout, " %s ",attrname);
2867 if (cvtChars) {
2868 decodeHexChars(dbuf, exon->attrs->Get(i)->attr_val, dbuf_len-1);
2869 attrval=dbuf;
2870 } else {
2871 attrval=exon->attrs->Get(i)->attr_val;
2872 }
2873
2874 if (attrval[0]=='"') fprintf(fout, "%s;",attrval);
2875 else fprintf(fout, "\"%s\";",attrval);
2876 }
2877 }
2878 //for GTF, also append the GffObj attributes to each exon line
2879 // - do not do this when the transcript line is also printed!
2880 /*
2881 if (attrs!=NULL) {
2882 for (int i=0;i<attrs->Count();i++) {
2883 if (attrs->Get(i)->attr_val==NULL) continue;
2884 attrname=names->attrs.getName(attrs->Get(i)->attr_id);
2885 fprintf(fout, " %s ",attrname);
2886 if (cvtChars) {
2887 decodeHexChars(dbuf, attrs->Get(i)->attr_val, dbuf_len-1);
2888 attrval=dbuf;
2889 } else {
2890 attrval=attrs->Get(i)->attr_val;
2891 }
2892 if (attrval[0]=='"') fprintf(fout, "%s;",attrval);
2893 else fprintf(fout, "\"%s\";",attrval);
2894 }
2895 }
2896 */
2897 fprintf(fout, "\n");
2898 }//GTF
2899 }
2900
printGxf(FILE * fout,GffPrintMode gffp,const char * tlabel,const char * gfparent,bool cvtChars)2901 void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
2902 const char* tlabel, const char* gfparent, bool cvtChars) {
2903 const int DBUF_LEN=1024; //there should not be attribute values longer than 1K!
2904 char dbuf[DBUF_LEN];
2905 if (tlabel==NULL) {
2906 tlabel=track_id>=0 ? names->tracks.Get(track_id)->name :
2907 (char*)"gffobj" ;
2908 }
2909 if (gffp==pgffBED) {
2910 printBED(fout, cvtChars, dbuf, DBUF_LEN);
2911 return;
2912 }
2913 const char* gseqname=names->gseqs.Get(gseq_id)->name;
2914 bool gff3 = (gffp>=pgffAny && gffp<=pgffTLF);
2915 bool showCDS = (gffp==pgtfAny || gffp==pgtfCDS || gffp==pgffCDS || gffp==pgffAny || gffp==pgffBoth);
2916 bool showExon = (gffp<=pgtfExon || gffp==pgffAny || gffp==pgffExon || gffp==pgffBoth);
2917 //if (gscore>0.0) sprintf(dbuf,"%.2f", gscore);
2918 // else strcpy(dbuf,".");
2919 gscore.sprint(dbuf);
2920 if (gffp<=pgtfCDS && gffp>=pgtfAny) { //GTF output
2921 fprintf(fout,
2922 "%s\t%s\ttranscript\t%d\t%d\t%s\t%c\t.\ttranscript_id \"%s\"",
2923 gseqname, tlabel, start, end, dbuf, strand, gffID);
2924 char* gid=NULL;
2925 if (geneID!=NULL) {
2926 gid=geneID;
2927 }
2928 else {
2929 gid=getAttr("gene_id");
2930 if (gid==NULL)
2931 gid=gffID; //last resort, write gid the same with gffID
2932 }
2933 if (gid!=NULL) fprintf(fout, "; gene_id \"%s\"",gid);
2934 if (gene_name!=NULL && getAttr("gene_name")==NULL && getAttr("GENE_NAME")==NULL)
2935 fprintf(fout, "; gene_name \"%s\"",gene_name);
2936 if (attrs!=NULL) {
2937 bool trId=false;
2938 //bool gId=false;
2939 for (int i=0;i<attrs->Count();i++) {
2940 const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
2941 const char* attrval=attrs->Get(i)->attr_val;
2942 if (attrval==NULL || attrval[0]=='\0') continue;
2943 if (strcmp(attrname, "transcriptID")==0) {
2944 if (trId) continue;
2945 trId=true;
2946 }
2947 if (strcmp(attrname, "transcript_id")==0 && !trId) {
2948 attrname="transcriptID";
2949 trId=true;
2950 }
2951 if (Gstrcmp(attrname, "geneID")==0 && gid!=NULL &&
2952 strcmp(attrval, gid)==0) continue;
2953 if (strcmp(attrname, "gene_id")==0) continue;
2954 if (cvtChars) {
2955 decodeHexChars(dbuf, attrval, DBUF_LEN-1);
2956 fprintf(fout,"; %s \"%s\"", attrname, dbuf);
2957 }
2958 else
2959 fprintf(fout,"; %s \"%s\"", attrname, attrs->Get(i)->attr_val);
2960 }
2961 }
2962 fprintf(fout,";\n");
2963 }
2964 else if (gff3) {
2965 //print GFF3 transcript line:
2966 uint pstart, pend;
2967 if (gffp==pgffCDS) {
2968 pstart=CDstart;
2969 pend=CDend;
2970 }
2971 else { pstart=start;pend=end; }
2972 //const char* ftype=isTranscript() ? "mRNA" : getFeatureName();
2973 const char* ftype=getFeatureName();
2974 fprintf(fout,
2975 "%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tID=%s",
2976 gseqname, tlabel, ftype, pstart, pend, dbuf, strand, gffID);
2977 bool parentPrint=false;
2978 if (gfparent!=NULL && gffp!=pgffTLF) {
2979 //parent override - also prevents printing gene_name and gene_id
2980 fprintf(fout, ";Parent=%s",gfparent);
2981 parentPrint=true;
2982 }
2983 else if (parent!=NULL && !parent->isDiscarded() && gffp!=pgffTLF) {
2984 fprintf(fout, ";Parent=%s",parent->getID());
2985 if (parent->isGene()) parentPrint=true;
2986 }
2987 if (gffp==pgffTLF) {
2988 fprintf(fout, ";exonCount=%d",exons.Count());
2989 if (exons.Count()>0)
2990 fprintf(fout, ";exons=%d-%d", exons[0]->start, exons[0]->end);
2991 for (int i=1;i<exons.Count();++i) {
2992 fprintf(fout, ",%d-%d",exons[i]->start, exons[i]->end);
2993 }
2994 }
2995 if (CDstart>0 && (gffp==pgffTLF || !showCDS)) {
2996 if (cdss==NULL) fprintf(fout,";CDS=%d:%d",CDstart,CDend);
2997 else {
2998 fprintf(fout, ";CDS=");
2999 for (int i=0;i<cdss->Count();++i) {
3000 if (i>0) fprintf(fout, ",");
3001 fprintf(fout, "%d-%d", (*cdss)[i]->start, (*cdss)[i]->end);
3002 }
3003 }
3004 }
3005 if (CDphase>0 && (gffp==pgffTLF || !showCDS)) fprintf(fout,";CDSphase=%c", CDphase);
3006 char* g_id=NULL;
3007 if (geneID!=NULL && !parentPrint && getAttr("geneID")==NULL &&
3008 ((g_id=getAttr("gene_id"))==NULL || strcmp(g_id, geneID)!=0))
3009 fprintf(fout, ";geneID=%s",geneID);
3010 if (gene_name!=NULL && !parentPrint && getAttr("gene_name")==NULL && getAttr("GENE_NAME")==NULL)
3011 fprintf(fout, ";gene_name=%s",gene_name);
3012 if (attrs!=NULL) {
3013 for (int i=0;i<attrs->Count();i++) {
3014 const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
3015 const char* attrval=attrs->Get(i)->attr_val;
3016 if (attrval==NULL || attrval[0]=='\0') continue;
3017 //fprintf(fout,";%s",attrname);
3018 if (cvtChars) {
3019 decodeHexChars(dbuf, attrval, DBUF_LEN-1);
3020 fprintf(fout,";%s=%s", attrname, dbuf);
3021 }
3022 else
3023 fprintf(fout,";%s=%s", attrname, attrs->Get(i)->attr_val);
3024 }
3025 }
3026 fprintf(fout,"\n");
3027 }// gff3 transcript line
3028 if (gffp==pgffTLF) return;
3029 bool is_cds_only = (gffp==pgffBoth) ? false : isCDSOnly();
3030 if (showExon) {
3031 //print exons
3032 for (int i=0;i<exons.Count();i++) {
3033 printGxfExon(fout, tlabel, gseqname, is_cds_only, exons[i], gff3, cvtChars, dbuf, DBUF_LEN);
3034 }
3035 }//printing exons
3036 if (showCDS && !is_cds_only && CDstart>0) {
3037 GVec<GffExon> cds;
3038 getCDSegs(cds); //also uses/prepares the CDS phase for each CDS segment
3039 for (int i=0;i<cds.Count();i++) {
3040 printGxfExon(fout, tlabel, gseqname, true, &(cds[i]), gff3, cvtChars, dbuf, DBUF_LEN);
3041 }
3042 } //showCDS
3043 }
3044
updateCDSPhase(GList<GffExon> & segs)3045 void GffObj::updateCDSPhase(GList<GffExon>& segs) {
3046 int cdsacc=0;
3047 if (CDphase=='1' || CDphase=='2') {
3048 cdsacc+= 3-(CDphase-'0');
3049 }
3050 else CDphase='0';
3051 if (strand=='-') { //reverse strand
3052 for (int i=segs.Count()-1;i>=0;i--) {
3053 segs[i]->phase='0'+ (3-cdsacc%3)%3;
3054 cdsacc+=segs[i]->end-segs[i]->start+1;
3055 }
3056 }
3057 else { //forward strand
3058 for (int i=0;i<segs.Count();i++) {
3059 segs[i]->phase='0'+ (3-cdsacc%3)%3;
3060 cdsacc+=segs[i]->end-segs[i]->start+1;
3061 }
3062 }
3063 }
3064
getCDSegs(GVec<GffExon> & cds)3065 void GffObj::getCDSegs(GVec<GffExon>& cds) {
3066 //like updateCDSPhase() above, also updates phase for each segment
3067 GffExon cdseg(true);
3068 cds.Clear();
3069 if (cdss!=NULL) {
3070 //copy directly from cdss list
3071 for (int i=0;i<cdss->Count();i++) {
3072 cdseg=(*cdss->Get(i));
3073 cdseg.sharedAttrs=true;
3074 cds.Add(cdseg);
3075 }
3076 return;
3077 }
3078 int cdsacc=0;
3079 if (CDphase=='1' || CDphase=='2') {
3080 cdsacc+= 3-(CDphase-'0');
3081 }
3082 if (strand=='-') {
3083 for (int x=exons.Count()-1;x>=0;x--) {
3084 uint sgstart=exons[x]->start;
3085 uint sgend=exons[x]->end;
3086 if (CDend<sgstart || CDstart>sgend) continue;
3087 if (CDstart>=sgstart && CDstart<=sgend)
3088 sgstart=CDstart; //cdstart within this segment
3089 if (CDend>=sgstart && CDend<=sgend)
3090 sgend=CDend; //cdend within this segment
3091 cdseg.start=sgstart;
3092 cdseg.end=sgend;
3093 //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0);
3094 cdseg.phase='0'+ (3-cdsacc%3)%3;
3095 cdsacc+=sgend-sgstart+1;
3096 cdseg.attrs=exons[x]->attrs;
3097 cdseg.sharedAttrs=true;
3098 cds.Add(cdseg);
3099 } //for each exon
3100 cds.Reverse();
3101 } // - strand
3102 else { // + strand
3103 for (int x=0;x<exons.Count();x++) {
3104 uint sgstart=exons[x]->start;
3105 uint sgend=exons[x]->end;
3106 if (CDend<sgstart || CDstart>sgend) continue;
3107 if (CDstart>=sgstart && CDstart<=sgend)
3108 sgstart=CDstart; //seqstart within this segment
3109 if (CDend>=sgstart && CDend<=sgend)
3110 sgend=CDend; //seqend within this segment
3111 cdseg.start=sgstart;
3112 cdseg.end=sgend;
3113 //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0);
3114 cdseg.phase='0' + (3-cdsacc%3)%3 ;
3115 cdsacc+=sgend-sgstart+1;
3116 cdseg.attrs=exons[x]->attrs;
3117 cdseg.sharedAttrs=true;
3118 cds.Add(cdseg);
3119 } //for each exon
3120 } // + strand
3121
3122 }
3123
3124 //-- transcript match/overlap classification functions
3125
3126
transcriptMatch(GffObj & a,GffObj & b,int & ovlen)3127 char transcriptMatch(GffObj& a, GffObj& b, int& ovlen) {
3128 //return '=' if exact exon match, '~' if intron-chain match (or 80% overlap for single-exon)
3129 // or 0 otherwise
3130 int imax=a.exons.Count()-1;
3131 int jmax=b.exons.Count()-1;
3132 ovlen=0;
3133 if (imax!=jmax) return false; //different number of exons, cannot match
3134 if (imax==0) //single-exon mRNAs
3135 return (singleExonTMatch(a,b,ovlen));
3136 if ( a.exons[imax]->start<b.exons[0]->end ||
3137 b.exons[jmax]->start<a.exons[0]->end )
3138 return 0; //intron chains do not overlap at all
3139 //check intron overlaps
3140 ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1;
3141 ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start;
3142 for (int i=1;i<=imax;i++) {
3143 if (i<imax) ovlen+=a.exons[i]->len();
3144 if ((a.exons[i-1]->end!=b.exons[i-1]->end) ||
3145 (a.exons[i]->start!=b.exons[i]->start)) {
3146 return 0; //intron mismatch
3147 }
3148 }
3149 //--- full intron chain match:
3150 if (a.exons[0]->start==b.exons[0]->start &&
3151 a.exons.Last()->end==b.exons.Last()->end)
3152 return '=';
3153 return '~';
3154 }
3155
3156
singleExonTMatch(GffObj & m,GffObj & r,int & ovlen)3157 char singleExonTMatch(GffObj& m, GffObj& r, int& ovlen) {
3158 //return '=' if exact match, '~' if the overlap is >=80% of the longer sequence length
3159 // return 0 if there is no overlap
3160 GSeg mseg(m.start, m.end);
3161 ovlen=mseg.overlapLen(r.start,r.end);
3162 if (ovlen<=0) return 0;
3163 // fuzzy matching for single-exon transcripts:
3164 // matching = overlap is at least 80% of the length of the longer transcript
3165 // *OR* in case of reverse containment (reference contained in m)
3166 // it's also considered "matching" if the overlap is at least 80% of
3167 // the reference len AND at least 70% of the query len
3168 if (m.start==r.start && m.end==r.end) return '=';
3169 if (m.covlen>r.covlen) {
3170 if ( (ovlen >= m.covlen*0.8) ||
3171 (ovlen >= r.covlen*0.8 && ovlen >= m.covlen* 0.7 ) )
3172 //allow also some fuzzy reverse containment
3173 return '~';
3174 } else {
3175 if (ovlen >= r.covlen*0.8) return '~';
3176 }
3177 return 0;
3178 }
3179
3180 //formerly in gffcompare
getOvlCode(GffObj & m,GffObj & r,int & ovlen,bool strictMatch)3181 char getOvlCode(GffObj& m, GffObj& r, int& ovlen, bool strictMatch) {
3182 ovlen=0; //total actual exonic overlap
3183 if (!m.overlap(r.start,r.end)) return 0;
3184 int jmax=r.exons.Count()-1;
3185 //int iovlen=0; //total m.exons overlap with ref introns
3186 char rcode=0;
3187 if (m.exons.Count()==1) { //single-exon transfrag
3188 GSeg mseg(m.start, m.end);
3189 if (jmax==0) { //also single-exon ref
3190 //ovlen=mseg.overlapLen(r.start,r.end);
3191 char eqcode=0;
3192 if ((eqcode=singleExonTMatch(m, r, ovlen))>0) {
3193 if (strictMatch) return eqcode;
3194 else return '=';
3195 }
3196 if (m.covlen<r.covlen)
3197 { if (ovlen >= m.covlen*0.8) return 'c'; } // fuzzy containment
3198 else
3199 if (ovlen >= r.covlen*0.8 ) return 'k'; // fuzzy reverse containment
3200 return 'o'; //just plain overlapping
3201 }
3202 //-- single-exon qry overlaping multi-exon ref
3203 //check full pre-mRNA case (all introns retained): code 'm'
3204
3205 if (m.start<=r.exons[0]->end && m.end>=r.exons[jmax]->start)
3206 return 'm';
3207
3208 for (int j=0;j<=jmax;j++) {
3209 //check if it's ~contained by an exon
3210 int exovlen=mseg.overlapLen(r.exons[j]);
3211 if (exovlen>0) {
3212 ovlen+=exovlen;
3213 if (m.start>r.exons[j]->start-4 && m.end<r.exons[j]->end+4) {
3214 return 'c'; //close enough to be considered contained in this exon
3215 }
3216 }
3217 if (j==jmax) break; //last exon here, no intron to check
3218 //check if it fully covers an intron (retained intron)
3219 if (m.start<r.exons[j]->end && m.end>r.exons[j+1]->start)
3220 return 'n';
3221 //check if it's fully contained by an intron
3222 if (m.end<r.exons[j+1]->start && m.start>r.exons[j]->end)
3223 return 'i';
3224 // check if it's a potential pre-mRNA transcript
3225 // (if overlaps this intron at least 10 bases)
3226 uint introvl=mseg.overlapLen(r.exons[j]->end+1, r.exons[j+1]->start-1);
3227 //iovlen+=introvl;
3228 if (introvl>=10 && mseg.len()>introvl+10) { rcode='e'; }
3229 } //for each ref exon
3230 if (rcode>0) return rcode;
3231 return 'o'; //plain overlap, uncategorized
3232 } //single-exon transfrag
3233 //-- multi-exon transfrag --
3234 int imax=m.exons.Count()-1;// imax>0 here
3235 if (jmax==0) { //single-exon reference overlap
3236 //any exon overlap?
3237 GSeg rseg(r.start, r.end);
3238 for (int i=0;i<=imax;i++) {
3239 //check if it's ~contained by an exon
3240 int exovlen=rseg.overlapLen(m.exons[i]);
3241 if (exovlen>0) {
3242 ovlen+=exovlen;
3243 if (r.start>m.exons[i]->start-4 && r.end<m.exons[i]->end+4) {
3244 return 'k'; //reference contained in this assembled exon
3245 }
3246 }
3247 if (i==imax) break;
3248 if (r.end<m.exons[i+1]->start && r.start>m.exons[i]->end)
3249 return 'y'; //ref contained in this transfrag intron
3250 }
3251 return 'o';
3252 }
3253 // * check if transfrag contained by a ref intron
3254 for (int j=0;j<jmax;j++) {
3255 if (m.end<r.exons[j+1]->start && m.start>r.exons[j]->end)
3256 return 'i';
3257 }
3258 if (m.exons[imax]->start<r.exons[0]->end) {
3259 //qry intron chain ends before ref intron chain starts
3260 //check if last qry exon plugs the 1st ref intron
3261 if (m.exons[imax]->start<=r.exons[0]->end &&
3262 m.exons[imax]->end>=r.exons[1]->start) return 'n';
3263 return 'o'; //only terminal exons overlap
3264 }
3265 else if (r.exons[jmax]->start<m.exons[0]->end) {
3266 //qry intron chain starts after ref intron chain ends
3267 //check if first qry exon plugs the last ref intron
3268 if (m.exons[0]->start<=r.exons[jmax-1]->end &&
3269 m.exons[0]->end>=r.exons[jmax]->start) return 'n';
3270 return 'o'; //only terminal exons overlap
3271 }
3272 //check intron chain overlap (match, containment, intron retention etc.)
3273 int i=1; //index of exon to the right of current qry intron
3274 int j=1; //index of exon to the right of current ref intron
3275 bool intron_conflict=false; //used for checking for retained introns
3276 //from here on we check all qry introns against ref introns
3277 bool junct_match=false; //true if at least a junction match is found
3278 bool ichain_match=true; //if there is intron (sub-)chain match, to be updated by any mismatch
3279 bool intron_ovl=false; //if any intron overlap is found
3280 bool intron_retention=false; //if any ref intron is covered by a qry exon
3281 int imfirst=0; //index of first intron match in query (valid>0)
3282 int jmfirst=0; //index of first intron match in reference (valid>0)
3283 int imlast=0; //index of first intron match in query
3284 int jmlast=0; //index of first intron match in reference
3285 //check for intron matches
3286 while (i<=imax && j<=jmax) {
3287 uint mstart=m.exons[i-1]->end;
3288 uint mend=m.exons[i]->start;
3289 uint rstart=r.exons[j-1]->end;
3290 uint rend=r.exons[j]->start;
3291 if (rend<mstart) { //qry intron starts after ref intron ends
3292 if (!intron_conflict && r.exons[j]->overlap(mstart+1, mend-1))
3293 intron_conflict=true;
3294 if (!intron_retention && rstart>=m.exons[i-1]->start)
3295 intron_retention=true;
3296 if (intron_ovl) ichain_match=false;
3297 j++;
3298 continue;
3299 } //no intron overlap, skipping ref intron
3300 if (rstart>mend) { //qry intron ends before ref intron starts
3301 //if qry intron overlaps the exon on the left, we have an intron conflict
3302 if (!intron_conflict && r.exons[j-1]->overlap(mstart+1, mend-1))
3303 intron_conflict=true;
3304 if (!intron_retention && rend<=m.exons[i]->end)
3305 intron_retention=true;
3306 if (intron_ovl) ichain_match=false;
3307 i++;
3308 continue;
3309 } //no intron overlap, skipping qry intron
3310 intron_ovl=true;
3311 //overlapping introns, test junction matching
3312 bool smatch=(mstart==rstart);
3313 bool ematch=(mend==rend);
3314 if (smatch || ematch) junct_match=true;
3315 if (smatch && ematch) {
3316 //perfect match for this intron
3317 if (ichain_match) { //chain matching still possible
3318 if (jmfirst==0) jmfirst=j;
3319 if (imfirst==0) imfirst=i;
3320 imlast=i;
3321 jmlast=j;
3322 }
3323 i++; j++;
3324 continue;
3325 }
3326 //intron overlapping but with at least a junction mismatch
3327 intron_conflict=true;
3328 ichain_match=false;
3329 if (mend>rend) j++; else i++;
3330 } //while checking intron overlaps
3331 if (ichain_match) { //intron sub-chain match
3332 if (imfirst==1 && imlast==imax) { // qry full intron chain match
3333 if (jmfirst==1 && jmlast==jmax) {//identical intron chains
3334 if (strictMatch) return (r.exons[0]->start==m.exons[0]->start &&
3335 r.exons.Last()->end && m.exons.Last()->end) ? '=' : '~';
3336 else return '=';
3337 }
3338 // -- qry intron chain is shorter than ref intron chain --
3339 int l_iovh=0; // overhang of leftmost q exon left boundary beyond the end of ref intron to the left
3340 int r_iovh=0; // same type of overhang through the ref intron on the right
3341 if (jmfirst>1 && r.exons[jmfirst-1]->start>m.start)
3342 l_iovh = r.exons[jmfirst-1]->start - m.start;
3343 if (jmlast<jmax && m.end > r.exons[jmlast]->end)
3344 r_iovh = m.end - r.exons[jmlast]->end;
3345 if (l_iovh<4 && r_iovh<4) return 'c';
3346 } else if ((jmfirst==1 && jmlast==jmax)) {//ref full intron chain match
3347 //check if the reference i-chain is contained in qry i-chain
3348 int l_jovh=0; // overhang of leftmost q exon left boundary beyond the end of ref intron to the left
3349 int r_jovh=0; // same type of overhang through the ref intron on the right
3350 if (imfirst>1 && m.exons[imfirst-1]->start>r.start)
3351 l_jovh = m.exons[imfirst-1]->start - r.start;
3352 if (imlast<imax && r.end > m.exons[imlast]->end)
3353 r_jovh = r.end - m.exons[imlast]->end;
3354 if (l_jovh<4 && r_jovh<4) return 'k'; //reverse containment
3355 }
3356 }
3357 //'=', 'c' and 'k' were checked and assigned, check for 'm' and 'n' before falling back to 'j'
3358 if (!intron_conflict && (m.start<=r.exons[0]->end && m.end>=r.exons[jmax]->start)) {
3359 return 'm';
3360 }
3361 if (intron_retention) return 'n';
3362 if (junct_match) return 'j';
3363 //we could have 'o' or 'y' here
3364 //any real exon overlaps?
3365 ovlen=m.exonOverlapLen(r);
3366 if (ovlen>4) return 'o';
3367 return 'y'; //all reference exons are within transfrag introns!
3368 }
3369