1 #include "gff_utils.h"
2 
3 GHash<GeneInfo*> gene_ids;
4 
5 bool verbose=false; //same with GffReader::showWarnings and GffLoader::beVserbose
6 bool debugMode=false;
7 bool ensembl_convert=false; //-L, assist in converting Ensembl GTF to GFF3
8 
9 FILE* ffasta=NULL;
10 FILE* f_in=NULL;
11 FILE* f_out=NULL;
12 FILE* f_w=NULL; //writing fasta with spliced exons (transcripts)
13 int wPadding = 0; //padding for -w option
14 FILE* f_x=NULL; //writing fasta with spliced CDS
15 FILE* f_y=NULL; //wrting fasta with translated CDS
16 
17 FILE* f_j=NULL; //wrting junctions (intron coordinates)
18 
19 int maxintron=999000000;
20 
21 ID_Flt_Type IDflt=idFlt_None;
22 
23 bool TFilters=false;
24 bool wCDSonly=false;
25 bool wNConly=false;
26 int minLen=0; //minimum transcript length
27 bool validCDSonly=false; // translation with no in-frame STOP
28 bool bothStrands=false; //for single-exon mRNA validation, check the other strand too
29 bool altPhases=false; //if original phase fails translation validation,
30                      //try the other 2 phases until one makes it
31 bool addCDSattrs=false;
32 bool add_hasCDS=false;
33 //bool streamIn=false; // --stream option
34 bool adjustStop=false; //automatic adjust the CDS stop coordinate
35 bool covInfo=false; // --cov-info : only report genome coverage
36 GStr tableFormat; //list of "attributes" to print in tab delimited format
37 bool spliceCheck=false; //only known splice-sites
38 bool decodeChars=false; //decode url-encoded chars in attrs (-D)
39 bool StarStop=false; //use * instead of . for stop codon translation
40 bool fullCDSonly=false; // starts with START, ends with STOP codon
41 
42 bool multiExon=false;
43 bool writeExonSegs=false;
44 char* tracklabel=NULL;
45 /*
46 char* rfltGSeq=NULL;
47 char rfltStrand=0;
48 uint rfltStart=0;
49 uint rfltEnd=MAX_UINT;*/
50 GRangeParser* fltRange=NULL;
51 
52 GRangeParser* fltJunction=NULL;
53 
54 bool rfltWithin=false; //check for full containment within given range
55 bool addDescr=false;
56 
57 bool wfaNoCDS=false;
58 
59 bool fmtGFF3=true; //default output: GFF3
60 //other formats only make sense in transcriptOnly mode
61 bool fmtGTF=false;
62 bool fmtBED=false;
63 bool fmtTLF=false;
64 bool fmtTable=false;
65 
66 GffPrintMode exonPrinting=pgffAny;
67 
68 GFastaDb gfasta;
69 
70 GHash<SeqInfo*> seqinfo;
71 GVec<CTableField> tableCols;
72 GHash<RefTran*> reftbl;
73 GStrSet<> fltIDs;
74 
75 GStrSet<> attrList;
76 
77 GHash<int> isoCounter; //counts the valid isoforms
78 
printFasta(FILE * f,GStr * defline,char * seq,int seqlen,bool useStar)79 void printFasta(FILE* f, GStr* defline, char* seq, int seqlen, bool useStar) {
80  if (seq==NULL) return;
81  int len=(seqlen>0)?seqlen:strlen(seq);
82  if (len<=0) return;
83  if (defline!=NULL)
84      fprintf(f, ">%s\n",defline->chars());
85  int ilen=0;
86  for (int i=0; i < len; i++, ilen++) {
87    if (ilen == 70) {
88      fputc('\n', f);
89      ilen = 0;
90      }
91    if (useStar && seq[i]=='.')
92         putc('*', f);
93    else putc(seq[i], f);
94    } //for
95  fputc('\n', f);
96 }
97 
qsearch_gloci(uint x,GList<GffLocus> & loci)98 int qsearch_gloci(uint x, GList<GffLocus>& loci) {
99   //binary search
100   //do the simplest tests first:
101   if (loci[0]->start>x) return 0;
102   if (loci.Last()->start<x) return -1;
103   uint istart=0;
104   int i=0;
105   int idx=-1;
106   int maxh=loci.Count()-1;
107   int l=0;
108   int h = maxh;
109   while (l <= h) {
110      i = (l+h)>>1;
111      istart=loci[i]->start;
112      if (istart < x)  l = i + 1;
113           else {
114              if (istart == x) { //found matching coordinate here
115                   idx=i;
116                   while (idx<=maxh && loci[idx]->start==x) {
117                      idx++;
118                      }
119                   return (idx>maxh) ? -1 : idx;
120                   }
121              h = i - 1;
122              }
123      } //while
124  idx = l;
125  while (idx<=maxh && loci[idx]->start<=x) {
126     idx++;
127     }
128  return (idx>maxh) ? -1 : idx;
129 }
130 
qsearch_rnas(uint x,GList<GffObj> & rnas)131 int qsearch_rnas(uint x, GList<GffObj>& rnas) {
132   //binary search
133   //do the simplest tests first:
134   if (rnas[0]->start>x) return 0;
135   if (rnas.Last()->start<x) return -1;
136   uint istart=0;
137   int i=0;
138   int idx=-1;
139   int maxh=rnas.Count()-1;
140   int l=0;
141   int h = maxh;
142   while (l <= h) {
143      i = (l+h)>>1;
144      istart=rnas[i]->start;
145      if (istart < x)  l = i + 1;
146           else {
147              if (istart == x) { //found matching coordinate here
148                   idx=i;
149                   while (idx<=maxh && rnas[idx]->start==x) {
150                      idx++;
151                      }
152                   return (idx>maxh) ? -1 : idx;
153                   }
154              h = i - 1;
155              }
156      } //while
157  idx = l;
158  while (idx<=maxh && rnas[idx]->start<=x) {
159     idx++;
160     }
161  return (idx>maxh) ? -1 : idx;
162 }
163 
cmpRedundant(GffObj & a,GffObj & b)164 int cmpRedundant(GffObj& a, GffObj& b) {
165   if (a.exons.Count()==b.exons.Count()) {
166      if (a.covlen==b.covlen) {
167        return strcmp(a.getID(), b.getID());
168        }
169      else return (a.covlen>b.covlen)? 1 : -1;
170      }
171    else return (a.exons.Count()>b.exons.Count())? 1: -1;
172 }
173 
tMatch(GffObj & a,GffObj & b)174 bool tMatch(GffObj& a, GffObj& b) {
175   //strict intron chain match, or single-exon perfect match
176   int imax=a.exons.Count()-1;
177   int jmax=b.exons.Count()-1;
178   int ovlen=0;
179   if (imax!=jmax) return false; //different number of introns
180 
181   if (imax==0) { //single-exon mRNAs
182     //if (equnspl) {
183       //fuzz match for single-exon transfrags:
184       // it's a match if they overlap at least 80% of max len
185       ovlen=a.exons[0]->overlapLen(b.exons[0]);
186       int maxlen=GMAX(a.covlen,b.covlen);
187       return (ovlen>=maxlen*0.8);
188     /*}
189     else {
190       //only exact match
191       ovlen=a.covlen;
192       return (a.exons[0]->start==b.exons[0]->start &&
193           a.exons[0]->end==b.exons[0]->end);
194 
195        }*/
196      }
197   //check intron overlaps
198   ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1;
199   ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start;
200   for (int i=1;i<=imax;i++) {
201     if (i<imax) ovlen+=a.exons[i]->len();
202     if ((a.exons[i-1]->end!=b.exons[i-1]->end) ||
203       (a.exons[i]->start!=b.exons[i]->start)) {
204             return false; //intron mismatch
205     }
206   }
207   return true;
208 }
209 
210 
getSeqDescr(char * seqid)211 char* getSeqDescr(char* seqid) {
212  static char charbuf[128];
213  if (seqinfo.Count()==0) return NULL;
214  char* suf=rstrchr(seqid, '.');
215  if (suf!=NULL) *suf=0;
216  SeqInfo* seqd=seqinfo.Find(seqid);
217  if (suf!=NULL) *suf='.';
218  if (seqd!=NULL) {
219   GStr s(seqd->descr);
220   //cleanup some Uniref gunk
221   if (s[0]=='[') {
222     int r=s.index(']');
223     if (r>=0 && r<8 && isdigit(s[1]))
224        s.remove(0,r+1);
225     }
226   if (s.length()>80) {
227     int r=s.index(';');
228     if (r>5) s.cut(r);
229     }
230   if (s.length()>127) {
231    s.cut(127);
232    int r=s.rindex(' ');
233    if (r>0) s.cut(r);
234    }
235   strcpy(charbuf, s.chars());
236   return charbuf;
237   }
238  else return NULL;
239 }
240 
getSeqName(char * seqid)241 char* getSeqName(char* seqid) {
242   static char charbuf[128];
243   char* suf=rstrchr(seqid, '.');
244   if (suf!=NULL) *suf=0;
245   strcpy(charbuf, seqid);
246   if (suf!=NULL) *suf='.';
247   return charbuf;
248 }
249 
250 
adjust_stopcodon(GffObj & gffrec,int adj,GList<GSeg> * seglst)251 int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst) {
252   //adj>0, extend CDS to include a potential stop codon
253   //when CDS is expanded, the terminal exon might have to be adjusted too
254   int realadj=0;
255   if (gffrec.strand=='-') {
256        if ((int)gffrec.CDstart>adj) {
257            gffrec.CDstart-=adj;
258            realadj=adj;
259            if (gffrec.exons.First()->start>gffrec.CDstart) {
260                  gffrec.covlen+=gffrec.exons.First()->start - gffrec.CDstart;
261                  gffrec.exons.First()->start=gffrec.CDstart;
262                  gffrec.start=gffrec.CDstart;
263                  }
264              }
265           }
266         else { // forward strand
267          //expand beyond
268          realadj=adj;
269          gffrec.CDend+=adj;
270          if (adj<0) {//restore
271            if (gffrec.exons.Last()->end==gffrec.CDend-adj) {
272                         gffrec.exons.Last()->end+=adj;
273                         gffrec.end=gffrec.exons.Last()->end;
274                         gffrec.covlen+=adj;
275                         }
276          }
277          else if (gffrec.exons.Last()->end<gffrec.CDend) {
278              gffrec.covlen+=gffrec.CDend-gffrec.exons.Last()->end;
279              gffrec.exons.Last()->end=gffrec.CDend;
280              gffrec.end=gffrec.CDend;
281              }
282          }
283   if (seglst!=NULL) seglst->Last()->end+=realadj;
284   return realadj;
285  }
286 
printTableData(FILE * f,GffObj & g,bool inFasta)287 void printTableData(FILE* f, GffObj& g, bool inFasta) {
288  //using attribute list in tableCols
289 	const int DBUF_LEN=1024; //there should not be attribute values larger than 1K!
290 	char dbuf[DBUF_LEN];
291 	char* av=NULL;
292 	for(int i=0;i<tableCols.Count();i++) {
293 		if (i>0 || inFasta) {
294      	   if (!inFasta || tableCols[i].type!=ctfGFF_ID)
295      		   fprintf(f,"\t");
296 		}
297 		switch(tableCols[i].type) {
298 		case ctfGFF_Attr:
299 			av=g.getAttr(tableCols[i].name.chars());
300 			if (av) {
301 				if (decodeChars) {
302 					GffObj::decodeHexChars(dbuf, av, DBUF_LEN-1);
303 					fprintf(f,"%s", dbuf);
304 				} else fprintf(f,"%s",av);
305 			} else fprintf(f,".");
306 			break;
307 		case ctfGFF_chr:
308 			fprintf(f,"%s",g.getGSeqName());
309 			break;
310 		case ctfGFF_ID:
311 			if (!inFasta)
312 			  fprintf(f,"%s",g.getID());
313 			break;
314 		case ctfGFF_geneID:
315 			fprintf(f,"%s",g.getGeneID()!=NULL ? g.getGeneID() : ".");
316 			break;
317 		case ctfGFF_geneName:
318 			fprintf(f,"%s",g.getGeneName()!=NULL ? g.getGeneName() : ".");
319 			break;
320 		case ctfGFF_Parent:
321 			fprintf(f,"%s",g.parent!=NULL ? g.parent->getID() : ".");
322 			break;
323 		case ctfGFF_feature:
324 			fprintf(f,"%s",g.getFeatureName());
325 			break;
326 		case ctfGFF_start:
327 			fprintf(f,"%d",g.start);
328 			break;
329 		case ctfGFF_end:
330 			fprintf(f,"%d",g.end);
331 			break;
332 		case ctfGFF_strand:
333 			fprintf(f,"%c",g.strand);
334 			break;
335 		case ctfGFF_numexons:
336 			fprintf(f,"%d",g.exons.Count());
337 			break;
338 		case ctfGFF_exons:
339 			if (g.exons.Count()>0) {
340 				for (int x=0;x<g.exons.Count();x++) {
341 					if (x>0) fprintf(f,",");
342 					fprintf(f,"%d-%d",g.exons[x]->start, g.exons[x]->end);
343 				}
344 			} else fprintf(f,".");
345 			break;
346 		case ctfGFF_cds:
347 			if (g.hasCDS()) {
348 				GVec<GffExon> cds;
349 				g.getCDSegs(cds);
350 				for (int x=0;x<cds.Count();x++) {
351 					if (x>0) fprintf(f,",");
352 				    fprintf(f,"%d-%d",cds[x].start, cds[x].end);
353 				}
354 			}
355 			else fprintf(f,".");
356 			break;
357 		case ctfGFF_covlen:
358 			fprintf(f, "%d", g.covlen);
359 			break;
360 		case ctfGFF_cdslen:
361 			if (g.hasCDS()) {
362 				GVec<GffExon> cds;
363 				g.getCDSegs(cds);
364 				int clen=0;
365 				for (int x=0;x<cds.Count();x++)
366 				    clen+=cds[x].end-cds[x].start+1;
367 				fprintf(f, "%d", clen);
368 			}
369 			else fprintf(f, "0");
370 			break;
371 		} //switch
372 	}
373 	fprintf(f,"\n");
374 }
375 
validateGffRec(GffObj * gffrec)376 bool GffLoader::validateGffRec(GffObj* gffrec) {
377 	if (!checkFilters(gffrec)) {
378 		if (gffrec->isTranscript()) {
379 			TFilters=true;
380 			if (gffrec->parent!=NULL && keepGenes) {
381 			   GPVec<GffObj>& pchildren=gffrec->parent->children;
382 			   for (int c=0;c<pchildren.Count();c++) {
383 				   if (pchildren[c]==gffrec) {
384 					   pchildren.Delete(c);
385 					   break;
386 				   }
387 			   }
388 			}
389 		return false;
390 		}
391 		if (gffrec->isGene() && keepGenes) return true;
392 		return false;
393 	} //transcript rejected
394 	return true;
395 }
396 
checkFilters(GffObj * gffrec)397 bool GffLoader::checkFilters(GffObj* gffrec) {
398 	if (reftbl.Count()>0) { //check if we need to reject by ref seq filter
399 		GStr refname(gffrec->getRefName());
400 		RefTran* rt=reftbl.Find(refname.chars());
401 		if (rt==NULL && refname.length()>2 && refname[-2]=='.' && isdigit(refname[-1])) {
402 			//try removing the version suffix
403 			refname.cut(-2);
404 			//GMessage("[DEBUG] Trying ref name '%s'...\n", refname.chars());
405 			rt=reftbl.Find(refname.chars());
406 		}
407 		if (rt) {
408 			gffrec->setRefName(rt->new_name);
409 		}
410 		/* //no, do not discard non-matching entries, let them pass through!
411 		else {
412 			if (verbose)
413 				GMessage("Info: %s discarded due to reference %s not being mapped\n",
414 						gffrec->getID(), refname.chars());
415 			return false; //discard, ref seq not in the given translation table
416 		}*/
417 	}
418 	if (transcriptsOnly && gffrec->isDiscarded()) {
419 		//discard generic "locus" features with no other detailed subfeatures
420 		//GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",gffrec->getID());
421 		return false;
422 	}
423 	if (IDflt) {
424 		if (fltIDs.hasKey(gffrec->getID())) {
425 			if (IDflt==idFlt_Exclude) return false;
426 		}
427 		else if (IDflt==idFlt_Only) return false;
428 	}
429 	if (minLen>0 && gffrec->covlen<minLen) {
430 		if (verbose)
431 			GMessage("Info: %s discarded due to minimum length threshold %d\n",
432 					gffrec->getID(), minLen);
433     	return false;
434 	}
435 	if (fltRange!=NULL) { //filter by gseqName
436 		if (fltRange->refName!=NULL && strcmp(gffrec->getGSeqName(),fltRange->refName)!=0) {
437 			return false;
438 		}
439 		if (fltRange->strand>0 && gffrec->strand!=fltRange->strand) {
440 			return false;
441 		}
442 		//check coordinates
443 		if (fltRange->start || fltRange->end<UINT_MAX) {
444 			if (rfltWithin) {
445 				if (gffrec->start<fltRange->start || gffrec->end>fltRange->end) {
446 					return false; //not within query range
447 				}
448 			}
449 			else {
450 				if (gffrec->start>fltRange->end || gffrec->end<fltRange->start) {
451 					return false;
452 				}
453 			}
454 		}
455 	}
456 	if (this->attrsFilter) { //mostly relevant for transcripts and gene records
457 		//remove attributes that are not in attrList
458 		gffrec->removeAttrs(attrList);
459 	}
460     if (gffrec->isTranscript()) {    // && TFilters) ?
461     	//these filters only apply to transcripts
462 		if (multiExon && gffrec->exons.Count()<=1) {
463 			return false;
464 		}
465 		if (wCDSonly && gffrec->CDstart==0) {
466 			return false;
467 		}
468 		if (wNConly && gffrec->hasCDS()) return false;
469 		if (fltJunction!=NULL) {
470 			if (gffrec->exons.Count()<=1) return false;
471 			if (fltJunction->refName!=NULL && strcmp(gffrec->getGSeqName(),fltJunction->refName)!=0) {
472 				return false;
473 			}
474 			if (fltJunction->strand && gffrec->strand!=fltJunction->strand) {
475 				return false;
476 			}
477 			//check coordinates
478 			uint jstart=fltJunction->start;
479 			uint jend=fltJunction->end;
480 			if (jstart==0) jstart=jend;
481 			if (jend==0)  jend=jstart;
482 			if (gffrec->start>=jstart || gffrec->end<=jend) {
483 						return false;
484 	        }
485 
486 			bool noJMatch=true;
487 			for (int i=0;i<gffrec->exons.Count()-1;++i) {
488 				if (fltJunction->start && fltJunction->end) {
489 					if (gffrec->exons[i]->end+1==fltJunction->start &&
490 							gffrec->exons[i+1]->start-1==fltJunction->end)
491 						{ noJMatch=false; break; }
492 				} else if (fltJunction->start) { //end match not required
493 					if (gffrec->exons[i]->end+1==fltJunction->start)
494 						{ noJMatch=false; break; }
495 				} else { //only end match required:
496 					if (gffrec->exons[i+1]->start-1==fltJunction->end)
497 						{ noJMatch=false; break; }
498 				}
499 			}
500 			if (noJMatch) return false;
501 		}
502 
503 		return process_transcript(gfasta, *gffrec);
504     } //transcript filters check
505 	return true;
506 }
507 
process_transcript(GFastaDb & gfasta,GffObj & gffrec)508 bool GffLoader::process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
509  if (!gffrec.isTranscript()) return false; //shouldn't call this function unless it's a transcript
510  //returns true if the transcript passed the filter
511  char* gname=gffrec.getGeneName();
512  if (gname==NULL) gname=gffrec.getGeneID();
513  if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) {
514       const char* biotype=gffrec.getAttr("gene_biotype");
515       if (biotype) {
516          gffrec.addAttr("type", biotype);
517          gffrec.removeAttr("gene_biotype");
518          }
519        else { //old Ensembl files lacking gene_biotype
520          gffrec.addAttr("type", gffrec.getTrackName());
521          }
522 
523       //bool is_gene=false;
524       bool is_pseudo=false;
525       if (strcmp(biotype, "protein_coding")==0 || gffrec.hasCDS())
526                 gffrec.setFeatureName("mRNA");
527        else {
528           if (strcmp(biotype, "processed_transcript")==0)
529               gffrec.setFeatureName("proc_RNA");
530             else {
531               //is_gene=endsWith(biotype, "gene");
532               is_pseudo=strifind(biotype, "pseudo");
533               if (is_pseudo) {
534                    gffrec.setFeatureName("pseudo_RNA");
535                    }
536                 else if (endsWith(biotype, "RNA")) {
537                    gffrec.setFeatureName(biotype);
538                    } else gffrec.setFeatureName("misc_RNA");
539               }
540           }
541       }
542  if (gname && strcmp(gname, gffrec.getID())!=0) {
543 	   int* isonum=isoCounter.Find(gname);
544 	   if  (isonum==NULL) {
545 		   //isonum=new int(1);
546 		   isoCounter.Add(gname,1);
547 	   }
548 		else (*isonum)++;
549 	   //defline.appendfmt(" gene=%s", gname);
550    }
551   int seqlen=0;
552 
553   const char* tlabel=tracklabel;
554   if (tlabel==NULL) tlabel=gffrec.getTrackName();
555   //defline.appendfmt(" track:%s",tlabel);
556   char* cdsnt = NULL;
557   char* cdsaa = NULL;
558   int aalen=0;
559   for (int i=1;i<gffrec.exons.Count();i++) {
560      int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1;
561      if (verbose && ilen>4000000)
562             GMessage("Warning: very large intron (%d) for transcript %s\n",
563                            ilen, gffrec.getID());
564      if (ilen>maxintron) {
565          return false;
566      }
567   }
568   GMapSegments seglst(gffrec.strand);
569   GFaSeqGet* faseq=NULL;
570   if (f_x!=NULL || f_y!=NULL || f_w!=NULL || spliceCheck || validCDSonly || addCDSattrs) {
571 	  faseq=fastaSeqGet(gfasta, gffrec.getGSeqName());
572       if (faseq==NULL)
573 	    	GError("Error: no genomic sequence available (check -g option!).\n");
574   }
575   if (spliceCheck && gffrec.exons.Count()>1) {
576     //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
577     int glen=gffrec.end-gffrec.start+1;
578     const char* gseq=faseq->subseq(gffrec.start, glen);
579     if (gseq==NULL) {
580     	GMessage("Error at GFF ID %s : could not retrieve subsequence %s:%d-%d !\n",
581     			  gffrec.getID(), gffrec.getRefName(), gffrec.start, gffrec.end);
582     	return false;
583     }
584     bool revcompl=(gffrec.strand=='-');
585     bool ssValid=true;
586     for (int e=1;e<gffrec.exons.Count();e++) {
587       const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start;
588       int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1;
589       GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
590       GSpliceSite    donorSite(intron,intronlen, false, revcompl);
591       //GMessage("%c intron %d-%d : %s .. %s\n",
592       //           gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt);
593       if (acceptorSite=="AG") { // GT-AG or GC-AG
594          if (!donorSite.canonicalDonor()) {
595             ssValid=false;break;
596             }
597          }
598       else if (acceptorSite=="AC") { //AT-AC also accepted
599          if (donorSite!="AT") { ssValid=false; break; }
600          }
601       else { ssValid=false; break; }
602       }
603     if (!ssValid) {
604       if (verbose)
605          GMessage("Unrecognized splice sites found for '%s'\n",gffrec.getID());
606       return false; //don't print this one!
607     }
608   }
609   bool trprint=true;
610   bool inframeStop=false;
611   //int stopCodonAdjust=0;
612   int mCDphase=0;
613   bool fullCDS=false;
614   bool endStop=false;
615   bool stopAdjusted=false;
616   if (add_hasCDS && gffrec.hasCDS()) gffrec.addAttr("hasCDS", "true");
617   if (gffrec.CDphase=='1' || gffrec.CDphase=='2')
618       mCDphase = gffrec.CDphase-'0';
619   //CDS partialness only added when -y -x -V options are given
620   if (gffrec.hasCDS() && (f_y!=NULL || f_x!=NULL || validCDSonly || addCDSattrs)) {
621     int strandNum=0;
622     int phaseNum=0;
623   CDS_CHECK:
624     uint cds_olen=0;
625     inframeStop=false;
626     cdsnt=gffrec.getSpliced(faseq, true, &seqlen, NULL, &cds_olen, &seglst, adjustStop);
627     //if adjustStop, seqlen has the CDS+3'UTR length, but cds_olen still has the original CDS length
628     if (cdsnt!=NULL && cdsnt[0]!='\0') { //has CDS
629          cdsaa=translateDNA(cdsnt, aalen, seqlen);
630          char* p=strchr(cdsaa,'.');
631          int cds_aalen=aalen;
632          if (adjustStop)
633         	 cds_aalen=cds_olen/3; //originally stated CDS length
634          endStop=false;
635          if (p!=NULL) { //stop codon found
636         	 if (p-cdsaa==cds_aalen-1) { //stop found as the stated last CDS codon
637                   *p='\0';//remove it
638                   endStop=true;
639                   if (adjustStop) {
640                 	  seqlen=cds_aalen*3;
641                 	  aalen=cds_aalen;
642                   }
643                   cds_aalen--;
644                   aalen--;
645                   //no need to adjust stop codon
646               }
647               else {//stop found in a different position than the last codon
648             	  if (p-cdsaa<cds_aalen-1 && !adjustStop) {
649             		  inframeStop=true;
650             	  }
651             	  if (adjustStop) {
652             		  *p='\0';
653             		  cds_aalen=p-cdsaa+1; //adjusted CDS length
654             		  seqlen=cds_aalen*3;
655             		  aalen=cds_aalen;
656             		  uint gc=seglst.gmap(seqlen);
657             		  if (gffrec.strand=='-') gffrec.CDstart=gc;
658             		  else gffrec.CDend=gc;
659             		  endStop=true;
660             		  stopAdjusted=true;
661             	  }
662               }
663          }//stop codon found
664          //if (trprint==false) { //failed CDS validity check
665          if (inframeStop) {
666            //in-frame stop codon found
667            if (altPhases && phaseNum<3) {
668               phaseNum++; //try a different phase
669               gffrec.CDphase = '0'+((mCDphase+phaseNum)%3);
670               GFREE(cdsaa);
671               goto CDS_CHECK;
672            }
673            if (gffrec.exons.Count()==1 && bothStrands) {
674               strandNum++;
675               phaseNum=0;
676               if (strandNum<2) {
677                  GFREE(cdsaa);
678                  gffrec.strand = (gffrec.strand=='-') ? '+':'-';
679                  goto CDS_CHECK; //repeat the CDS check for a different frame
680               }
681            }
682            if (verbose) GMessage("Warning: In-frame STOP found for '%s'\n",gffrec.getID());
683            if (addCDSattrs) gffrec.addAttr("InFrameStop", "true");
684          } //has in-frame STOP
685          if (stopAdjusted) {
686       	   if (addCDSattrs) gffrec.addAttr("CDStopAdjusted", "true");
687       	   inframeStop=false; //pretend it's OK now that we've adjusted it
688          }
689          if (!inframeStop) {
690 			 bool hasStart=(cdsaa[0]=='M'); //for the regular eukaryotic translation table
691 			 fullCDS=(endStop && hasStart);
692 			 if (!fullCDS) {
693 				 const char* partialness=NULL;
694 				 if (hasStart) partialness="3";
695 				 else {
696 					partialness = endStop ? "5" : "5_3";
697 				 }
698 				 if (addCDSattrs) gffrec.addAttr("partialness", partialness);
699 			 }
700          }
701          if (trprint && ((fullCDSonly && !fullCDS) || (validCDSonly && inframeStop)) )
702         	 trprint=false;
703          //} // Valid CDS only requested?
704       } //has CDS
705   } //translation or codon check was requested
706   if (!trprint) {
707     GFREE(cdsnt);
708     GFREE(cdsaa);
709     //if (adjstop!=NULL) delete adjstop;
710     return false;
711   }
712   /*
713   if (validCDSonly) {
714      int stopCodonAdjust=adjstop->restore();
715      if (stopCodonAdjust!=0 && !endStop) {
716         //restore stop codon location
717         //adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst);
718 	    if (seglst.Count()>0) seglst.Last()->end-=stopCodonAdjust;
719         if (cdsnt!=NULL && seqlen>0) {
720            seqlen-=stopCodonAdjust;
721            cdsnt[seqlen]=0;
722         }
723         if (cdsaa!=NULL) aalen--;
724      }
725   }
726   if (adjstop!=NULL) delete adjstop;
727   */
728   if (cdsnt!=NULL) { // && !inframeStop) {
729 	  GStr defline(gffrec.getID(), 94);
730 	  if (writeExonSegs) {
731 		  defline.append(" loc:");
732 		  defline.append(gffrec.getGSeqName());
733 		  defline.appendfmt("(%c)",gffrec.strand);
734 		  //warning: not CDS coordinates are written here, but the exon ones
735 		  defline+=(int)gffrec.start;
736 		  defline+=(char)'-';
737 		  defline+=(int)gffrec.end;
738 		  // -- here these are CDS substring coordinates on the spliced sequence:
739 		  defline.append(" segs:");
740 		  for (int i=0;i<seglst.Count();i++) {
741 			  if (i>0) defline.append(",");
742 			  defline+=(int)seglst[i].start;
743 			  defline.append("-");
744 			  defline+=(int)seglst[i].end;
745 		  }
746 	  }
747 	  if (f_y!=NULL) { //CDS translation fasta output requested
748 			 if (cdsaa==NULL) { //translate now if not done before
749 			   cdsaa=translateDNA(cdsnt, aalen, seqlen);
750 			 }
751 			 if (aalen>0) {
752 			   if (cdsaa[aalen-1]=='.' || cdsaa[aalen-1]=='\0') --aalen; //avoid printing the stop codon
753  			   fprintf(f_y, ">%s", defline.chars());
754  			   if (fmtTable) printTableData(f_y, gffrec, true);
755  			   else {
756  				  if (gffrec.attrs!=NULL && gffrec.attrs->Count()>0) fprintf(f_y," ");
757  				  gffrec.printAttrs(f_y, ";", false, decodeChars, false);
758  				  fprintf(f_y, "\n");
759  			   }
760 			   printFasta(f_y, NULL, cdsaa, aalen, StarStop);
761 			 }
762 	  }
763 	  if (f_x!=NULL) { //CDS only
764 			 fprintf(f_x, ">%s", defline.chars());
765 			 if (fmtTable) printTableData(f_x, gffrec, true);
766 			 else {
767 				 if (gffrec.attrs!=NULL && gffrec.attrs->Count()>0) fprintf(f_x," ");
768 				 gffrec.printAttrs(f_x, ";", false, decodeChars, false);
769 				 fprintf(f_x, "\n");
770 			 }
771 			 printFasta(f_x, NULL, cdsnt, seqlen);
772 	  }
773 	  GFREE(cdsnt);
774 	  GFREE(cdsaa);
775   } //writing CDS or its translation
776   if (f_w!=NULL) { //write spliced exons
777 	  uint cds_start=0;
778 	  uint cds_end=0;
779 	  seglst.Clear();
780 	  int padLeft=0;
781 	  int padRight=0;
782 	  if (wPadding>0) {
783 		padLeft= (gffrec.start>(uint)wPadding) ? wPadding : gffrec.start - 1;
784 		int ediff=faseq->getseqlen()-gffrec.end;
785 	    padRight=(wPadding>ediff) ?  ediff : wPadding;
786    	    gffrec.addPadding(padLeft, padRight);
787 	  }
788 	  char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
789 	  //restore exons to normal (remove padding)
790 	  if (wPadding>0)
791 		  gffrec.removePadding(padLeft, padRight);
792 
793 	  GStr defline(gffrec.getID());
794 	  if (exont!=NULL) {
795 		  if (!wfaNoCDS && gffrec.CDstart>0) {
796 			  defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
797 		  }
798 		  if (writeExonSegs) {
799 			  defline.append(" loc:");
800 			  defline.append(gffrec.getGSeqName());
801 			  defline+=(char)'|';
802 			  defline+=(int)gffrec.start;
803 			  defline+=(char)'-';
804 			  defline+=(int)gffrec.end;
805 			  defline+=(char)'|';
806 			  defline+=(char)gffrec.strand;
807 			  defline.append(" exons:");
808 			  for (int i=0;i<gffrec.exons.Count();i++) {
809 				  if (i>0) defline.append(",");
810 				  defline+=(int)gffrec.exons[i]->start;
811 				  defline.append("-");
812 				  defline+=(int)gffrec.exons[i]->end;
813 			  }
814 			if (wPadding>0) {
815 				defline.append(" padding:");
816 				defline.append(padLeft);
817 				defline+=(char)'|';
818 				defline.append(padRight);
819 			}
820 
821 			defline.append(" segs:");
822 			for (int i=0;i<seglst.Count();i++) {
823 				if (i>0) defline.append(",");
824 				defline+=(int)seglst[i].start;
825 				defline.append("-");
826 				defline+=(int)seglst[i].end;
827 				}
828 		  }
829 
830 		  fprintf(f_w, ">%s", defline.chars());
831 		  if (fmtTable) printTableData(f_w, gffrec, true);
832 		    else {
833 		    	if (gffrec.attrs!=NULL && gffrec.attrs->Count()>0) fprintf(f_w," ");
834 		    	gffrec.printAttrs(f_w, ";", false, decodeChars, false);
835 		    	fprintf(f_w, "\n");
836 		    }
837 		  printFasta(f_w, NULL, exont, seqlen);
838 		  GFREE(exont);
839 	  }
840   } //writing f_w (spliced exons)
841   return true;
842 }
843 
844 
GTData(GffObj * t,GenomicSeqData * gd)845 GTData::GTData(GffObj* t, GenomicSeqData* gd):rna(t),gdata(gd), locus(NULL), replaced_by(NULL), geneinfo(NULL) {
846     if (rna!=NULL) {
847         //geneinfo=(GeneInfo*)rna->uptr; //take over geneinfo, if there
848         rna->uptr=this;
849     }
850     if (gdata!=NULL)
851  	   gdata->tdata.Add(this);
852 }
853 
unsplContained(GffObj & ti,GffObj & tj)854 bool GffLoader::unsplContained(GffObj& ti, GffObj&  tj) {
855  //returns true only if ti (which MUST be single-exon) is "almost" contained in any of tj's exons
856  //but it does not cross any intron-exon boundary of tj
857   int imax=ti.exons.Count()-1;
858   int jmax=tj.exons.Count()-1;
859   if (imax>0) GError("Error: bad unsplContained() call, 1st parameter must be single-exon transcript!\n");
860   if (fuzzSpan) {
861     int maxIntronOvl=dOvlSET ? 25 : 0;
862     //int minovl = dOvlSET ? 5 : (int)(0.8 * ti.len()); //minimum overlap to declare "redundancy"
863     for (int j=0;j<=jmax;j++) {
864        bool exonOverlap=false;
865        if (dOvlSET) {
866     	   exonOverlap= (tj.exons[j]->overlapLen(ti.start-1, ti.end+1) > 0);
867        } else {
868     	   exonOverlap=(ti.overlapLen(tj.exons[j])>=0.8 * ti.len());
869        }
870        if (exonOverlap) {
871           //must not overlap the introns
872           if ((j>0 && ti.start+maxIntronOvl<tj.exons[j]->start)
873              || (j<jmax && ti.end>tj.exons[j]->end+maxIntronOvl))
874              return false;
875           return true;
876        }
877     } //for each exon
878   } else { // not fuzzSpan, strict containment required
879     for (int j=0;j<=jmax;j++) {
880         if (ti.end<=tj.exons[j]->end && ti.start>=tj.exons[j]->start)
881           return true;
882     }
883  }
884  return false;
885 }
886 
redundantTranscripts(GffObj & ti,GffObj & tj)887 GffObj* GffLoader::redundantTranscripts(GffObj& ti, GffObj&  tj) {
888   // matchAllIntrons==true:  transcripts are considered "redundant" only if
889   //                   they have the exact same number of introns and same splice sites (or none)
890   //                   (single-exon transcripts should be also fully contained to be considered matching)
891   // matchAllIntrons==false: an intron chain could be a subset of a "container" chain,
892   //                   as long as no intron-exon boundaries are violated; also, a single-exon
893   //                   transcript will be collapsed if it's contained in one of the exons of the another transcript
894   // fuzzSpan==false: the genomic span of one transcript MUST BE contained in or equal with the genomic
895   //                  span of the other
896   //
897   // fuzzSpan==true: then genomic spans of transcripts are no longer required to be fully contained
898   //                 (i.e. they may extend each-other in opposite directions)
899 
900   //if redundancy is detected, the "bigger" transcript is returned (otherwise NULL is returned)
901  int adj=dOvlSET ? 1 : 0;
902  if (ti.start>tj.end+adj || tj.start>ti.end+adj ||
903 		 (tj.strand!='.' && ti.strand!='.' && tj.strand!=ti.strand)) return NULL; //no span overlap
904  int imax=ti.exons.Count()-1;
905  int jmax=tj.exons.Count()-1;
906  GffObj* bigger=NULL;
907  GffObj* smaller=NULL;
908  if (matchAllIntrons) { //full intron chain match expected, or full containment for SET
909    if (imax!=jmax) return NULL; //must have the same number of exons!
910    if (ti.covlen>tj.covlen) {
911       bigger=&ti;
912       if (!fuzzSpan && (ti.start>tj.start || ti.end<tj.end))
913         return NULL; //no containment
914    }
915    else { //ti.covlen<=tj.covlen
916       bigger=&tj;
917       if (!fuzzSpan && (tj.start>ti.start || tj.end<ti.end))
918          return NULL; //no containment
919    }
920    //check that all introns really match
921    for (int i=0;i<imax;i++) {
922      if (ti.exons[i]->end!=tj.exons[i]->end ||
923          ti.exons[i+1]->start!=tj.exons[i+1]->start) return NULL;
924      }
925    return bigger;
926  }
927  //--- matchAllIntrons==false: intron-chain containment is also considered redundancy
928  int minlen=0;
929  if (ti.covlen>tj.covlen) {
930       if (tj.exons.Count()>ti.exons.Count()) {
931           //exon count override
932           bigger=&tj;
933           smaller=&ti;
934       } else {
935           bigger=&ti;
936           smaller=&tj;
937       }
938       //maxlen=ti.covlen;
939       minlen=tj.covlen;
940  } else { //tj has more bases covered
941       if (ti.exons.Count()>tj.exons.Count()) {
942           //exon count override
943           bigger=&ti;
944           smaller=&tj;
945       } else {
946           bigger=&tj;
947           smaller=&ti;
948       }
949       //maxlen=tj.covlen;
950       minlen=ti.covlen;
951  }
952  if (imax==0 && jmax==0) {
953      //single-exon transcripts: if fuzzSpan, at least 80% of the shortest one must be overlapped by the other
954      if (fuzzSpan) {
955        if (dOvlSET) {
956            return (ti.exons[0]->overlapLen(tj.exons[0]->start-1, tj.exons[0]->end+1)>0) ? bigger : NULL;
957        } else {
958           return (ti.exons[0]->overlapLen(tj.exons[0])>=minlen*0.8) ? bigger : NULL;
959        }
960      } else { //boundary containment required
961        return (smaller->start>=bigger->start && smaller->end<=bigger->end) ? bigger : NULL;
962      }
963  }
964  //containment is also considered redundancy
965  if (smaller->exons.Count()==1) {
966    //check if this single exon is contained in any of tj exons
967    //without violating any intron-exon boundaries
968    return (unsplContained(*smaller, *bigger) ? bigger : NULL);
969  }
970 
971  //--- from here on: both are multi-exon transcripts: imax>0 && jmax>0
972   if (ti.exons[imax]->start<tj.exons[0]->end ||
973      tj.exons[jmax]->start<ti.exons[0]->end )
974          return NULL; //intron chains do not overlap at all
975  //checking full intron chain containment
976  uint eistart=0, eiend=0, ejstart=0, ejend=0; //exon boundaries
977  int i=1; //exon idx to the right of the current intron of ti
978  int j=1; //exon idx to the right of the current intron of tj
979  //find the first intron overlap:
980  while (i<=imax && j<=jmax) {
981     eistart=ti.exons[i-1]->end;
982     eiend=ti.exons[i]->start;
983     ejstart=tj.exons[j-1]->end;
984     ejend=tj.exons[j]->start;
985     if (ejend<eistart) { j++; continue; }
986     if (eiend<ejstart) { i++; continue; }
987     //we found an intron overlap
988     break;
989  }
990  if (!fuzzSpan && (bigger->start>smaller->start || bigger->end < smaller->end)) return NULL;
991  if ((i>1 && j>1) || i>imax || j>jmax) {
992      return NULL; //either no intron overlaps found at all
993                   //or it's not the first intron for at least one of the transcripts
994  }
995  if (eistart!=ejstart || eiend!=ejend) return NULL; //not an exact intron match
996  int maxIntronOvl=dOvlSET ? 25 : 0;
997  if (j>i) {
998    //i==1, ti's start must not conflict with the previous intron of tj
999    if (ti.start+maxIntronOvl<tj.exons[j-1]->start) return NULL;
1000    //comment out the line above if you just want "intron compatibility" (i.e. extension of intron chains )
1001    //so i's first intron starts AFTER j's first intron
1002    // then j must contain i, so i's last intron must end with or before j's last intron
1003    if (ti.exons[imax]->start>tj.exons[jmax]->start) return NULL;
1004  }
1005  else if (i>j) {
1006    //j==1, tj's start must not conflict with the previous intron of ti
1007    if (tj.start+maxIntronOvl<ti.exons[i-1]->start) return NULL;
1008    //comment out the line above for just "intronCompatible()" check (allowing extension of intron chain)
1009    //so j's intron chain starts AFTER i's
1010    // then i must contain j, so j's last intron must end with or before j's last intron
1011    if (tj.exons[jmax]->start>ti.exons[imax]->start) return NULL;
1012  }
1013  //now check if the rest of the introns overlap, in the same sequence
1014  i++;
1015  j++;
1016  while (i<=imax && j<=jmax) {
1017    if (ti.exons[i-1]->end!=tj.exons[j-1]->end ||
1018       ti.exons[i]->start!=tj.exons[j]->start) return NULL;
1019    i++;
1020    j++;
1021  }
1022  i--;
1023  j--;
1024  if (i==imax && j<jmax) {
1025    // tj has more introns to the right, check if ti's end doesn't conflict with the current tj exon boundary
1026    if (ti.end>tj.exons[j]->end+maxIntronOvl) return NULL;
1027    }
1028  else if (j==jmax && i<imax) {
1029    if (tj.end>ti.exons[i]->end+maxIntronOvl) return NULL;
1030    }
1031  return bigger;
1032 }
1033 
1034 
gseqCmpName(const pointer p1,const pointer p2)1035 int gseqCmpName(const pointer p1, const pointer p2) {
1036  return strcmp(((GenomicSeqData*)p1)->gseq_name, ((GenomicSeqData*)p2)->gseq_name);
1037 }
1038 
1039 
printLocus(GffLocus * loc,const char * pre)1040 void printLocus(GffLocus* loc, const char* pre) {
1041   if (pre!=NULL) fprintf(stderr, "%s", pre);
1042   GMessage(" [%d-%d] : ", loc->start, loc->end);
1043   GMessage("%s",loc->rnas[0]->getID());
1044   for (int i=1;i<loc->rnas.Count();i++) {
1045     GMessage(",%s",loc->rnas[i]->getID());
1046     }
1047   GMessage("\n");
1048 }
1049 
preserveContainedCDS(GffObj * tcontainer,GffObj * t)1050 void preserveContainedCDS(GffObj* tcontainer, GffObj* t) {
1051  //transfer contained CDS info to the container if t has a CDS but container does not
1052  if (!t->hasCDS()) return;
1053   if (!tcontainer->hasCDS())//no CDS info on container, just copy it from the contained
1054 	 tcontainer->setCDS(t);
1055 }
1056 
exonOverlap2Gene(GffObj * t,GffObj & g)1057 bool exonOverlap2Gene(GffObj* t, GffObj& g) {
1058 	if (t->exons.Count()>0) {
1059 		return t->exonOverlap(g.start, g.end);
1060 	}
1061 	else return g.overlap(*t);
1062 }
placeGf(GffObj * t,GenomicSeqData * gdata)1063 bool GffLoader::placeGf(GffObj* t, GenomicSeqData* gdata) {
1064   bool keep=false;
1065   GTData* tdata=NULL;
1066   //int tidx=-1;
1067   /*
1068   if (debug) {
1069      GMessage(">>Placing transcript %s\n", t->getID());
1070      debugState=true;
1071      }
1072     else debugState=false;
1073    */
1074   //dumb TRNA case for RefSeq: gene parent link missing
1075   //try to restore it here; BUT this only works if gene feature comes first
1076   ////DEBUG ONLY:
1077   //if (strcmp(t->getID(),"id24448")==0) { //&& t->start==309180) {
1078   //	 GMessage("placeGf %s (%d, %d) (%d exons)\n", t->getID(),t->start, t->end, t->exons.Count());
1079   //}
1080   //GMessage("DBG>>Placing transcript %s(%d-%d, %d exons)\n", t->getID(), t->start, t->end, t->exons.Count());
1081 
1082   if (t->parent==NULL && t->isTranscript() && trAdoption) {
1083   	int gidx=gdata->gfs.Count()-1;
1084   	while (gidx>=0 && gdata->gfs[gidx]->end>=t->start) {
1085   		GffObj& g = *(gdata->gfs[gidx]);
1086   		//try to find a container gene object for this transcript
1087   		//if (g.isGene() && t->strand==g.strand && exonOverlap2Gene(t, g)) {
1088   		if (g.isGene() && (t->strand=='.' || t->strand==g.strand) && g.exons.Count()==0
1089   				  && t->start>=g.start && t->end<=g.end) {
1090   			if (g.children.IndexOf(t)<0)
1091   				g.children.Add(t);
1092   			keep=true;
1093   			if (tdata==NULL) {
1094   		       tdata=new GTData(t, gdata); //additional transcript data
1095   			}
1096   			t->parent=&g;
1097   			//disable printing of gene if transcriptsOnly and --keep-genes wasn't given
1098   			if (transcriptsOnly && !keepGenes) {
1099   				T_NO_PRINT(g.udata); //tag it as non-printable
1100   				//keep gene ID and Name into transcript, when we don't print genes
1101   	  			const char* geneName=g.getAttr("Name");
1102   	  			if (t->getAttr("Name")==NULL && geneName) {
1103   	  				t->addAttr("Name", geneName);
1104   	  				if (t->getAttr("gene_name")==NULL)
1105   	  					t->addAttr("gene_name", geneName);
1106   	  			}
1107   	  			t->addAttr("geneID", g.getID());
1108   			}
1109   			break;
1110   		}
1111   		--gidx;
1112   	}
1113   }
1114   bool noexon_gfs=false;
1115   if (t->exons.Count()>0) { //treating this entry as a transcript
1116 	gdata->rnas.Add(t); //added it in sorted order
1117 	if (tdata==NULL) {
1118 	   tdata=new GTData(t, gdata); //additional transcript data
1119 	   //gdata->tdata.Add(tdata);
1120 	}
1121 	keep=true;
1122   }
1123    else {
1124     if (t->isGene() || !this->transcriptsOnly) {
1125 	   gdata->gfs.Add(t);
1126 	   keep=true;
1127 	   if (tdata==NULL) {
1128 		   tdata=new GTData(t, gdata); //additional transcript data
1129 		   //gdata->tdata.Add(tdata);
1130 	   }
1131 	   noexon_gfs=true; //gene-like record, no exons defined
1132 	   keep=true;
1133     } else {
1134        return false; //nothing to do with these non-transcript objects
1135     }
1136   }
1137   //keeping track of genes in special cases
1138 	char* geneid=t->getGeneID();
1139 	bool trackGenes=!t->isGene() && ( (keepGenes && t->parent==NULL) ||
1140 			    (ensembl_convert && startsWith(t->getID(), "ENS") ) ) ;
1141 	if (trackGenes) {
1142 		GTData* tdata=(GTData*)(t->uptr);
1143 		//keep track of chr|gene_id data and coordinate range
1144 		if (geneid!=NULL) {
1145 			GeneInfo* ginfo=gene_ids.Find(geneid);
1146 			if (ginfo==NULL) {//first time seeing this gene ID
1147 				GeneInfo* geneinfo=new GeneInfo(t, tdata->gdata, ensembl_convert);
1148 				gene_ids.Add(geneid, geneinfo);
1149 				//if (gfnew!=NULL) //new gene features
1150 				//  gfnew->Add(geneinfo->gf);
1151 			}
1152 			else ginfo->update(t);
1153 		}
1154 	}
1155 
1156 
1157 
1158   if (!doCluster) return keep;
1159 
1160   if (!keep) return false;
1161 
1162   //---- place into a locus
1163   if (dOvlSET && t->exons.Count()==1) {
1164 	  //for single exon transcripts temporarily set the strand to '.'
1165 	  //so we can check both strands for overlap/locus
1166       T_SET_OSTRAND(t->udata, t->strand);
1167       t->strand='.';
1168   }
1169   if (gdata->loci.Count()==0) {
1170        gdata->loci.Add(new GffLocus(t));
1171        return true; //new locus on this ref seq
1172   }
1173   //--- look for any existing loci overlapping t
1174   uint t_end=t->end;
1175   uint t_start=t->start;
1176   if (dOvlSET) {
1177 	  t_end++;
1178 	  t_start--;
1179   }
1180   int nidx=qsearch_gloci(t_end, gdata->loci); //get index of nearest locus starting just ABOVE t->end
1181   //GMessage("\tlooking up end coord %d in gdata->loci.. (qsearch got nidx=%d)\n", t->end, nidx);
1182   if (nidx==0) {
1183      //cannot have any overlapping loci
1184      //if (debug) GMessage("  <<no ovls possible, create locus %d-%d \n",t->start, t->end);
1185      gdata->loci.Add(new GffLocus(t));
1186      return true;
1187   }
1188   if (nidx==-1) nidx=gdata->loci.Count();//all loci start below t->end
1189   int lfound=0; //count of parent loci
1190   GArray<int> mrgloci(false);
1191   GList<GffLocus> tloci(true); //candidate parent loci to adopt this
1192   //if (debug) GMessage("\tchecking all loci from %d to 0\n",nidx-1);
1193   for (int l=nidx-1;l>=0;l--) {
1194       GffLocus& loc=*(gdata->loci[l]);
1195       if ((loc.strand=='+' || loc.strand=='-') && t->strand!='.'&& loc.strand!=t->strand) continue;
1196       if (t_start>loc.end) {
1197            if (t->start-loc.start>GFF_MAX_LOCUS) break; //give up already
1198            continue;
1199       }
1200       if (loc.start>t_end) {
1201                //this should never be the case if nidx was found correctly
1202                GMessage("Warning: qsearch_gloci found loc.start>t.end!(t=%s)\n", t->getID());
1203                continue;
1204       }
1205 
1206       if (loc.add_gfobj(t, dOvlSET)) {
1207          //will add this transcript to loc
1208          lfound++;
1209          mrgloci.Add(l);
1210          if (collapseRedundant && !noexon_gfs) {
1211            //compare to every single transcript in this locus
1212            for (int ti=0;ti<loc.rnas.Count();ti++) {
1213                  if (loc.rnas[ti]==t) continue;
1214                  GTData* odata=(GTData*)(loc.rnas[ti]->uptr);
1215                  //GMessage("  ..redundant check vs overlapping transcript %s\n",loc.rnas[ti]->getID());
1216                  GffObj* container=NULL;
1217                  if (odata->replaced_by==NULL &&
1218                       (container=redundantTranscripts(*t, *(loc.rnas[ti])))!=NULL) {
1219                      if (container==t) {
1220                         odata->replaced_by=t;
1221                         preserveContainedCDS(t, loc.rnas[ti]);
1222                      }
1223                      else {// t is being replaced by previously defined transcript
1224                         tdata->replaced_by=loc.rnas[ti];
1225                         preserveContainedCDS(loc.rnas[ti], t);
1226                      }
1227                  }
1228            }//for each transcript in the exon-overlapping locus
1229          } //if doCollapseRedundant
1230       } //overlapping locus
1231   } //for each existing locus
1232   if (lfound==0) {
1233       //overlapping loci not found, create a locus with only this mRNA
1234       int addidx=gdata->loci.Add(new GffLocus(t));
1235       if (addidx<0) {
1236          //should never be the case!
1237          GMessage("  WARNING: new GffLocus(%s:%d-%d) not added!\n",t->getID(), t->start, t->end);
1238       }
1239    }
1240    else { //found at least one overlapping locus
1241      lfound--;
1242      int locidx=mrgloci[lfound];
1243      GffLocus& loc=*(gdata->loci[locidx]);
1244      //last locus index found is also the smallest index
1245      if (lfound>0) {
1246        //more than one loci found parenting this mRNA, merge loci
1247        /* if (debug)
1248           GMessage(" merging %d loci \n",lfound);
1249        */
1250        for (int l=0;l<lfound;l++) {
1251           int mlidx=mrgloci[l];
1252           loc.addMerge(*(gdata->loci[mlidx]), t);
1253           gdata->loci.Delete(mlidx); //highest indices first, so it's safe to remove
1254        }
1255      }
1256      int i=locidx;
1257      while (i>0 && loc<*(gdata->loci[i-1])) {
1258        //bubble down until it's in the proper order
1259        i--;
1260        gdata->loci.Swap(i,i+1);
1261      }
1262   }//found at least one overlapping locus
1263   return true;
1264 }
1265 
collectLocusData(GList<GenomicSeqData> & ref_data,bool covInfo)1266 void collectLocusData(GList<GenomicSeqData>& ref_data, bool covInfo) {
1267 	int locus_num=0;
1268 	for (int g=0;g<ref_data.Count();g++) {
1269 		GenomicSeqData* gdata=ref_data[g];
1270 		for (int l=0;l<gdata->loci.Count();l++) {
1271 			GffLocus& loc=*(gdata->loci[l]);
1272 			GHash<int> gnames; //gene names in this locus
1273 			//GHash<int> geneids(true); //Entrez GeneID: numbers
1274 			GHash<int> geneids;
1275 			int fstrand=0,rstrand=0,ustrand=0;
1276 			for (int i=0;i<loc.rnas.Count();i++) {
1277 				GffObj& t=*(loc.rnas[i]);
1278 				char tstrand=(char) T_OSTRAND(t.udata);
1279 				if (tstrand==0) tstrand=t.strand;
1280 				if (tstrand=='+') fstrand++;
1281 				 else if (tstrand=='-') rstrand++;
1282 				   else ustrand++;
1283 				GStr gname(t.getGeneName());
1284 				if (!gname.is_empty()) {
1285 					gname.upper();
1286 					int* prevg=gnames.Find(gname.chars());
1287 					if (prevg!=NULL) (*prevg)++;
1288 					else gnames.Add(gname.chars(), 1);
1289 				}
1290 				GStr geneid(t.getGeneID());
1291 				if (!geneid.is_empty()) {
1292 					int* prevg=gnames.Find(geneid.chars());
1293 					if (prevg!=NULL) (*prevg)++;
1294 					geneids.Add(geneid.chars(), 1);
1295 				}
1296 				//parse GeneID xrefs, if any (RefSeq):
1297 				/*
1298 				GStr xrefs(t.getAttr("xrefs"));
1299 				if (!xrefs.is_empty()) {
1300 					xrefs.startTokenize(",");
1301 					GStr token;
1302 					while (xrefs.nextToken(token)) {
1303 						token.upper();
1304 						if (token.startsWith("GENEID:")) {
1305 							token.cut(0,token.index(':')+1);
1306 							int* prevg=geneids.Find(token.chars());
1307 							if (prevg!=NULL) (*prevg)++;
1308 							else geneids.Add(token, new int(1));
1309 						}
1310 					} //for each xref
1311 				} //xrefs parsing
1312 				*/
1313 			}//for each transcript
1314             if ((fstrand>0 && rstrand>0) ||
1315             		 (fstrand==0 && rstrand==0)) loc.strand='.';
1316             else if (fstrand==0 && rstrand>0) loc.strand='-';
1317             else loc.strand='+';
1318 			for (int i=0;i<loc.gfs.Count();i++) {
1319 				GffObj& nt=*(loc.gfs[i]);
1320 				if (nt.isGene()) {
1321 					GStr gname(nt.getGeneName());
1322 					if (!gname.is_empty()) {
1323 						gname.upper();
1324 						int* prevg=gnames.Find(gname.chars());
1325 						if (prevg!=NULL) (*prevg)++;
1326 						else gnames.Add(gname, 1);
1327 					}
1328 					GStr geneid(nt.getID());
1329 					if (!geneid.is_empty()) {
1330 						int* prevg=gnames.Find(geneid.chars());
1331 						if (prevg!=NULL) (*prevg)++;
1332 						geneids.Add(geneid.chars(),1);
1333 					}
1334 				}
1335 				//parse GeneID xrefs, if any (RefSeq):
1336 				/*
1337 				GStr xrefs(nt.getAttr("xrefs"));
1338 				if (!xrefs.is_empty()) {
1339 					xrefs.startTokenize(",");
1340 					GStr token;
1341 					while (xrefs.nextToken(token)) {
1342 						token.upper();
1343 						if (token.startsWith("GENEID:")) {
1344 							token.cut(0,token.index(':')+1);
1345 							int* prevg=geneids.Find(token.chars());
1346 							if (prevg!=NULL) (*prevg)++;
1347 							else geneids.Add(token, new int(1));
1348 						}
1349 					} //for each xref
1350 				} //xrefs parsing
1351 				*/
1352 			}//for each non-transcript (genes?)
1353 			if (covInfo) {
1354 				for (int m=0;m<loc.mexons.Count();m++) {
1355 					if (loc.strand=='+')
1356 						gdata->f_bases+=loc.mexons[m].len();
1357 					else if (loc.strand=='-')
1358 						gdata->r_bases+=loc.mexons[m].len();
1359 					else gdata->u_bases+=loc.mexons[m].len();
1360 				}
1361 			}
1362 			locus_num++;
1363 			loc.locus_num=locus_num;
1364 			if (gnames.Count()>0) { //collect all gene names associated to this locus
1365 				gnames.startIterate();
1366 				int gfreq=0;
1367 				const char* key=NULL;
1368 				while ((key=gnames.Next(gfreq))!=NULL) {
1369 					loc.gene_names.AddIfNew(new CGeneSym(key, gfreq));
1370 				}
1371 			} //added collected gene_names
1372 			if (geneids.Count()>0) { //collect all GeneIDs names associated to this locus
1373 				geneids.startIterate();
1374 				int gfreq=0;
1375 				const char* key=NULL;
1376 				while ((key=geneids.Next(gfreq))!=NULL) {
1377 					loc.gene_ids.AddIfNew(new CGeneSym(key, gfreq));
1378 				}
1379 			}
1380 		} //for each locus
1381 	}//for each genomic sequence
1382 }
1383 
loadRefNames(GStr & flst)1384 void GffLoader::loadRefNames(GStr& flst) {
1385  //load the whole file and split by (' \t\n\r,'
1386 	int64_t fsize=fileSize(flst.chars());
1387 	if (fsize<0) GError("Error: could not get file size for %s !\n",
1388 			flst.chars());
1389 	GStr slurp("", fsize+1);
1390 	//sanity check for file size?
1391 	FILE* f=fopen(flst.chars(), "r");
1392 	if (f==NULL)
1393 		GError("Error: could not open file %s !\n", flst.chars());
1394 	slurp.read(f, NULL);
1395 	fclose(f);
1396 	slurp.startTokenize(" ,;\t\r\n", tkCharSet);
1397 	GStr refname;
1398 	while (slurp.nextToken(refname)) {
1399 		if (refname.is_empty()) continue;
1400 		names->gseqs.addName(refname.chars());
1401 	}
1402 }
1403 
getGSeqData(GList<GenomicSeqData> & seqdata,int gseq_id)1404 GenomicSeqData* getGSeqData(GList<GenomicSeqData>& seqdata, int gseq_id) {
1405 	int i=-1;
1406 	GenomicSeqData f(gseq_id);
1407 	GenomicSeqData* gdata=NULL;
1408 	if (seqdata.Found(&f,i)) gdata=seqdata[i];
1409 	else { //entry not created yet for this genomic seq
1410 		gdata=new GenomicSeqData(gseq_id);
1411 		seqdata.Add(gdata);
1412 	}
1413 	return gdata;
1414 }
1415 
warnPseudo(GffObj & m)1416 void warnPseudo(GffObj& m) {
1417 	GMessage("Info: pseudo gene/transcript record with ID=%s discarded.\n",m.getID());
1418 }
collectIntrons(GffObj & t)1419 void GffLoader::collectIntrons(GffObj& t) {
1420 	// assume t are coming grouped by chromosome and sorted by start coordinate!
1421 	if (intronList.jlst.Count()>0) {
1422          if (t.gseq_id!=intronList.gseq_id ||
1423         		 t.start>=intronList.jlst.Last()->start) {
1424         	intronList.print(f_j);
1425         	intronList.clear();
1426          }
1427          else if (t.start<intronList.last_t_start)
1428         	 GError("Error collectIntrons(%s) called when last_t_start was %d\n",
1429         			 t.getID(), intronList.last_t_start );
1430          //add this transcript's introns
1431 	}
1432     intronList.add(t);
1433 }
1434 
load(GList<GenomicSeqData> & seqdata,GFFCommentParser * gf_parsecomment)1435 void GffLoader::load(GList<GenomicSeqData>& seqdata, GFFCommentParser* gf_parsecomment) {
1436 	if (f==NULL) GError("Error: GffLoader::load() cannot be called before ::openFile()!\n");
1437 	GffReader* gffr=new GffReader(f, this->transcriptsOnly, true); //not only mRNA features, sorted
1438 	clearHeaderLines();
1439 	gffr->showWarnings(verbose);
1440 	//           keepAttrs   mergeCloseExons  noExonAttr
1441 	gffr->gene2Exon(gene2exon);
1442 	if (BEDinput) gffr->isBED(true);
1443 	//if (TLFinput) gffr->isTLF(true);
1444 	gffr->mergeCloseExons(mergeCloseExons);
1445 	gffr->keepAttrs(fullAttributes, gatherExonAttrs, keep_AllExonAttrs);
1446 	gffr->keepGenes(keepGenes);
1447 	gffr->setIgnoreLocus(ignoreLocus);
1448 	gffr->setRefAlphaSorted(this->sortRefsAlpha);
1449 	gffr->procEnsemblID(this->ensemblProc);
1450 	if (keepGff3Comments && gf_parsecomment!=NULL) gffr->setCommentParser(gf_parsecomment);
1451     int outcounter=0;
1452 	if (streamIn) { //this will ignore any clustering options
1453 		GffObj* t=NULL;
1454 		while ((t=gffr->readNext())!=NULL) {
1455 			if (!validateGffRec(t)) {
1456 				delete t;
1457 				continue;
1458 			}
1459 
1460 			if (f_j!=NULL && t->isTranscript() && t->exons.Count()>1)
1461 				collectIntrons(*t);
1462 
1463 			outcounter++;
1464 			if (f_out) {
1465 			  if (fmtTable)
1466 					printTableData(f_out, *t);
1467 			  else //GFF3, GTF, BED, TLF
1468 				t->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
1469 			}
1470 			delete t;
1471 		}
1472 		if (f_j && intronList.jlst.Count()>0) {
1473         	intronList.print(f_j);
1474         	intronList.clear();
1475 		}
1476 		delete gffr;
1477 		return;
1478 	}
1479 
1480 	gffr->readAll();
1481 	GVec<int> pseudoFeatureIds; //feature type: pseudo*
1482 	GVec<int> pseudoAttrIds;  // attribute: [is]pseudo*=true/yes/1
1483 	GVec<int> pseudoTypeAttrIds;  // attribute: *_type=pseudo*
1484 
1485 	if (this->noPseudo) {
1486 		GffNameList& fnames = GffObj::names->feats; //gffr->names->feats;
1487 		for (int i=0;i<fnames.Count();i++) {
1488 			char* n=fnames[i]->name;
1489 			if (startsWith(n, "pseudo")) {
1490 				pseudoFeatureIds.Add(fnames[i]->idx);
1491 			}
1492 		}
1493 		GffNameList& attrnames = GffObj::names->attrs;//gffr->names->attrs;
1494 		for (int i=0;i<attrnames.Count();i++) {
1495 			char* n=attrnames[i]->name;
1496 			if (endsiWith(n, "type")) {
1497 				pseudoTypeAttrIds.Add(attrnames[i]->idx);
1498 			}// else {
1499 			char* p=strifind(n, "pseudo");
1500 			if (p==n || (p==n+2 && tolower(n[0])=='i' && tolower(n[1])=='s') ||
1501 					(p==n+3 && startsiWith(n, "is_")) ) {
1502 				pseudoAttrIds.Add(attrnames[i]->idx);
1503 			}
1504 			//}
1505 		}
1506 	}
1507 
1508 	if (verbose) GMessage("   .. loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars());
1509 	//add to GenomicSeqData, adding to existing loci and identifying intron-chain duplicates
1510 	for (int k=0;k<gffr->gflst.Count();k++) {
1511 		GffObj* m=gffr->gflst[k];
1512 		if (strcmp(m->getFeatureName(), "locus")==0 &&
1513 				m->getAttr("transcripts")!=NULL) {
1514 			continue; //discard locus meta-features
1515 		}
1516 		if (this->noPseudo) {
1517 			bool is_pseudo=false;
1518 			for (int i=0;i<pseudoFeatureIds.Count();++i) {
1519 				if (pseudoFeatureIds[i]==m->ftype_id) {
1520 					is_pseudo=true;
1521 					break;
1522 				}
1523 			}
1524 			if (is_pseudo) {
1525 				if (verbose) warnPseudo(*m);
1526 				continue;
1527 			}
1528 			for (int i=0;i<pseudoAttrIds.Count();++i) {
1529 				char* attrv=NULL;
1530 				if (m->attrs!=NULL) attrv=m->attrs->getAttr(pseudoAttrIds[i]);
1531 				if (attrv!=NULL) {
1532 					char fc=tolower(attrv[0]);
1533 					if (fc=='t' || fc=='y' || fc=='1') {
1534 						is_pseudo=true;
1535 						break;
1536 					}
1537 				}
1538 			}
1539 			if (is_pseudo) {
1540 				if (verbose) warnPseudo(*m);
1541 				continue;
1542 			}
1543 			//  *type=*_pseudogene
1544             //find all attributes ending with _type and have value like: *_pseudogene
1545 			for (int i=0;i<pseudoTypeAttrIds.Count();++i) {
1546 				char* attrv=NULL;
1547 				if (m->attrs!=NULL) attrv=m->attrs->getAttr(pseudoTypeAttrIds[i]);
1548 				if (attrv!=NULL &&
1549 						(startsWith(attrv, "pseudogene") || endsWith(attrv, "_pseudogene")) ) {
1550 					is_pseudo=true;
1551 					break;
1552 				}
1553 			}
1554 			if (is_pseudo) {
1555 				if (verbose) warnPseudo(*m);
1556 				continue;
1557 			}
1558 		} //pseudogene detection requested
1559 		char* rloc=m->getAttr("locus");
1560 		if (rloc!=NULL && startsWith(rloc, "RLOC_")) {
1561 			m->removeAttr("locus", rloc);
1562 		}
1563 		if (forceExons) {
1564 			m->subftype_id=gff_fid_exon;
1565 		}
1566 		//GList<GffObj> gfadd(false,false); -- for gf_validate()?
1567 		if (!validateGffRec(m)) { //this will also apply process_transcript() CDS filters etc.
1568 			continue;
1569 		}
1570 		if (f_j!=NULL && m->isTranscript() && m->exons.Count()>1)
1571 			collectIntrons(*m);
1572 
1573 		m->isUsed(true); //so the gffreader won't destroy it
1574 		GenomicSeqData* gdata=getGSeqData(seqdata, m->gseq_id);
1575 		bool keep=placeGf(m, gdata);
1576 		if (!keep) {
1577 			m->isUsed(false);
1578 			//DEBUG
1579 			//GMessage("Feature %s(%d-%d) is going to be discarded..\n",m->getID(), m->start, m->end);
1580 		}
1581 	} //for each read gffObj
1582 	if (f_j && intronList.jlst.Count()>0) {
1583     	intronList.print(f_j);
1584     	intronList.clear();
1585 	}
1586 	//if (verbose) GMessage("  .. %d records from %s clustered into loci.\n", gffr->gflst.Count(), fname.chars());
1587 	//if (f && f!=stdin) { fclose(f); f=NULL; }
1588 	delete gffr;
1589 }
1590