1 /*
2  *  gtf_tracking.cpp
3  *  cufflinks
4  *
5  *  Created by Cole Trapnell on 9/5/09.
6  *  Copyright 2009 Geo Pertea. All rights reserved.
7  *
8  */
9 
10 #include "gtf_tracking.h"
11 
12 bool gtf_tracking_verbose = false;
13 bool gtf_tracking_largeScale=false; //many input Cufflinks files processed at once by cuffcompare, discard exon attributes
14 
15 int GXConsensus::count=0;
16 
getGSeqName(int gseq_id)17 char* getGSeqName(int gseq_id) {
18  return GffObj::names->gseqs.getName(gseq_id);
19 }
20 
cmpByPtr(const pointer p1,const pointer p2)21 int cmpByPtr(const pointer p1, const pointer p2) {
22   return (p1>p2) ? 1: ((p1==p2)? 0 : -1);
23   }
24 
betterRef(GffObj * a,GffObj * b)25 bool betterRef(GffObj* a, GffObj* b) {
26  if (a==NULL || b==NULL) return (a!=NULL);
27  if (a->exons.Count()!=b->exons.Count()) return (a->exons.Count()>b->exons.Count());
28  if (a->hasCDS() && !b->hasCDS())
29         return true;
30    else {
31      if (b->hasCDS() && !a->hasCDS()) return false;
32      return (a->covlen>b->covlen);
33      }
34  }
35 
is_RefDup(GffObj * m,GList<GffObj> & mrnas,int & dupidx)36 GffObj* is_RefDup(GffObj* m, GList<GffObj>& mrnas, int& dupidx) {
37  //mrnas MUST be sorted by start coordinate
38   int ovlen=0;
39   dupidx=-1;
40   if (mrnas.Count()==0) return NULL;
41   int nidx=qsearch_mrnas(m->end, mrnas);
42   if (nidx==0) return NULL;
43   if (nidx==-1) nidx=mrnas.Count();//all can overlap
44   for (int i=nidx-1;i>=0;i--) {
45       GffObj& omrna=*mrnas[i];
46       if (m->start>omrna.end) {
47            if (m->start-omrna.start>GFF_MAX_EXON) break; //give up already
48            continue;
49            }
50       if (omrna.start>m->end) continue; //this should never be the case if nidx was found correctly
51       //locus overlap here:
52       if (tMatch(*m, omrna, ovlen, false, true)) {
53              dupidx=i;
54              return mrnas[i];
55              }
56       }
57   return NULL;
58 }
59 
60 
intronRedundant(GffObj & ti,GffObj & tj,bool no5share=false)61 bool intronRedundant(GffObj& ti, GffObj&  tj, bool no5share=false) {
62  //two transcripts are "intron redundant" iff one transcript's intron chain
63   // is a sub-chain of the other's
64  int imax=ti.exons.Count()-1;
65  int jmax=tj.exons.Count()-1;
66  if (imax==0 || jmax==0) return false; //don't deal with single-exon transcripts here
67  if (ti.exons[imax]->start<tj.exons[0]->end ||
68      tj.exons[jmax]->start<ti.exons[0]->end )
69          return false; //intron chains do not overlap at all
70 
71  uint eistart=0, eiend=0, ejstart=0, ejend=0; //exon boundaries
72  int i=1; //exon idx to the right of the current intron of ti
73  int j=1; //exon idx to the right of the current intron of tj
74  //find the first intron overlap:
75  while (i<=imax && j<=jmax) {
76     eistart=ti.exons[i-1]->end;
77     eiend=ti.exons[i]->start;
78     ejstart=tj.exons[j-1]->end;
79     ejend=tj.exons[j]->start;
80     if (ejend<eistart) { j++; continue; }
81     if (eiend<ejstart) { i++; continue; }
82     //we found an intron overlap
83     break;
84     }
85  if ((i>1 && j>1) || i>imax || j>jmax) {
86      return false; //either no intron overlaps found at all
87                   //or it's not the first intron for at least one of the transcripts
88      }
89  if (eistart!=ejstart || eiend!=ejend) return false; //not an exact intron match
90  //we have the first matching intron on the left
91 
92  if (j>i) {
93    //i==1, ti's start must not conflict with the previous intron of tj
94    if (ti.start<tj.exons[j-1]->start) return false;
95    //so i's first intron starts AFTER j's first intron
96    // then j must contain i, so i's last intron must end with or before j's last intron
97    if (ti.exons[imax]->start>tj.exons[jmax]->start) return false;
98       //comment out the line above if you just want "intron compatibility" (i.e. extension of intron chains )
99    }
100   else if (i>j) {
101    //j==1, tj's start must not conflict with the previous intron of ti
102    if (tj.start<ti.exons[i-1]->start) return false;
103    //so j's intron chain starts AFTER i's
104    // then i must contain j, so j's last intron must end with or before j's last intron
105    if (tj.exons[jmax]->start>ti.exons[imax]->start) return false;
106       //comment out the line above for just "intronCompatible()" check
107    }
108  //now check if the rest of the introns overlap, in the same sequence
109  int i_start=i; //first (leftmost) matching intron of ti (1-based index)
110  int j_start=j; //first (leftmost) matching intron of tj
111  i++;
112  j++;
113  while (i<=imax && j<=jmax) {
114   if (ti.exons[i-1]->end!=tj.exons[j-1]->end ||
115       ti.exons[i]->start!=tj.exons[j]->start) return false;
116   i++;
117   j++;
118   }
119  i--;
120  j--;
121  if (i==imax && j<jmax) {
122    // tj has more introns to the right, check if ti's end doesn't conflict with the current tj exon boundary
123    if (ti.end>tj.exons[j]->end) return false;
124    }
125  else if (j==jmax && i<imax) {
126    if (tj.end>ti.exons[i]->end) return false;
127    }
128  if (no5share && imax!=jmax) {
129 	 //if they share the 5' intron, they are NOT to be considered redundant
130 	 if (ti.strand=='+') {
131 			 if (i_start==1 && j_start==1) return false;
132 	 }
133 	 else { //reverse strand
134 		 if (i==imax && j==jmax) return false;
135 	 }
136  }
137  return true; //they are intron-redundant
138 }
139 
t_contains(GffObj & a,GffObj & b)140 bool t_contains(GffObj& a, GffObj& b) {
141  //returns true if b's intron chain (or single exon) is included in a
142  if (b.exons.Count()>=a.exons.Count()) return false;
143  if (b.exons.Count()==1) {
144     //check if b is contained in any of a's exons:
145     for (int i=0;i<a.exons.Count();i++) {
146        if (b.start>=a.exons[i]->start && b.end<=a.exons[i]->end) return true;
147        }
148     return false;
149     }
150  if (intronRedundant(a,b)) {
151     //intronRedudant allows b's initial/terminal exons to extend beyond a's boundaries
152     //but we don't allow this kind of behavior here
153     return (b.start>=a.start && b.end<=a.end);
154     }
155   else return false;
156  }
157 
is_Redundant(GffObj * m,GList<GffObj> * mrnas,bool no5share=false)158 int is_Redundant(GffObj*m, GList<GffObj>* mrnas, bool no5share=false) {
159  //first locate the list index of the mrna starting just ABOVE
160  //the end of this mrna
161  if (mrnas->Count()==0) return -1;
162  int nidx=qsearch_mrnas(m->end, *mrnas);
163  if (nidx==0) return -1;
164  if (nidx==-1) nidx=mrnas->Count();//all can overlap
165  for (int i=nidx-1;i>=0;i--) {
166      GffObj& omrna=*mrnas->Get(i);
167      if (m->start>omrna.end) {
168           if (m->start-omrna.start>GFF_MAX_EXON) break; //give up already
169           continue;
170           }
171      if (omrna.start>m->end) continue; //this should never be the case if nidx was found correctly
172 
173      if (intronRedundant(*m, omrna, no5share)) return i;
174      }
175  return -1;
176 }
177 
t_dominates(GffObj * a,GffObj * b)178 bool t_dominates(GffObj* a, GffObj* b) {
179  // for redundant / intron compatible transfrags:
180  // returns true if a "dominates" b, i.e. a has more exons or is longer
181  if (a->exons.Count()==b->exons.Count())
182          return (a->covlen>b->covlen);
183     else return (a->exons.Count()>b->exons.Count());
184 }
185 
betterDupRef(GffObj * a,GffObj * b)186 bool betterDupRef(GffObj* a, GffObj* b) {
187   if (a->exons.Count()!=b->exons.Count())
188     return (a->exons.Count()>b->exons.Count());
189   if (a->hasCDS()!=b->hasCDS())
190      return (a->hasCDS()>b->hasCDS());
191    //for annotation purposes, it's more important to keep the
192    //longer transcript, instead of the one that was loaded first
193   if (a->covlen != b->covlen)
194          return (a->covlen > b->covlen);
195     else return (a->track_id < b->track_id);
196 }
197 
parse_mRNAs(GfList & mrnas,GList<GSeqData> & glstdata,bool is_ref_set,int check_for_dups,int qfidx,bool only_multiexon)198 int parse_mRNAs(GfList& mrnas,
199 				 GList<GSeqData>& glstdata,
200 				 bool is_ref_set,
201 				 int check_for_dups,
202 				 int qfidx, bool only_multiexon) {
203 	int tredundant=0; //redundant transcripts discarded
204 	int total_kept=0;
205 	int total_seen=mrnas.Count();
206 	for (int k=0;k<mrnas.Count();k++) {
207 		GffObj* m=mrnas[k];
208 		int i=-1;
209 		GSeqData f(m->gseq_id);
210 		GSeqData* gdata=NULL;
211 		uint tlen=m->len();
212 		if (m->hasErrors() || (tlen+500>GFF_MAX_LOCUS)) { //should probably report these in a file too..
213 			if (gtf_tracking_verbose)
214 			      GMessage("Warning: transcript %s discarded (structural errors found, length=%d).\n", m->getID(), tlen);
215 			continue;
216 			}
217 		if (only_multiexon && m->exons.Count()<2) {
218 			continue;
219 			}
220 		//GStr feature(m->getFeatureName());
221 		//feature.lower();
222 		//bool gene_or_locus=(feature.endsWith("gene") ||feature.index("loc")>=0);
223 		//if (m->exons.Count()==0 && gene_or_locus) {
224 		if (m->isDiscarded()) {
225 			//discard generic "gene" or "locus" features with no other detailed subfeatures
226 			if (!is_ref_set && gtf_tracking_verbose)
227 			   GMessage("Warning: discarding non-transfrag (GFF generic gene/locus container?) %s\n",m->getID());
228 			continue;
229 			}
230 
231 		if (m->exons.Count()==0) {
232 				if (gtf_tracking_verbose && !is_ref_set)
233 				 GMessage("Warning: %s %s found without exon segments (adding default exon).\n",m->getFeatureName(), m->getID());
234 				m->addExon(m->start,m->end);
235 				}
236 		if (glstdata.Found(&f,i)) gdata=glstdata[i];
237 		else {
238 			gdata=new GSeqData(m->gseq_id);
239 			glstdata.Add(gdata);
240 			}
241 
242 		double fpkm=0;
243 		double cov=0;
244 		double conf_hi=0;
245 		double conf_lo=0;
246 
247 		GList<GffObj>* target_mrnas=NULL;
248 		if (is_ref_set) { //-- ref transcripts
249 		   if (m->strand=='.') {
250 		     //unknown strand - discard from reference set (!)
251 		     continue;
252 		     }
253 		   total_kept++;
254 		   target_mrnas=(m->strand=='+') ? &(gdata->mrnas_f) : &(gdata->mrnas_r);
255 		   if (check_for_dups) {
256 		     //check all gdata->mrnas_r (ref_data) for duplicate ref transcripts
257 		     int rpidx=-1;
258 		     GffObj* rp= is_RefDup(m, *target_mrnas, rpidx);
259 		     if (rp!=NULL) { //duplicate found
260 		      //discard one of them
261 		      //but let's keep the gene_name if present
262 		      //DEBUG:
263 		      //GMessage("Ref duplicates: %s = %s\n", rp->getID(), m->getID());
264 		      tredundant++;
265 		      total_kept--;
266 		      if (betterDupRef(rp, m)) {
267 		           if (rp->getGeneName()==NULL && m->getGeneName()!=NULL) {
268 		                  rp->setGeneName(m->getGeneName());
269 		                  }
270 		           continue;
271 		           }
272 		         else {
273 		           if (m->getGeneName()==NULL && rp->getGeneName()!=NULL) {
274 		                  m->setGeneName(rp->getGeneName());
275 		                  }
276 		           ((CTData*)(rp->uptr))->mrna=NULL;
277 		           rp->isUsed(false);
278 		           target_mrnas->Forget(rpidx);
279 		           target_mrnas->Delete(rpidx);
280 		           }
281 		       }
282 		     } //check for duplicate ref transcripts
283 		   } //ref transcripts
284 		else { //-- query transfrags
285 		   if (m->strand=='+') { target_mrnas = &(gdata->mrnas_f); }
286 		     else if (m->strand=='-') { target_mrnas=&(gdata->mrnas_r); }
287 		        else { m->strand='.'; target_mrnas=&(gdata->umrnas); }
288 		   total_kept++;
289 		   if (check_for_dups) { //check for redundancy
290 		     // check if there is a redundancy between this and another already loaded Cufflinks transcript
291 		     int cidx =  is_Redundant(m, target_mrnas, (check_for_dups>1));
292 		     if (cidx>=0) {
293 		        //always discard the redundant transcript with the fewer exons OR shorter
294 			     tredundant++;
295 		         total_kept--;
296 		    	 if (t_dominates(target_mrnas->Get(cidx),m)) {
297 		            //new transcript is shorter, discard it
298 		        	if (gtf_tracking_verbose) {
299 		        		GMessage(" transfrag %s discarded (made redundant by %s)\n", m->getID(), target_mrnas->Get(cidx)->getID());
300 		        	}
301 		            continue;
302 		        }
303 		        else {
304 		            //discard the older transfrag
305 		        	if (gtf_tracking_verbose) {
306 		        		GMessage(" transfrag %s discarded (made redundant by %s)\n", target_mrnas->Get(cidx)->getID(), m->getID());
307 		        	}
308 		            ((CTData*)(target_mrnas->Get(cidx)->uptr))->mrna=NULL;
309 		            target_mrnas->Get(cidx)->isUsed(false);
310 		            target_mrnas->Forget(cidx);
311 		            target_mrnas->Delete(cidx);
312 		            //the uptr (CTData) pointer will still be kept in gdata->ctdata and deallocated eventually
313 		        }
314 		     }
315 		   }// redundant transfrag check
316 		   if (m->gscore==0.0)
317 		     m->gscore=m->exons[0]->score; //Cufflinks exon score = isoform abundance
318 		   //const char* expr = (gtf_tracking_largeScale) ? m->getAttr("FPKM") : m->exons[0]->getAttr(m->names,"FPKM");
319 		   const char* expr = m->getAttr("FPKM");
320 		   if (expr!=NULL) {
321 		       if (expr[0]=='"') expr++;
322 		       fpkm=strtod(expr, NULL);
323 		       } else { //backward compatibility: read RPKM if FPKM not found
324 		       //expr=(gtf_tracking_largeScale) ? m->getAttr("RPKM") : m->exons[0]->getAttr(m->names,"RPKM");
325 		       expr=m->getAttr("RPKM");
326 		       if (expr!=NULL) {
327 		           if (expr[0]=='"') expr++;
328 		           fpkm=strtod(expr, NULL);
329 		           }
330 		       }
331 		   //const char* scov=(gtf_tracking_largeScale) ? m->getAttr("cov") : m->exons[0]->getAttr(m->names,"cov");
332 		   const char* scov=m->getAttr("cov");
333 		   if (scov!=NULL) {
334 		       if (scov[0]=='"') scov++;
335 		       cov=strtod(scov, NULL);
336 		       }
337 		   //const char* sconf_hi=(gtf_tracking_largeScale) ? m->getAttr("conf_hi") : m->exons[0]->getAttr(m->names,"conf_hi");
338 		   const char* sconf_hi=m->getAttr("conf_hi");
339 		   if (sconf_hi!=NULL){
340 		       if (sconf_hi[0]=='"') sconf_hi++;
341 		       conf_hi=strtod(sconf_hi, NULL);
342 		       }
343 		   //const char* sconf_lo=(gtf_tracking_largeScale) ? m->getAttr("conf_lo") : m->exons[0]->getAttr(m->names,"conf_lo");
344 		   const char* sconf_lo=m->getAttr("conf_lo");
345 		   if (sconf_lo!=NULL) {
346 		       if (sconf_lo[0]=='"') sconf_lo++;
347 		       conf_lo=strtod(sconf_lo, NULL);
348 		       }
349 		   } //Cufflinks transfrags
350 		target_mrnas->Add(m);
351 		m->isUsed(true);
352 		CTData* mdata=new CTData(m);
353 		mdata->qset=qfidx;
354 		gdata->tdata.Add(mdata);
355 		if (!is_ref_set) {
356 		// Cufflinks - attributes parsing
357 		   mdata->FPKM=fpkm;
358 		   mdata->cov=cov;
359 		   mdata->conf_hi=conf_hi;
360 		   mdata->conf_lo=conf_lo;
361 		   }
362 	}//for each mrna read
363 	if (gtf_tracking_verbose) {
364 		if (is_ref_set)
365        GMessage(" Kept %d ref transcripts out of %d\n", total_kept, total_seen);
366 		else
367 	   GMessage(" Kept %d transfrags out of %d\n", total_kept, total_seen);
368 	}
369  //if (mrna_deleted>0)
370  //  mrnas.Pack();
371 
372  //return (is_ref_set ? refdiscarded : tredundant);
373 	return tredundant;
374 }
375 
singleExonTMatch(GffObj & m,GffObj & r,int & ovlen)376 bool singleExonTMatch(GffObj& m, GffObj& r, int& ovlen) {
377  //if (m.exons.Count()>1 || r.exons.Count()>1..)
378  GSeg mseg(m.start, m.end);
379  ovlen=mseg.overlapLen(r.start,r.end);
380  int lmax=GMAX(r.covlen, m.covlen);
381  return ((ovlen >= lmax*0.8) // fuzz matching for single-exon transcripts: 80% of the longer one
382              || (ovlen >= r.covlen*0.9)); // fuzzy reverse-containment - the reference transcript is shorter
383 }
384 
tMatch(GffObj & a,GffObj & b,int & ovlen,bool fuzzunspl,bool contain_only)385 bool tMatch(GffObj& a, GffObj& b, int& ovlen, bool fuzzunspl, bool contain_only) {
386 	//strict intron chain match, or single-exon match
387 	int imax=a.exons.Count()-1;
388 	int jmax=b.exons.Count()-1;
389 	ovlen=0;
390 	if (imax!=jmax) return false; //different number of exons
391 	if (imax==0) { //single-exon mRNAs
392 		if (contain_only) {
393 		   return ((a.start>=b.start && a.end<=b.end) ||
394 		           (b.start>=a.start && b.end<=a.end));
395 		}
396 		if (fuzzunspl) {
397 			//fuzz match for single-exon transfrags:
398 			// it's a match if they overlap at least 80% of longest one
399 			//ovlen=a.exons[0]->overlapLen(b.exons[0]);
400 			//int maxlen=GMAX(a.covlen,b.covlen);
401 			//return (ovlen>=maxlen*0.8);
402 			return (singleExonTMatch(a,b,ovlen));
403 		}
404 	  else {
405 			//only exact match
406 			ovlen=a.covlen;
407 			return (a.exons[0]->start==b.exons[0]->start &&
408 					a.exons[0]->end==b.exons[0]->end);
409 		}
410 	}
411 	if ( a.exons[imax]->start<b.exons[0]->end ||
412 		b.exons[jmax]->start<a.exons[0]->end )
413 		return false; //intron chains do not overlap at all
414 	//check intron overlaps
415 	ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1;
416 	ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start;
417 	for (int i=1;i<=imax;i++) {
418 		if (i<imax) ovlen+=a.exons[i]->len();
419 		if ((a.exons[i-1]->end!=b.exons[i-1]->end) ||
420 			(a.exons[i]->start!=b.exons[i]->start)) {
421 			return false; //intron mismatch
422 		}
423 	}
424 	if (contain_only)
425 		     return ((a.start>=b.start && a.end<=b.end) ||
426 		           (b.start>=a.start && b.end<=a.end));
427 		else return true;
428 }
429 
430 
cluster_mRNAs(GList<GffObj> & mrnas,GList<GLocus> & loci,int qfidx)431 void cluster_mRNAs(GList<GffObj> & mrnas, GList<GLocus> & loci, int qfidx) {
432 	//mrnas sorted by start coordinate
433 	//and so are the loci
434 	//int rdisc=0;
435 		for (int t=0;t<mrnas.Count();t++) {
436 		GArray<int> mrgloci(false);
437 		GffObj* mrna=mrnas[t];
438 		int lfound=0; //count of parent loci
439 		/*for (int l=0;l<loci.Count();l++) {
440 			if (loci[l]->end<mrna->exons.First()->start) continue;
441 			if (loci[l]->start>mrna->exons.Last()->end) break; */
442 		 for (int l=loci.Count()-1;l>=0;l--) {
443 		   if (loci[l]->end<mrna->exons.First()->start) {
444 		       if (mrna->exons.First()->start-loci[l]->start > GFF_MAX_LOCUS) break;
445 		       continue;
446 		       }
447 		   if (loci[l]->start>mrna->exons.Last()->end) continue;
448 			//here we have mrna overlapping loci[l]
449 			if (loci[l]->add_mRNA(mrna)) {
450 				//a parent locus was found
451 				lfound++;
452 				mrgloci.Add(l); //locus indices added here, in decreasing order
453 			}
454 		}//loci loop
455 		//if (lfound<0) continue; //mrna was a ref duplicate, skip it
456 		if (lfound==0) {
457 			//create a locus with only this mRNA
458  			 loci.Add(new GLocus(mrna, qfidx));
459 		    }
460 		 else if (lfound>1) {
461 			//more than one locus found parenting this mRNA, merge loci
462 		     lfound--;
463 			 for (int l=0;l<lfound;l++) {
464 				  int mlidx=mrgloci[l]; //largest indices first, so it's safe to remove
465 				  loci[mrgloci[lfound]]->addMerge(*loci[mlidx], mrna);
466 				  loci.Delete(mlidx);
467 			    }
468 		    }
469 	}//mrnas loop
470 	//if (rdisc>0) mrnas.Pack();
471 	//return rdisc;
472 }
473 
fix_umrnas(GSeqData & seqdata,GSeqData * rdata,FILE * fdis=NULL)474 int fix_umrnas(GSeqData& seqdata, GSeqData* rdata, FILE* fdis=NULL) {
475 	//attempt to find the strand for seqdata.umrnas
476 	//based on a) overlaps with oriented reference mRNAs if present
477 	//         b) overlaps with oriented mRNAs from the same input set
478 	//int incount=seqdata.umrnas.Count();
479 	if (rdata!=NULL) { //we have reference mrnas
480 		for (int i=0;i<rdata->mrnas_f.Count();i++) {
481 			for (int j=0;j<seqdata.umrnas.Count();j++) {
482 				if (rdata->mrnas_f[i]->gseq_id!=seqdata.umrnas[j]->gseq_id) continue;
483 				if (seqdata.umrnas[j]->strand!='.') continue;
484 				uint ustart=seqdata.umrnas[j]->exons.First()->start;
485 				uint uend=seqdata.umrnas[j]->exons.Last()->end;
486 				uint rstart=rdata->mrnas_f[i]->exons.First()->start;
487 				uint rend=rdata->mrnas_f[i]->exons.Last()->end;
488 				if (ustart>rend) break;
489 				if (rstart>uend) continue;
490 				if (rdata->mrnas_f[i]->exonOverlap(ustart,uend)) {
491 					seqdata.umrnas[j]->strand='+';
492 				}
493 				else { //within intron
494 					//if (seqdata.umrnas[j]->ulink==NULL ||
495 					//     seqdata.umrnas[j]->ulink->covlen<rdata->mrnas_f[i]->covlen) {
496 					CTData* mdata=(CTData*)seqdata.umrnas[j]->uptr;
497 					mdata->addOvl('i',rdata->mrnas_f[i]);
498 				}
499 			}
500 		}
501 		for (int i=0;i<rdata->mrnas_r.Count();i++) {
502 			for (int j=0;j<seqdata.umrnas.Count();j++) {
503 				if (seqdata.umrnas[j]->strand!='.') continue;
504 				uint ustart=seqdata.umrnas[j]->exons.First()->start;
505 				uint uend=seqdata.umrnas[j]->exons.Last()->end;
506 				uint rstart=rdata->mrnas_r[i]->exons.First()->start;
507 				uint rend=rdata->mrnas_r[i]->exons.Last()->end;
508 				if (ustart>rend) break;
509 				if (rstart>uend) continue;
510 				if (rdata->mrnas_r[i]->exonOverlap(ustart,uend)) {
511 					seqdata.umrnas[j]->strand='-';
512 				}
513 				else { //within intron
514 					CTData* mdata=(CTData*)seqdata.umrnas[j]->uptr;
515 					mdata->addOvl('i',rdata->mrnas_r[i]);
516 				}
517 
518 			}
519 		}
520 	}//we have reference transcripts
521 	//---- now compare to other qry transcripts
522 	for (int i=0;i<seqdata.mrnas_f.Count();i++) {
523 		for (int j=0;j<seqdata.umrnas.Count();j++) {
524 			if (seqdata.umrnas[j]->strand!='.') continue;
525 			uint ustart=seqdata.umrnas[j]->exons.First()->start;
526 			uint uend=seqdata.umrnas[j]->exons.Last()->end;
527 			uint rstart=seqdata.mrnas_f[i]->exons.First()->start;
528 			uint rend=seqdata.mrnas_f[i]->exons.Last()->end;
529 			if (ustart>rend) break;
530 			if (rstart>uend) continue;
531 			if (seqdata.mrnas_f[i]->exonOverlap(ustart,uend)) {
532 				seqdata.umrnas[j]->strand='+';
533 			}
534 		}
535 	}
536 	for (int i=0;i<seqdata.mrnas_r.Count();i++) {
537 		for (int j=0;j<seqdata.umrnas.Count();j++) {
538 			if (seqdata.umrnas[j]->strand!='.') continue;
539 			uint ustart=seqdata.umrnas[j]->exons.First()->start;
540 			uint uend=seqdata.umrnas[j]->exons.Last()->end;
541 			uint rstart=seqdata.mrnas_r[i]->exons.First()->start;
542 			uint rend=seqdata.mrnas_r[i]->exons.Last()->end;
543 			if (ustart>rend) break;
544 			if (rstart>uend) continue;
545 			//overlap
546 			if (seqdata.mrnas_r[i]->exonOverlap(ustart,uend)) {
547 				seqdata.umrnas[j]->strand='-';
548 			}
549 		}
550     }
551 	int fcount=0;
552 	int fixed=0;
553 	for (int i=0;i<seqdata.umrnas.Count();i++) {
554 		if (seqdata.umrnas[i]->strand=='+') {
555 			seqdata.mrnas_f.Add(seqdata.umrnas[i]);
556 			fixed++;
557 			seqdata.umrnas.Forget(i);
558 		}
559 		else if (seqdata.umrnas[i]->strand=='-') {
560 		    seqdata.mrnas_r.Add(seqdata.umrnas[i]);
561 		    seqdata.umrnas.Forget(i);
562 		    fixed++;
563 		}
564 		else {  //discard mRNAs not settled
565 			//seqdata.umrnas[i]->strand='.'; ?
566 			if (fdis!=NULL) {
567 				seqdata.umrnas[i]->printGtf(fdis);
568 				}
569 			fcount++;
570 		}
571 	}
572 
573 	seqdata.umrnas.Pack();
574 	//if (gtf_tracking_verbose) {
575 	//	GMessage(" %d out of %d (%d left, %d) unoriented transfrags were assigned a strand based on overlaps.\n", fixed, incount, seqdata.umrnas.Count(), fcount);
576 	//}
577 	return fixed;
578 }
579 
580 //retrieve ref_data for a specific genomic sequence
getRefData(int gid,GList<GSeqData> & ref_data)581 GSeqData* getRefData(int gid, GList<GSeqData>& ref_data) {
582 	int ri=-1;
583 	GSeqData f(gid);
584 	GSeqData* r=NULL;
585 	if (ref_data.Found(&f,ri))
586 		r=ref_data[ri];
587 	return r;
588 }
589 
read_transcripts(FILE * f,GList<GSeqData> & seqdata,boost::crc_32_type & crc_result,bool keepAttrs)590 void read_transcripts(FILE* f, GList<GSeqData>& seqdata,
591 #ifdef CUFFLINKS
592   boost::crc_32_type& crc_result,
593 #endif
594    bool keepAttrs) {
595 	rewind(f);
596 	GffReader gffr(f, true); //loading only recognizable transcript features
597 	gffr.showWarnings(gtf_tracking_verbose);
598 
599 	//          keepAttrs    mergeCloseExons   noExonAttrs
600 	gffr.readAll(keepAttrs,          true,        true);
601 #ifdef CUFFLINKS
602      crc_result = gffr.current_crc_result();
603 #endif
604 	//                               is_ref?    check_for_dups,
605 	parse_mRNAs(gffr.gflst, seqdata, false,       0);
606 }
607 
cmpGSeqByName(const pointer p1,const pointer p2)608 int cmpGSeqByName(const pointer p1, const pointer p2) {
609  return strcmp(((GSeqData*)p1)->gseq_name, ((GSeqData*)p2)->gseq_name);
610 }
611 
sort_GSeqs_byName(GList<GSeqData> & seqdata)612 void sort_GSeqs_byName(GList<GSeqData>& seqdata) {
613   seqdata.setSorted(&cmpGSeqByName);
614 }
615 
read_mRNAs(FILE * f,GList<GSeqData> & seqdata,GList<GSeqData> * ref_data,int check_for_dups,int qfidx,const char * fname,bool only_multiexon)616 void read_mRNAs(FILE* f, GList<GSeqData>& seqdata, GList<GSeqData>* ref_data,
617 	         int check_for_dups, int qfidx, const char* fname, bool only_multiexon) {
618 	//>>>>> read all transcripts/features from a GTF/GFF3 file
619 	//int imrna_counter=0;
620 #ifdef HEAPROFILE
621     if (IsHeapProfilerRunning())
622       HeapProfilerDump("00");
623 #endif
624 	int loci_counter=0;
625 	if (ref_data==NULL) ref_data=&seqdata;
626 	bool isRefData=(&seqdata==ref_data);
627 	                          //(f, transcripts_only)
628 	GffReader* gffr=new GffReader(f, true); //load only transcript annotations
629 	gffr->showWarnings(gtf_tracking_verbose);
630 	//            keepAttrs   mergeCloseExons   noExonAttrs
631 	gffr->readAll(!isRefData,          true,        isRefData || gtf_tracking_largeScale);
632 	//so it will read exon attributes only for low number of Cufflinks files
633 #ifdef HEAPROFILE
634     if (IsHeapProfilerRunning())
635       HeapProfilerDump("post_readAll");
636 #endif
637 
638 	int d=parse_mRNAs(gffr->gflst, seqdata, isRefData, check_for_dups, qfidx,only_multiexon);
639 #ifdef HEAPROFILE
640     if (IsHeapProfilerRunning())
641       HeapProfilerDump("post_parse_mRNAs");
642 #endif
643 	if (gtf_tracking_verbose && d>0) {
644 	  if (isRefData) GMessage(" %d duplicate reference transcripts discarded.\n",d);
645 	            else GMessage(" %d redundant cufflinks transfrags discarded.\n",d);
646 	  }
647 	//imrna_counter=gffr->mrnas.Count();
648 	delete gffr; //free the extra memory and unused GffObjs
649 #ifdef HEAPROFILE
650     if (IsHeapProfilerRunning())
651       HeapProfilerDump("post_del_gffr");
652 #endif
653 
654 	//for each genomic sequence, cluster transcripts
655 	int oriented_by_overlap=0;
656 	int initial_unoriented=0;
657 	int final_unoriented=0;
658 	GStr bname(fname);
659 	GStr s;
660 	if (!bname.is_empty()) {
661 		int di=bname.rindex('.');
662 		if (di>0) bname.cut(di);
663 		int p=bname.rindex('/');
664 		if (p<0) p=bname.rindex('\\');
665 		if (p>=0) bname.remove(0,p);
666 	}
667 	FILE* fdis=NULL;
668 	FILE* frloci=NULL;
669 
670 	for (int g=0;g<seqdata.Count();g++) {
671 		//find the corresponding refseqdata with the same gseq_id
672 		int gseq_id=seqdata[g]->get_gseqid();
673 		if (!isRefData) { //cufflinks data, find corresponding ref data
674 			GSeqData* rdata=getRefData(gseq_id, *ref_data);
675 			initial_unoriented+=seqdata[g]->umrnas.Count();
676 			if (seqdata[g]->umrnas.Count()>0) {
677 			    oriented_by_overlap+=fix_umrnas(*seqdata[g], rdata, fdis);
678 			    final_unoriented+=seqdata[g]->umrnas.Count();
679 			    }
680 			}
681 		//>>>>> group mRNAs into locus-clusters (based on exon overlap)
682 		cluster_mRNAs(seqdata[g]->mrnas_f, seqdata[g]->loci_f, qfidx);
683 		cluster_mRNAs(seqdata[g]->mrnas_r, seqdata[g]->loci_r, qfidx);
684 		if (!isRefData) {
685 			cluster_mRNAs(seqdata[g]->umrnas, seqdata[g]->nloci_u, qfidx);
686 			}
687 		loci_counter+=seqdata[g]->loci_f.Count();
688 		loci_counter+=seqdata[g]->loci_r.Count();
689 //		if (refData) {
690 //			if (frloci==NULL) {
691 //				s=bname;
692 //				s.append(".loci.lst");
693 //				frloci=fopen(s.chars(), "w");
694 //			}
695 //			writeLoci(frloci, seqdata[g]->loci_f);
696 //			writeLoci(frloci, seqdata[g]->loci_r);
697 //		}//write ref loci
698 	}//for each genomic sequence
699 	if (fdis!=NULL) fclose(fdis);
700 	if (frloci!=NULL) fclose(frloci);
701 	if (initial_unoriented || final_unoriented) {
702 	  if (gtf_tracking_verbose) GMessage(" Found %d transfrags with undetermined strand (%d out of initial %d were fixed by overlaps)\n",
703 			    final_unoriented, oriented_by_overlap, initial_unoriented);
704 	}
705 	//if (fdis!=NULL) remove(s.chars()); remove 0-length file
706 #ifdef HEAPROFILE
707     if (IsHeapProfilerRunning())
708       HeapProfilerDump("post_cluster");
709 #endif
710 }
711 
qsearch_mrnas(uint x,GList<GffObj> & mrnas)712 int qsearch_mrnas(uint x, GList<GffObj>& mrnas) {
713   //binary search
714   //do the simplest tests first:
715   if (mrnas[0]->start>x) return 0;
716   if (mrnas.Last()->start<x) return -1;
717   uint istart=0;
718   int i=0;
719   int idx=-1;
720   int maxh=mrnas.Count()-1;
721   int l=0;
722   int h = maxh;
723   while (l <= h) {
724      i = (l+h)>>1;
725      istart=mrnas[i]->start;
726      if (istart < x)  l = i + 1;
727           else {
728              if (istart == x) { //found matching coordinate here
729                   idx=i;
730                   while (idx<=maxh && mrnas[idx]->start==x) {
731                      idx++;
732                      }
733                   return (idx>maxh) ? -1 : idx;
734                   }
735              h = i - 1;
736              }
737      } //while
738  idx = l;
739  while (idx<=maxh && mrnas[idx]->start<=x) {
740     idx++;
741     }
742  return (idx>maxh) ? -1 : idx;
743 }
744 
qsearch_loci(uint x,GList<GLocus> & loci)745 int qsearch_loci(uint x, GList<GLocus>& loci) {
746  // same as above, but for GSeg lists
747   //binary search
748   //do the simplest tests first:
749   if (loci[0]->start>x) return 0;
750   if (loci.Last()->start<x) return -1;
751   uint istart=0;
752   int i=0;
753   int idx=-1;
754   int maxh=loci.Count()-1;
755   int l=0;
756   int h = maxh;
757   while (l <= h) {
758      i = (l + h) >> 1;
759      istart=loci[i]->start;
760      if (istart < x) l=i+1;
761                 else {
762                    if (istart == x) { //found matching coordinate here
763                         idx=i;
764                         while (idx<=maxh && loci[idx]->start==x) {
765                            idx++;
766                            }
767                         return (idx>maxh) ? -1 : idx;
768                         }
769                    h=i-1;
770                    }
771      } //while
772  idx = l;
773  while (idx<=maxh && loci[idx]->start<=x) {
774     idx++;
775     }
776  return (idx>maxh) ? -1 : idx;
777 }
778 
779