1 /*
2 * gtf_tracking.cpp
3 * cufflinks
4 *
5 * Created by Cole Trapnell on 9/5/09.
6 * Copyright 2009 Geo Pertea. All rights reserved.
7 *
8 */
9
10 #include "gtf_tracking.h"
11
12 bool gtf_tracking_verbose = false;
13 bool gtf_tracking_largeScale=false; //many input Cufflinks files processed at once by cuffcompare, discard exon attributes
14
15 int GXConsensus::count=0;
16
getGSeqName(int gseq_id)17 char* getGSeqName(int gseq_id) {
18 return GffObj::names->gseqs.getName(gseq_id);
19 }
20
cmpByPtr(const pointer p1,const pointer p2)21 int cmpByPtr(const pointer p1, const pointer p2) {
22 return (p1>p2) ? 1: ((p1==p2)? 0 : -1);
23 }
24
betterRef(GffObj * a,GffObj * b)25 bool betterRef(GffObj* a, GffObj* b) {
26 if (a==NULL || b==NULL) return (a!=NULL);
27 if (a->exons.Count()!=b->exons.Count()) return (a->exons.Count()>b->exons.Count());
28 if (a->hasCDS() && !b->hasCDS())
29 return true;
30 else {
31 if (b->hasCDS() && !a->hasCDS()) return false;
32 return (a->covlen>b->covlen);
33 }
34 }
35
is_RefDup(GffObj * m,GList<GffObj> & mrnas,int & dupidx)36 GffObj* is_RefDup(GffObj* m, GList<GffObj>& mrnas, int& dupidx) {
37 //mrnas MUST be sorted by start coordinate
38 int ovlen=0;
39 dupidx=-1;
40 if (mrnas.Count()==0) return NULL;
41 int nidx=qsearch_mrnas(m->end, mrnas);
42 if (nidx==0) return NULL;
43 if (nidx==-1) nidx=mrnas.Count();//all can overlap
44 for (int i=nidx-1;i>=0;i--) {
45 GffObj& omrna=*mrnas[i];
46 if (m->start>omrna.end) {
47 if (m->start-omrna.start>GFF_MAX_EXON) break; //give up already
48 continue;
49 }
50 if (omrna.start>m->end) continue; //this should never be the case if nidx was found correctly
51 //locus overlap here:
52 if (tMatch(*m, omrna, ovlen, false, true)) {
53 dupidx=i;
54 return mrnas[i];
55 }
56 }
57 return NULL;
58 }
59
60
intronRedundant(GffObj & ti,GffObj & tj,bool no5share=false)61 bool intronRedundant(GffObj& ti, GffObj& tj, bool no5share=false) {
62 //two transcripts are "intron redundant" iff one transcript's intron chain
63 // is a sub-chain of the other's
64 int imax=ti.exons.Count()-1;
65 int jmax=tj.exons.Count()-1;
66 if (imax==0 || jmax==0) return false; //don't deal with single-exon transcripts here
67 if (ti.exons[imax]->start<tj.exons[0]->end ||
68 tj.exons[jmax]->start<ti.exons[0]->end )
69 return false; //intron chains do not overlap at all
70
71 uint eistart=0, eiend=0, ejstart=0, ejend=0; //exon boundaries
72 int i=1; //exon idx to the right of the current intron of ti
73 int j=1; //exon idx to the right of the current intron of tj
74 //find the first intron overlap:
75 while (i<=imax && j<=jmax) {
76 eistart=ti.exons[i-1]->end;
77 eiend=ti.exons[i]->start;
78 ejstart=tj.exons[j-1]->end;
79 ejend=tj.exons[j]->start;
80 if (ejend<eistart) { j++; continue; }
81 if (eiend<ejstart) { i++; continue; }
82 //we found an intron overlap
83 break;
84 }
85 if ((i>1 && j>1) || i>imax || j>jmax) {
86 return false; //either no intron overlaps found at all
87 //or it's not the first intron for at least one of the transcripts
88 }
89 if (eistart!=ejstart || eiend!=ejend) return false; //not an exact intron match
90 //we have the first matching intron on the left
91
92 if (j>i) {
93 //i==1, ti's start must not conflict with the previous intron of tj
94 if (ti.start<tj.exons[j-1]->start) return false;
95 //so i's first intron starts AFTER j's first intron
96 // then j must contain i, so i's last intron must end with or before j's last intron
97 if (ti.exons[imax]->start>tj.exons[jmax]->start) return false;
98 //comment out the line above if you just want "intron compatibility" (i.e. extension of intron chains )
99 }
100 else if (i>j) {
101 //j==1, tj's start must not conflict with the previous intron of ti
102 if (tj.start<ti.exons[i-1]->start) return false;
103 //so j's intron chain starts AFTER i's
104 // then i must contain j, so j's last intron must end with or before j's last intron
105 if (tj.exons[jmax]->start>ti.exons[imax]->start) return false;
106 //comment out the line above for just "intronCompatible()" check
107 }
108 //now check if the rest of the introns overlap, in the same sequence
109 int i_start=i; //first (leftmost) matching intron of ti (1-based index)
110 int j_start=j; //first (leftmost) matching intron of tj
111 i++;
112 j++;
113 while (i<=imax && j<=jmax) {
114 if (ti.exons[i-1]->end!=tj.exons[j-1]->end ||
115 ti.exons[i]->start!=tj.exons[j]->start) return false;
116 i++;
117 j++;
118 }
119 i--;
120 j--;
121 if (i==imax && j<jmax) {
122 // tj has more introns to the right, check if ti's end doesn't conflict with the current tj exon boundary
123 if (ti.end>tj.exons[j]->end) return false;
124 }
125 else if (j==jmax && i<imax) {
126 if (tj.end>ti.exons[i]->end) return false;
127 }
128 if (no5share && imax!=jmax) {
129 //if they share the 5' intron, they are NOT to be considered redundant
130 if (ti.strand=='+') {
131 if (i_start==1 && j_start==1) return false;
132 }
133 else { //reverse strand
134 if (i==imax && j==jmax) return false;
135 }
136 }
137 return true; //they are intron-redundant
138 }
139
t_contains(GffObj & a,GffObj & b)140 bool t_contains(GffObj& a, GffObj& b) {
141 //returns true if b's intron chain (or single exon) is included in a
142 if (b.exons.Count()>=a.exons.Count()) return false;
143 if (b.exons.Count()==1) {
144 //check if b is contained in any of a's exons:
145 for (int i=0;i<a.exons.Count();i++) {
146 if (b.start>=a.exons[i]->start && b.end<=a.exons[i]->end) return true;
147 }
148 return false;
149 }
150 if (intronRedundant(a,b)) {
151 //intronRedudant allows b's initial/terminal exons to extend beyond a's boundaries
152 //but we don't allow this kind of behavior here
153 return (b.start>=a.start && b.end<=a.end);
154 }
155 else return false;
156 }
157
is_Redundant(GffObj * m,GList<GffObj> * mrnas,bool no5share=false)158 int is_Redundant(GffObj*m, GList<GffObj>* mrnas, bool no5share=false) {
159 //first locate the list index of the mrna starting just ABOVE
160 //the end of this mrna
161 if (mrnas->Count()==0) return -1;
162 int nidx=qsearch_mrnas(m->end, *mrnas);
163 if (nidx==0) return -1;
164 if (nidx==-1) nidx=mrnas->Count();//all can overlap
165 for (int i=nidx-1;i>=0;i--) {
166 GffObj& omrna=*mrnas->Get(i);
167 if (m->start>omrna.end) {
168 if (m->start-omrna.start>GFF_MAX_EXON) break; //give up already
169 continue;
170 }
171 if (omrna.start>m->end) continue; //this should never be the case if nidx was found correctly
172
173 if (intronRedundant(*m, omrna, no5share)) return i;
174 }
175 return -1;
176 }
177
t_dominates(GffObj * a,GffObj * b)178 bool t_dominates(GffObj* a, GffObj* b) {
179 // for redundant / intron compatible transfrags:
180 // returns true if a "dominates" b, i.e. a has more exons or is longer
181 if (a->exons.Count()==b->exons.Count())
182 return (a->covlen>b->covlen);
183 else return (a->exons.Count()>b->exons.Count());
184 }
185
betterDupRef(GffObj * a,GffObj * b)186 bool betterDupRef(GffObj* a, GffObj* b) {
187 if (a->exons.Count()!=b->exons.Count())
188 return (a->exons.Count()>b->exons.Count());
189 if (a->hasCDS()!=b->hasCDS())
190 return (a->hasCDS()>b->hasCDS());
191 //for annotation purposes, it's more important to keep the
192 //longer transcript, instead of the one that was loaded first
193 if (a->covlen != b->covlen)
194 return (a->covlen > b->covlen);
195 else return (a->track_id < b->track_id);
196 }
197
parse_mRNAs(GfList & mrnas,GList<GSeqData> & glstdata,bool is_ref_set,int check_for_dups,int qfidx,bool only_multiexon)198 int parse_mRNAs(GfList& mrnas,
199 GList<GSeqData>& glstdata,
200 bool is_ref_set,
201 int check_for_dups,
202 int qfidx, bool only_multiexon) {
203 int tredundant=0; //redundant transcripts discarded
204 int total_kept=0;
205 int total_seen=mrnas.Count();
206 for (int k=0;k<mrnas.Count();k++) {
207 GffObj* m=mrnas[k];
208 int i=-1;
209 GSeqData f(m->gseq_id);
210 GSeqData* gdata=NULL;
211 uint tlen=m->len();
212 if (m->hasErrors() || (tlen+500>GFF_MAX_LOCUS)) { //should probably report these in a file too..
213 if (gtf_tracking_verbose)
214 GMessage("Warning: transcript %s discarded (structural errors found, length=%d).\n", m->getID(), tlen);
215 continue;
216 }
217 if (only_multiexon && m->exons.Count()<2) {
218 continue;
219 }
220 //GStr feature(m->getFeatureName());
221 //feature.lower();
222 //bool gene_or_locus=(feature.endsWith("gene") ||feature.index("loc")>=0);
223 //if (m->exons.Count()==0 && gene_or_locus) {
224 if (m->isDiscarded()) {
225 //discard generic "gene" or "locus" features with no other detailed subfeatures
226 if (!is_ref_set && gtf_tracking_verbose)
227 GMessage("Warning: discarding non-transfrag (GFF generic gene/locus container?) %s\n",m->getID());
228 continue;
229 }
230
231 if (m->exons.Count()==0) {
232 if (gtf_tracking_verbose && !is_ref_set)
233 GMessage("Warning: %s %s found without exon segments (adding default exon).\n",m->getFeatureName(), m->getID());
234 m->addExon(m->start,m->end);
235 }
236 if (glstdata.Found(&f,i)) gdata=glstdata[i];
237 else {
238 gdata=new GSeqData(m->gseq_id);
239 glstdata.Add(gdata);
240 }
241
242 double fpkm=0;
243 double cov=0;
244 double conf_hi=0;
245 double conf_lo=0;
246
247 GList<GffObj>* target_mrnas=NULL;
248 if (is_ref_set) { //-- ref transcripts
249 if (m->strand=='.') {
250 //unknown strand - discard from reference set (!)
251 continue;
252 }
253 total_kept++;
254 target_mrnas=(m->strand=='+') ? &(gdata->mrnas_f) : &(gdata->mrnas_r);
255 if (check_for_dups) {
256 //check all gdata->mrnas_r (ref_data) for duplicate ref transcripts
257 int rpidx=-1;
258 GffObj* rp= is_RefDup(m, *target_mrnas, rpidx);
259 if (rp!=NULL) { //duplicate found
260 //discard one of them
261 //but let's keep the gene_name if present
262 //DEBUG:
263 //GMessage("Ref duplicates: %s = %s\n", rp->getID(), m->getID());
264 tredundant++;
265 total_kept--;
266 if (betterDupRef(rp, m)) {
267 if (rp->getGeneName()==NULL && m->getGeneName()!=NULL) {
268 rp->setGeneName(m->getGeneName());
269 }
270 continue;
271 }
272 else {
273 if (m->getGeneName()==NULL && rp->getGeneName()!=NULL) {
274 m->setGeneName(rp->getGeneName());
275 }
276 ((CTData*)(rp->uptr))->mrna=NULL;
277 rp->isUsed(false);
278 target_mrnas->Forget(rpidx);
279 target_mrnas->Delete(rpidx);
280 }
281 }
282 } //check for duplicate ref transcripts
283 } //ref transcripts
284 else { //-- query transfrags
285 if (m->strand=='+') { target_mrnas = &(gdata->mrnas_f); }
286 else if (m->strand=='-') { target_mrnas=&(gdata->mrnas_r); }
287 else { m->strand='.'; target_mrnas=&(gdata->umrnas); }
288 total_kept++;
289 if (check_for_dups) { //check for redundancy
290 // check if there is a redundancy between this and another already loaded Cufflinks transcript
291 int cidx = is_Redundant(m, target_mrnas, (check_for_dups>1));
292 if (cidx>=0) {
293 //always discard the redundant transcript with the fewer exons OR shorter
294 tredundant++;
295 total_kept--;
296 if (t_dominates(target_mrnas->Get(cidx),m)) {
297 //new transcript is shorter, discard it
298 if (gtf_tracking_verbose) {
299 GMessage(" transfrag %s discarded (made redundant by %s)\n", m->getID(), target_mrnas->Get(cidx)->getID());
300 }
301 continue;
302 }
303 else {
304 //discard the older transfrag
305 if (gtf_tracking_verbose) {
306 GMessage(" transfrag %s discarded (made redundant by %s)\n", target_mrnas->Get(cidx)->getID(), m->getID());
307 }
308 ((CTData*)(target_mrnas->Get(cidx)->uptr))->mrna=NULL;
309 target_mrnas->Get(cidx)->isUsed(false);
310 target_mrnas->Forget(cidx);
311 target_mrnas->Delete(cidx);
312 //the uptr (CTData) pointer will still be kept in gdata->ctdata and deallocated eventually
313 }
314 }
315 }// redundant transfrag check
316 if (m->gscore==0.0)
317 m->gscore=m->exons[0]->score; //Cufflinks exon score = isoform abundance
318 //const char* expr = (gtf_tracking_largeScale) ? m->getAttr("FPKM") : m->exons[0]->getAttr(m->names,"FPKM");
319 const char* expr = m->getAttr("FPKM");
320 if (expr!=NULL) {
321 if (expr[0]=='"') expr++;
322 fpkm=strtod(expr, NULL);
323 } else { //backward compatibility: read RPKM if FPKM not found
324 //expr=(gtf_tracking_largeScale) ? m->getAttr("RPKM") : m->exons[0]->getAttr(m->names,"RPKM");
325 expr=m->getAttr("RPKM");
326 if (expr!=NULL) {
327 if (expr[0]=='"') expr++;
328 fpkm=strtod(expr, NULL);
329 }
330 }
331 //const char* scov=(gtf_tracking_largeScale) ? m->getAttr("cov") : m->exons[0]->getAttr(m->names,"cov");
332 const char* scov=m->getAttr("cov");
333 if (scov!=NULL) {
334 if (scov[0]=='"') scov++;
335 cov=strtod(scov, NULL);
336 }
337 //const char* sconf_hi=(gtf_tracking_largeScale) ? m->getAttr("conf_hi") : m->exons[0]->getAttr(m->names,"conf_hi");
338 const char* sconf_hi=m->getAttr("conf_hi");
339 if (sconf_hi!=NULL){
340 if (sconf_hi[0]=='"') sconf_hi++;
341 conf_hi=strtod(sconf_hi, NULL);
342 }
343 //const char* sconf_lo=(gtf_tracking_largeScale) ? m->getAttr("conf_lo") : m->exons[0]->getAttr(m->names,"conf_lo");
344 const char* sconf_lo=m->getAttr("conf_lo");
345 if (sconf_lo!=NULL) {
346 if (sconf_lo[0]=='"') sconf_lo++;
347 conf_lo=strtod(sconf_lo, NULL);
348 }
349 } //Cufflinks transfrags
350 target_mrnas->Add(m);
351 m->isUsed(true);
352 CTData* mdata=new CTData(m);
353 mdata->qset=qfidx;
354 gdata->tdata.Add(mdata);
355 if (!is_ref_set) {
356 // Cufflinks - attributes parsing
357 mdata->FPKM=fpkm;
358 mdata->cov=cov;
359 mdata->conf_hi=conf_hi;
360 mdata->conf_lo=conf_lo;
361 }
362 }//for each mrna read
363 if (gtf_tracking_verbose) {
364 if (is_ref_set)
365 GMessage(" Kept %d ref transcripts out of %d\n", total_kept, total_seen);
366 else
367 GMessage(" Kept %d transfrags out of %d\n", total_kept, total_seen);
368 }
369 //if (mrna_deleted>0)
370 // mrnas.Pack();
371
372 //return (is_ref_set ? refdiscarded : tredundant);
373 return tredundant;
374 }
375
singleExonTMatch(GffObj & m,GffObj & r,int & ovlen)376 bool singleExonTMatch(GffObj& m, GffObj& r, int& ovlen) {
377 //if (m.exons.Count()>1 || r.exons.Count()>1..)
378 GSeg mseg(m.start, m.end);
379 ovlen=mseg.overlapLen(r.start,r.end);
380 int lmax=GMAX(r.covlen, m.covlen);
381 return ((ovlen >= lmax*0.8) // fuzz matching for single-exon transcripts: 80% of the longer one
382 || (ovlen >= r.covlen*0.9)); // fuzzy reverse-containment - the reference transcript is shorter
383 }
384
tMatch(GffObj & a,GffObj & b,int & ovlen,bool fuzzunspl,bool contain_only)385 bool tMatch(GffObj& a, GffObj& b, int& ovlen, bool fuzzunspl, bool contain_only) {
386 //strict intron chain match, or single-exon match
387 int imax=a.exons.Count()-1;
388 int jmax=b.exons.Count()-1;
389 ovlen=0;
390 if (imax!=jmax) return false; //different number of exons
391 if (imax==0) { //single-exon mRNAs
392 if (contain_only) {
393 return ((a.start>=b.start && a.end<=b.end) ||
394 (b.start>=a.start && b.end<=a.end));
395 }
396 if (fuzzunspl) {
397 //fuzz match for single-exon transfrags:
398 // it's a match if they overlap at least 80% of longest one
399 //ovlen=a.exons[0]->overlapLen(b.exons[0]);
400 //int maxlen=GMAX(a.covlen,b.covlen);
401 //return (ovlen>=maxlen*0.8);
402 return (singleExonTMatch(a,b,ovlen));
403 }
404 else {
405 //only exact match
406 ovlen=a.covlen;
407 return (a.exons[0]->start==b.exons[0]->start &&
408 a.exons[0]->end==b.exons[0]->end);
409 }
410 }
411 if ( a.exons[imax]->start<b.exons[0]->end ||
412 b.exons[jmax]->start<a.exons[0]->end )
413 return false; //intron chains do not overlap at all
414 //check intron overlaps
415 ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1;
416 ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start;
417 for (int i=1;i<=imax;i++) {
418 if (i<imax) ovlen+=a.exons[i]->len();
419 if ((a.exons[i-1]->end!=b.exons[i-1]->end) ||
420 (a.exons[i]->start!=b.exons[i]->start)) {
421 return false; //intron mismatch
422 }
423 }
424 if (contain_only)
425 return ((a.start>=b.start && a.end<=b.end) ||
426 (b.start>=a.start && b.end<=a.end));
427 else return true;
428 }
429
430
cluster_mRNAs(GList<GffObj> & mrnas,GList<GLocus> & loci,int qfidx)431 void cluster_mRNAs(GList<GffObj> & mrnas, GList<GLocus> & loci, int qfidx) {
432 //mrnas sorted by start coordinate
433 //and so are the loci
434 //int rdisc=0;
435 for (int t=0;t<mrnas.Count();t++) {
436 GArray<int> mrgloci(false);
437 GffObj* mrna=mrnas[t];
438 int lfound=0; //count of parent loci
439 /*for (int l=0;l<loci.Count();l++) {
440 if (loci[l]->end<mrna->exons.First()->start) continue;
441 if (loci[l]->start>mrna->exons.Last()->end) break; */
442 for (int l=loci.Count()-1;l>=0;l--) {
443 if (loci[l]->end<mrna->exons.First()->start) {
444 if (mrna->exons.First()->start-loci[l]->start > GFF_MAX_LOCUS) break;
445 continue;
446 }
447 if (loci[l]->start>mrna->exons.Last()->end) continue;
448 //here we have mrna overlapping loci[l]
449 if (loci[l]->add_mRNA(mrna)) {
450 //a parent locus was found
451 lfound++;
452 mrgloci.Add(l); //locus indices added here, in decreasing order
453 }
454 }//loci loop
455 //if (lfound<0) continue; //mrna was a ref duplicate, skip it
456 if (lfound==0) {
457 //create a locus with only this mRNA
458 loci.Add(new GLocus(mrna, qfidx));
459 }
460 else if (lfound>1) {
461 //more than one locus found parenting this mRNA, merge loci
462 lfound--;
463 for (int l=0;l<lfound;l++) {
464 int mlidx=mrgloci[l]; //largest indices first, so it's safe to remove
465 loci[mrgloci[lfound]]->addMerge(*loci[mlidx], mrna);
466 loci.Delete(mlidx);
467 }
468 }
469 }//mrnas loop
470 //if (rdisc>0) mrnas.Pack();
471 //return rdisc;
472 }
473
fix_umrnas(GSeqData & seqdata,GSeqData * rdata,FILE * fdis=NULL)474 int fix_umrnas(GSeqData& seqdata, GSeqData* rdata, FILE* fdis=NULL) {
475 //attempt to find the strand for seqdata.umrnas
476 //based on a) overlaps with oriented reference mRNAs if present
477 // b) overlaps with oriented mRNAs from the same input set
478 //int incount=seqdata.umrnas.Count();
479 if (rdata!=NULL) { //we have reference mrnas
480 for (int i=0;i<rdata->mrnas_f.Count();i++) {
481 for (int j=0;j<seqdata.umrnas.Count();j++) {
482 if (rdata->mrnas_f[i]->gseq_id!=seqdata.umrnas[j]->gseq_id) continue;
483 if (seqdata.umrnas[j]->strand!='.') continue;
484 uint ustart=seqdata.umrnas[j]->exons.First()->start;
485 uint uend=seqdata.umrnas[j]->exons.Last()->end;
486 uint rstart=rdata->mrnas_f[i]->exons.First()->start;
487 uint rend=rdata->mrnas_f[i]->exons.Last()->end;
488 if (ustart>rend) break;
489 if (rstart>uend) continue;
490 if (rdata->mrnas_f[i]->exonOverlap(ustart,uend)) {
491 seqdata.umrnas[j]->strand='+';
492 }
493 else { //within intron
494 //if (seqdata.umrnas[j]->ulink==NULL ||
495 // seqdata.umrnas[j]->ulink->covlen<rdata->mrnas_f[i]->covlen) {
496 CTData* mdata=(CTData*)seqdata.umrnas[j]->uptr;
497 mdata->addOvl('i',rdata->mrnas_f[i]);
498 }
499 }
500 }
501 for (int i=0;i<rdata->mrnas_r.Count();i++) {
502 for (int j=0;j<seqdata.umrnas.Count();j++) {
503 if (seqdata.umrnas[j]->strand!='.') continue;
504 uint ustart=seqdata.umrnas[j]->exons.First()->start;
505 uint uend=seqdata.umrnas[j]->exons.Last()->end;
506 uint rstart=rdata->mrnas_r[i]->exons.First()->start;
507 uint rend=rdata->mrnas_r[i]->exons.Last()->end;
508 if (ustart>rend) break;
509 if (rstart>uend) continue;
510 if (rdata->mrnas_r[i]->exonOverlap(ustart,uend)) {
511 seqdata.umrnas[j]->strand='-';
512 }
513 else { //within intron
514 CTData* mdata=(CTData*)seqdata.umrnas[j]->uptr;
515 mdata->addOvl('i',rdata->mrnas_r[i]);
516 }
517
518 }
519 }
520 }//we have reference transcripts
521 //---- now compare to other qry transcripts
522 for (int i=0;i<seqdata.mrnas_f.Count();i++) {
523 for (int j=0;j<seqdata.umrnas.Count();j++) {
524 if (seqdata.umrnas[j]->strand!='.') continue;
525 uint ustart=seqdata.umrnas[j]->exons.First()->start;
526 uint uend=seqdata.umrnas[j]->exons.Last()->end;
527 uint rstart=seqdata.mrnas_f[i]->exons.First()->start;
528 uint rend=seqdata.mrnas_f[i]->exons.Last()->end;
529 if (ustart>rend) break;
530 if (rstart>uend) continue;
531 if (seqdata.mrnas_f[i]->exonOverlap(ustart,uend)) {
532 seqdata.umrnas[j]->strand='+';
533 }
534 }
535 }
536 for (int i=0;i<seqdata.mrnas_r.Count();i++) {
537 for (int j=0;j<seqdata.umrnas.Count();j++) {
538 if (seqdata.umrnas[j]->strand!='.') continue;
539 uint ustart=seqdata.umrnas[j]->exons.First()->start;
540 uint uend=seqdata.umrnas[j]->exons.Last()->end;
541 uint rstart=seqdata.mrnas_r[i]->exons.First()->start;
542 uint rend=seqdata.mrnas_r[i]->exons.Last()->end;
543 if (ustart>rend) break;
544 if (rstart>uend) continue;
545 //overlap
546 if (seqdata.mrnas_r[i]->exonOverlap(ustart,uend)) {
547 seqdata.umrnas[j]->strand='-';
548 }
549 }
550 }
551 int fcount=0;
552 int fixed=0;
553 for (int i=0;i<seqdata.umrnas.Count();i++) {
554 if (seqdata.umrnas[i]->strand=='+') {
555 seqdata.mrnas_f.Add(seqdata.umrnas[i]);
556 fixed++;
557 seqdata.umrnas.Forget(i);
558 }
559 else if (seqdata.umrnas[i]->strand=='-') {
560 seqdata.mrnas_r.Add(seqdata.umrnas[i]);
561 seqdata.umrnas.Forget(i);
562 fixed++;
563 }
564 else { //discard mRNAs not settled
565 //seqdata.umrnas[i]->strand='.'; ?
566 if (fdis!=NULL) {
567 seqdata.umrnas[i]->printGtf(fdis);
568 }
569 fcount++;
570 }
571 }
572
573 seqdata.umrnas.Pack();
574 //if (gtf_tracking_verbose) {
575 // GMessage(" %d out of %d (%d left, %d) unoriented transfrags were assigned a strand based on overlaps.\n", fixed, incount, seqdata.umrnas.Count(), fcount);
576 //}
577 return fixed;
578 }
579
580 //retrieve ref_data for a specific genomic sequence
getRefData(int gid,GList<GSeqData> & ref_data)581 GSeqData* getRefData(int gid, GList<GSeqData>& ref_data) {
582 int ri=-1;
583 GSeqData f(gid);
584 GSeqData* r=NULL;
585 if (ref_data.Found(&f,ri))
586 r=ref_data[ri];
587 return r;
588 }
589
read_transcripts(FILE * f,GList<GSeqData> & seqdata,boost::crc_32_type & crc_result,bool keepAttrs)590 void read_transcripts(FILE* f, GList<GSeqData>& seqdata,
591 #ifdef CUFFLINKS
592 boost::crc_32_type& crc_result,
593 #endif
594 bool keepAttrs) {
595 rewind(f);
596 GffReader gffr(f, true); //loading only recognizable transcript features
597 gffr.showWarnings(gtf_tracking_verbose);
598
599 // keepAttrs mergeCloseExons noExonAttrs
600 gffr.readAll(keepAttrs, true, true);
601 #ifdef CUFFLINKS
602 crc_result = gffr.current_crc_result();
603 #endif
604 // is_ref? check_for_dups,
605 parse_mRNAs(gffr.gflst, seqdata, false, 0);
606 }
607
cmpGSeqByName(const pointer p1,const pointer p2)608 int cmpGSeqByName(const pointer p1, const pointer p2) {
609 return strcmp(((GSeqData*)p1)->gseq_name, ((GSeqData*)p2)->gseq_name);
610 }
611
sort_GSeqs_byName(GList<GSeqData> & seqdata)612 void sort_GSeqs_byName(GList<GSeqData>& seqdata) {
613 seqdata.setSorted(&cmpGSeqByName);
614 }
615
read_mRNAs(FILE * f,GList<GSeqData> & seqdata,GList<GSeqData> * ref_data,int check_for_dups,int qfidx,const char * fname,bool only_multiexon)616 void read_mRNAs(FILE* f, GList<GSeqData>& seqdata, GList<GSeqData>* ref_data,
617 int check_for_dups, int qfidx, const char* fname, bool only_multiexon) {
618 //>>>>> read all transcripts/features from a GTF/GFF3 file
619 //int imrna_counter=0;
620 #ifdef HEAPROFILE
621 if (IsHeapProfilerRunning())
622 HeapProfilerDump("00");
623 #endif
624 int loci_counter=0;
625 if (ref_data==NULL) ref_data=&seqdata;
626 bool isRefData=(&seqdata==ref_data);
627 //(f, transcripts_only)
628 GffReader* gffr=new GffReader(f, true); //load only transcript annotations
629 gffr->showWarnings(gtf_tracking_verbose);
630 // keepAttrs mergeCloseExons noExonAttrs
631 gffr->readAll(!isRefData, true, isRefData || gtf_tracking_largeScale);
632 //so it will read exon attributes only for low number of Cufflinks files
633 #ifdef HEAPROFILE
634 if (IsHeapProfilerRunning())
635 HeapProfilerDump("post_readAll");
636 #endif
637
638 int d=parse_mRNAs(gffr->gflst, seqdata, isRefData, check_for_dups, qfidx,only_multiexon);
639 #ifdef HEAPROFILE
640 if (IsHeapProfilerRunning())
641 HeapProfilerDump("post_parse_mRNAs");
642 #endif
643 if (gtf_tracking_verbose && d>0) {
644 if (isRefData) GMessage(" %d duplicate reference transcripts discarded.\n",d);
645 else GMessage(" %d redundant cufflinks transfrags discarded.\n",d);
646 }
647 //imrna_counter=gffr->mrnas.Count();
648 delete gffr; //free the extra memory and unused GffObjs
649 #ifdef HEAPROFILE
650 if (IsHeapProfilerRunning())
651 HeapProfilerDump("post_del_gffr");
652 #endif
653
654 //for each genomic sequence, cluster transcripts
655 int oriented_by_overlap=0;
656 int initial_unoriented=0;
657 int final_unoriented=0;
658 GStr bname(fname);
659 GStr s;
660 if (!bname.is_empty()) {
661 int di=bname.rindex('.');
662 if (di>0) bname.cut(di);
663 int p=bname.rindex('/');
664 if (p<0) p=bname.rindex('\\');
665 if (p>=0) bname.remove(0,p);
666 }
667 FILE* fdis=NULL;
668 FILE* frloci=NULL;
669
670 for (int g=0;g<seqdata.Count();g++) {
671 //find the corresponding refseqdata with the same gseq_id
672 int gseq_id=seqdata[g]->get_gseqid();
673 if (!isRefData) { //cufflinks data, find corresponding ref data
674 GSeqData* rdata=getRefData(gseq_id, *ref_data);
675 initial_unoriented+=seqdata[g]->umrnas.Count();
676 if (seqdata[g]->umrnas.Count()>0) {
677 oriented_by_overlap+=fix_umrnas(*seqdata[g], rdata, fdis);
678 final_unoriented+=seqdata[g]->umrnas.Count();
679 }
680 }
681 //>>>>> group mRNAs into locus-clusters (based on exon overlap)
682 cluster_mRNAs(seqdata[g]->mrnas_f, seqdata[g]->loci_f, qfidx);
683 cluster_mRNAs(seqdata[g]->mrnas_r, seqdata[g]->loci_r, qfidx);
684 if (!isRefData) {
685 cluster_mRNAs(seqdata[g]->umrnas, seqdata[g]->nloci_u, qfidx);
686 }
687 loci_counter+=seqdata[g]->loci_f.Count();
688 loci_counter+=seqdata[g]->loci_r.Count();
689 // if (refData) {
690 // if (frloci==NULL) {
691 // s=bname;
692 // s.append(".loci.lst");
693 // frloci=fopen(s.chars(), "w");
694 // }
695 // writeLoci(frloci, seqdata[g]->loci_f);
696 // writeLoci(frloci, seqdata[g]->loci_r);
697 // }//write ref loci
698 }//for each genomic sequence
699 if (fdis!=NULL) fclose(fdis);
700 if (frloci!=NULL) fclose(frloci);
701 if (initial_unoriented || final_unoriented) {
702 if (gtf_tracking_verbose) GMessage(" Found %d transfrags with undetermined strand (%d out of initial %d were fixed by overlaps)\n",
703 final_unoriented, oriented_by_overlap, initial_unoriented);
704 }
705 //if (fdis!=NULL) remove(s.chars()); remove 0-length file
706 #ifdef HEAPROFILE
707 if (IsHeapProfilerRunning())
708 HeapProfilerDump("post_cluster");
709 #endif
710 }
711
qsearch_mrnas(uint x,GList<GffObj> & mrnas)712 int qsearch_mrnas(uint x, GList<GffObj>& mrnas) {
713 //binary search
714 //do the simplest tests first:
715 if (mrnas[0]->start>x) return 0;
716 if (mrnas.Last()->start<x) return -1;
717 uint istart=0;
718 int i=0;
719 int idx=-1;
720 int maxh=mrnas.Count()-1;
721 int l=0;
722 int h = maxh;
723 while (l <= h) {
724 i = (l+h)>>1;
725 istart=mrnas[i]->start;
726 if (istart < x) l = i + 1;
727 else {
728 if (istart == x) { //found matching coordinate here
729 idx=i;
730 while (idx<=maxh && mrnas[idx]->start==x) {
731 idx++;
732 }
733 return (idx>maxh) ? -1 : idx;
734 }
735 h = i - 1;
736 }
737 } //while
738 idx = l;
739 while (idx<=maxh && mrnas[idx]->start<=x) {
740 idx++;
741 }
742 return (idx>maxh) ? -1 : idx;
743 }
744
qsearch_loci(uint x,GList<GLocus> & loci)745 int qsearch_loci(uint x, GList<GLocus>& loci) {
746 // same as above, but for GSeg lists
747 //binary search
748 //do the simplest tests first:
749 if (loci[0]->start>x) return 0;
750 if (loci.Last()->start<x) return -1;
751 uint istart=0;
752 int i=0;
753 int idx=-1;
754 int maxh=loci.Count()-1;
755 int l=0;
756 int h = maxh;
757 while (l <= h) {
758 i = (l + h) >> 1;
759 istart=loci[i]->start;
760 if (istart < x) l=i+1;
761 else {
762 if (istart == x) { //found matching coordinate here
763 idx=i;
764 while (idx<=maxh && loci[idx]->start==x) {
765 idx++;
766 }
767 return (idx>maxh) ? -1 : idx;
768 }
769 h=i-1;
770 }
771 } //while
772 idx = l;
773 while (idx<=maxh && loci[idx]->start<=x) {
774 idx++;
775 }
776 return (idx>maxh) ? -1 : idx;
777 }
778
779