1 //#include "tablemaker.h"
2 #include "rlink.h"
3 #include <numeric>
4 
5 extern GStr ballgown_dir;
6 
rc_cov_inc(int i)7 int rc_cov_inc(int i) {
8   return ++i;
9 }
10 
keepGuide(GffObj * t,GPVec<RC_TData> * rc_tdata,GPVec<RC_Feature> * rc_edata,GPVec<RC_Feature> * rc_idata)11 void BundleData::keepGuide(GffObj* t, GPVec<RC_TData>* rc_tdata,
12 		 GPVec<RC_Feature>* rc_edata, GPVec<RC_Feature>* rc_idata) {
13 	if (rc_data==NULL) {
14 	  rc_init(t, rc_tdata, rc_edata, rc_idata);
15 	}
16 	keepguides.Add(t);
17 	t->udata=(int)rc_data->addTranscript(*t);
18 }
19 
20 struct COvlSorter {
operator ()COvlSorter21   bool operator() (pair<int, const RC_Feature*> i,
22       pair<int, const RC_Feature*> j) {
23     return (i.first>j.first); //sort in decreasing order of overlap length
24   }
25 } OvlSorter;
26 
rc_updateExonCounts(const RC_ExonOvl & exonovl,int nh)27 void rc_updateExonCounts(const RC_ExonOvl& exonovl, int nh) {
28   //this only gets read overlaps > 5bp and otherwise filtered in evalReadAln()
29   exonovl.feature->rcount++;
30   if (nh>1) {
31 	  exonovl.feature->mrcount += (1.0/nh);
32 	  exonovl.feature->movlcount += ((double)exonovl.ovlen/nh);
33   }
34   else { // nh<=1
35 	  exonovl.feature->mrcount++;
36 	  exonovl.feature->movlcount += exonovl.ovlen;
37 	  exonovl.feature->ucount++;
38   }
39 }
40 
evalReadAln(GReadAlnData & alndata,char & xstrand)41 bool BundleData::evalReadAln(GReadAlnData& alndata, char& xstrand) {
42 	            //GBamRecord& brec, char& strand, int nh) {
43  if (rc_data==NULL) {
44 	  return false; //no ref transcripts available for this reads' region
45  }
46  GBamRecord& brec=*(alndata.brec);
47  int mate_pos=brec.mate_start();
48  int nh=alndata.nh;
49  if ((int)brec.end<rc_data->lmin || (int)brec.start>rc_data->rmax) {
50 	 return false; //hit outside coverage area
51  }
52  if (rc_data->g_exons.Count()==0 || rc_data->g_tdata.Count()==0)
53 	 return false; //nothing to do without transcripts
54  //check this read alignment against ref exons and introns
55  char strandbits=0;
56  bool result=false;
57  bool is_in_guide=true; // exons and junctions are in reference transcripts but they might be in different guides
58  for (int i=0;i<brec.exons.Count();i++) {
59 	 if (ballgown)
60 		 rc_data->updateCov(xstrand, nh, brec.exons[i].start, brec.exons[i].len());
61 	 GArray<RC_ExonOvl> exonOverlaps(true, true); //overlaps sorted by decreasing length
62 	 if (rc_data->findOvlExons(exonOverlaps, brec.exons[i].start,
63 			 brec.exons[i].end, xstrand, mate_pos)) {
64 		 result=true;
65 		 int max_ovl=exonOverlaps[0].ovlen;
66 		 if(is_in_guide && (uint)max_ovl<brec.exons[i].len()) is_in_guide=false;
67 		 //alndata.g_exonovls.Add(new GVec<RC_ExonOvl>(exonOverlaps));
68 			 for (int k=0;k<exonOverlaps.Count();++k) {
69 				 //if (exonOverlaps[k].ovlen < 5) break; //ignore very short overlaps
70 				 if (k && (exonOverlaps[k].mate_ovl < exonOverlaps[0].mate_ovl
71 						 || exonOverlaps[k].ovlen+5<max_ovl) )
72 					 break; //ignore further overlaps after a mate matched or if they are shorter than max_overlap-5
73 				 if (exonOverlaps[k].feature->strand=='+') strandbits |= 0x01;
74 				 else if (exonOverlaps[k].feature->strand=='-') strandbits |= 0x02;
75 				 //TODO: perhaps we could use a better approach for non-overlapping ref exons
76 				 //      spanned by this same read alignment
77 				 //counting this overlap for multiple exons if it's similarly large
78 				 //(in the shared region of overlapping exons)
79 				 rc_updateExonCounts(exonOverlaps[k], nh);
80 			 }
81 	 } //ref exon overlaps
82 	 if (i>0) { //intron processing
83 		 int j_l=brec.exons[i-1].end+1;
84 		 int j_r=brec.exons[i].start-1;
85 		 RC_Feature* ri=rc_data->findIntron(j_l, j_r, xstrand);
86 		 alndata.juncs.Add(new CJunction(j_l, j_r)); //don't set strand, etc. for now
87 		 if (ri) { //update guide intron counts
88 			 ri->rcount++;
89 			 ri->mrcount += (nh > 1) ? (1.0/nh) : 1;
90 			 if (nh==1)  ri->ucount++;
91 			 alndata.juncs.Last()->guide_match=1;
92 		 }
93 		 else is_in_guide=false;
94 
95 	 } //intron processing
96  }
97  if (xstrand=='.' && strandbits && strandbits<3) {
98 	xstrand = (strandbits==1) ? '+' : '-';
99  }
100 
101  if(!mergeMode && is_in_guide) alndata.in_guide=true;
102 
103  return result;
104 }
105 
rc_fwopen(const char * fname)106 FILE* rc_fwopen(const char* fname) {
107  if (strcmp(fname,"-")==0) return stdout;
108  GStr fpath(ballgown_dir); //always ends with '/'
109  //fpath += '.';
110  fpath += fname;
111  //fpath += "/";
112  //fpath += fname;
113  fpath += ".ctab";
114  FILE* fh=fopen(fpath.chars(), "w");
115  if (fh==NULL) {
116    fprintf(stderr, "Error: cannot create file %s\n",
117         fpath.chars());
118    exit(1);
119    }
120  return fh;
121 }
122 
rc_frenopen(const char * fname)123 FILE* rc_frenopen(const char* fname) {
124 	//if (strcmp(fname,"-")==0) return stdout;
125 	GStr fpath(ballgown_dir);
126 	//fpath += '.';
127 	fpath += fname;
128 	//fpath += "/";
129 	//fpath += fname;
130 	fpath += ".ctab";
131 	GStr fren(fpath);
132 	fren += ".tmp";
133 	//rename fpath to fren and open fren for reading
134 	if (rename(fpath.chars(), fren.chars())!=0) {
135 		 GError("Error: cannot rename %s to %s!\n", fpath.chars(), fren.chars());
136 	}
137 	FILE* fh=fopen(fren.chars(), "r");
138 	if (fh==NULL) {
139 	  GError("Error: cannot open file %s\n", fren.chars());
140 	}
141 	return fh;
142 }
143 
rc_frendel(const char * fname)144 void rc_frendel(const char* fname) {
145 	GStr fpath(ballgown_dir);
146 	//fpath += '.';
147 	fpath += fname;
148 	fpath += ".ctab";
149 	fpath += ".tmp";
150 	if (remove(fpath.chars())!=0) {
151 		GMessage("Warning: could not remove file %s!\n",fpath.chars());
152 	}
153 }
154 
rc_write_f2t(FILE * fh,map<uint,set<uint>> & f2t)155 void rc_write_f2t(FILE* fh, map<uint, set<uint> >& f2t) {
156   for (map<uint, set<uint> >::iterator m=f2t.begin(); m!=f2t.end(); ++m) {
157     uint f_id=(*m).first;
158     set<uint>& tset = (*m).second;
159     for (set<uint>::iterator it=tset.begin();it!=tset.end();++it) {
160  	 uint t_id = *it;
161  	 fprintf(fh, "%u\t%u\n", f_id, t_id);
162     }
163   }
164   fflush(fh);
165 }
166 
167 
rc_update_exons(RC_BundleData & rc)168 void rc_update_exons(RC_BundleData& rc) {
169 	//update stdev etc. for all exons in bundle
170 	for (int f=0;f<rc.g_exons.Count(); ++f) {
171       RC_Feature& exon = *(rc.g_exons[f]);
172       //assert( exon.l >= rc.lmin );
173       int L=exon.l-rc.lmin;
174       int xlen=exon.r-exon.l+1;
175       if (exon.l < rc.lmin) {
176     	  //shouldn't be here, bundle read-counting boundaries should be based on exons!
177     	  if (exon.r<rc.lmin) continue;
178     	  xlen-=(rc.lmin-exon.l+1);
179     	  L=0;
180       }
181       if (rc.rmax<exon.r) {
182     	  if (exon.l>rc.rmax) continue; //should never happen
183     	  xlen-=(exon.r-rc.rmax+1);
184       }
185       int R=L+xlen;
186       vector<int>::iterator xcov_begin;
187       vector<int>::iterator xcov_end;
188       vector<float>::iterator xmcov_begin;
189       vector<float>::iterator xmcov_end;
190       if (exon.strand=='+' || exon.strand=='.') {
191          xcov_begin  = rc.f_cov.begin()+L;
192          xcov_end = rc.f_cov.begin()+R;
193          xmcov_begin = rc.f_mcov.begin()+L;
194          xmcov_end = rc.f_mcov.begin()+R;
195       } else {
196         xcov_begin  = rc.r_cov.begin()+L;
197         xcov_end = rc.r_cov.begin()+R;
198         xmcov_begin = rc.r_mcov.begin()+L;
199         xmcov_end = rc.r_mcov.begin()+R;
200       }
201 
202       double avg = (double)accumulate(xcov_begin, xcov_end, 0) / xlen;
203       vector<double> diff(xlen);
204       transform(xcov_begin, xcov_end, diff.begin(),
205                      bind2nd( minus<double>(), avg));
206       double sq_sum = inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
207       double stdev = sqrt(sq_sum / xlen);
208 
209       double mavg = (double)accumulate(xmcov_begin, xmcov_end, 0) / xlen;
210       vector<double> mdiff(xlen);
211       transform(xmcov_begin, xmcov_end, mdiff.begin(),
212                      bind2nd( minus<double>(), mavg));
213       sq_sum = inner_product(mdiff.begin(), mdiff.end(), mdiff.begin(), 0.0);
214       double mstdev = sqrt(sq_sum / xlen);
215       exon.avg=avg;
216       exon.stdev=stdev;
217       exon.mavg=mavg;
218       exon.mstdev=mstdev;
219 	} //for each exon in bundle
220 
221 
222 }
223 
224 
rc_write_RCfeature(GPVec<RC_TData> & rcdata,GPVec<RC_Feature> & features,FILE * & fdata,FILE * & f2t,bool is_exon=false)225 void rc_write_RCfeature( GPVec<RC_TData>& rcdata, GPVec<RC_Feature>& features, FILE*& fdata, FILE*& f2t,
226 		               bool is_exon=false) {
227   for (int i=0;i<features.Count();++i) {
228 	RC_Feature& f=*(features[i]);
229 	const char* ref_name=rcdata[f.t_ids[0]-1]->ref_t->getGSeqName();
230 	if (is_exon) {
231 	  fprintf(fdata, "%u\t%s\t%c\t%d\t%d\t%d\t%d\t%.2f\t%.4f\t%.4f\t%.4f\t%.4f\n",
232 		  f.id, ref_name, f.strand, f.l, f.r, f.rcount,
233 		  f.ucount, f.mrcount, f.avg, f.stdev, f.mavg, f.mstdev);
234 	}
235     else { //introns
236 	  fprintf(fdata,"%u\t%s\t%c\t%d\t%d\t%d\t%d\t%.2f\n",f.id, ref_name,
237 		  f.strand, f.l, f.r, f.rcount, f.ucount, f.mrcount);
238 	}
239   // f2t -------
240 	for (int t=0;t<f.t_ids.Count();++t)
241 	   fprintf(f2t, "%u\t%u\n", f.id, f.t_ids[t]);
242   } //for each feature
243   fclose(fdata);
244   fclose(f2t);
245 }
246 
247 
248 /*void rc_write_counts(const char* refname, BundleData& bundle) {
249  RC_BundleData& rc = *bundle.rc_data;
250  //if (rc.exons.size()==0) return;
251   *
252 */
rc_writeRC(GPVec<RC_TData> & RC_data,GPVec<RC_Feature> & RC_exons,GPVec<RC_Feature> & RC_introns,FILE * & f_tdata,FILE * & f_edata,FILE * & f_idata,FILE * & f_e2t,FILE * & f_i2t)253 void rc_writeRC(GPVec<RC_TData>& RC_data,
254 		GPVec<RC_Feature>& RC_exons,
255 		GPVec<RC_Feature>& RC_introns,
256 		FILE* &f_tdata, FILE* &f_edata, FILE* &f_idata,
257         FILE* &f_e2t, FILE* &f_i2t) {
258 
259  for (int t=0;t<RC_data.Count();++t) {
260   //File: t_data.ctab
261   //t_id tname chr strand start end num_exons gene_id gene_name cufflinks_cov cufflinks_fpkm
262   RC_TData& sd=*RC_data[t];
263    const char* refname = sd.ref_t->getGSeqName();
264    const char* genename= sd.ref_t->getGeneName();
265    const char* geneID= sd.ref_t->getGeneID();
266    if (genename==NULL) genename=".";
267    if (geneID==NULL) geneID=".";
268 
269    fprintf(f_tdata, "%u\t%s\t%c\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%f\t%f\n",
270 	  sd.t_id, refname, sd.ref_t->strand, sd.l, sd.r, sd.ref_t->getID(),
271 	  sd.t_exons.Count(), sd.eff_len, geneID,
272 	  genename, sd.cov, sd.fpkm);
273  }//for each transcript
274  //fflush(f_tdata);
275  fclose(f_tdata);
276 
277  //File: e_data.ctab
278  //e_id chr gstart gend rcount ucount mrcount
279  rc_write_RCfeature(RC_data, RC_exons, f_edata, f_e2t, true);
280  //File: i_data.ctab
281  //i_id chr gstart gend rcount ucount mrcount
282  rc_write_RCfeature(RC_data, RC_introns, f_idata, f_i2t);
283 }
284 
rc_addFeatures(uint & c_e_id,GList<RC_Feature> & exonSet,GPVec<RC_Feature> & exonTable,uint & c_i_id,GList<RC_Feature> & intronSet,GPVec<RC_Feature> & intronTable)285 void RC_TData::rc_addFeatures(uint& c_e_id, GList<RC_Feature>& exonSet, GPVec<RC_Feature>& exonTable,
286                     uint& c_i_id, GList<RC_Feature>& intronSet, GPVec<RC_Feature>& intronTable) {
287   GASSERT(ref_t);
288   GffObj& m = *(ref_t);
289   int ecache_idx = exonSet.Count()-1;
290   int icache_idx = intronSet.Count()-1;
291   //int ecache_idx = e_idx_cache>=0 ? e_idx_cache : exonSet.Count()-1;
292   //int icache_idx = i_idx_cache>=0 ? i_idx_cache : intronSet.Count()-1;
293   for (int i = 0; i < m.exons.Count(); ++i)  {
294     addFeature((int)m.exons[i]->start, (int)m.exons[i]->end, t_exons, c_e_id, exonSet, exonTable, ecache_idx);
295     //if (i==0) e_idx_cache=ecache_idx;
296     if (i>0) { //store intron
297       //if (i==1) i_idx_cache=icache_idx;
298       addFeature(m.exons[i-1]->end+1, m.exons[i]->start-1, t_introns, c_i_id,
299     		  intronSet, intronTable, icache_idx);
300     } //for each intron
301   } //for each exon
302 }
303 
addFeature(int fl,int fr,GPVec<RC_Feature> & fvec,uint & f_id,GList<RC_Feature> & fset,GPVec<RC_Feature> & fdata,int & cache_idx)304 void RC_TData::addFeature(int fl, int fr, GPVec<RC_Feature>& fvec,
305                           uint& f_id, GList<RC_Feature>& fset,
306 						  GPVec<RC_Feature>& fdata, int& cache_idx) {
307   //f_id is the largest f_id inserted so far in fset
308   bool add_new = true;
309   RC_Feature* newseg=new RC_Feature(fl, fr, ref_t->strand, 0, this->t_id);
310   //RC_Feature* newfeature=NULL;
311   int fit=cache_idx<0 ? fset.Count()-1 : cache_idx;
312   int fp_id=-1;
313   if (fset.Count()>0) {
314     if (*newseg < *(fset[fit])) {
315       bool eq=false;
316       while (*newseg < *(fset[fit]) || (eq = (*newseg==*(fset[fit])))) {
317         if (eq) {
318           add_new = false;
319           fp_id = fset[fit]->id; //fset[fit]->id;
320           break;
321         }
322         //newseg< fset[fit]
323         --fit;
324         if (fit<0) break; //newseg should be inserted at 0
325       } //while newseg<fset[fit]
326       if (add_new) ++fit;
327          // newseg < fset[fit+1]
328          //we'll insert newseg at position fit+1
329     }
330     else { //newseg >= *fset[fit]
331       bool eq=false;
332       while (*(fset[fit]) < *newseg || (eq = (*newseg==*(fset[fit])))) {
333         if (eq) {
334           add_new = false;
335           fp_id = fset[fit]->id;
336           break;
337         }
338         ++fit;
339         if (fit==fset.Count()) {
340         	//newseg should be appended to the list
341         	break;
342         }
343       }
344       //if (fit<=fset.Count() && !add_new) {
345          //fset[fit-1] < newseg  < fset[fit]
346          //we'll insert newseg at position fit
347       //}
348     }
349   } //check existing set
350   if (add_new) { //did not see this feature before
351     newseg->id = ++f_id;
352     if (fit<0) fit = fset.Add(newseg);
353     else fset.sortInsert(fit, newseg);
354     if (fit<0) {
355       GError("Error: feature %d-%d (%c) already in feature set!\n",
356                  newseg->l, newseg->r, newseg->strand);
357     }
358     cache_idx=fit;
359     fp_id=fdata.Add(newseg)+1;
360 
361 #ifdef DEBUG
362     if (fdata.Count()!=(int)f_id) {
363     	GMessage("Error: fdata.Count=%d, f_id=%d\n", fdata.Count(), f_id);
364     }
365 #endif
366     GASSERT((uint)fdata.Count()==f_id);
367   }
368   else { //feature seen before, update its parent list
369    fdata[fp_id-1]->t_ids.Add(this->t_id);
370    delete newseg;
371   }
372   //fvec.push_back(newseg);
373   GASSERT(fdata[fp_id-1]->id==(uint)fp_id);
374   fvec.Add(fdata[fp_id-1]);
375 }
376 
Ballgown_setupFiles(FILE * & f_tdata,FILE * & f_edata,FILE * & f_idata,FILE * & f_e2t,FILE * & f_i2t)377 void Ballgown_setupFiles(FILE* &f_tdata, FILE* &f_edata, FILE* &f_idata,
378 	            FILE* &f_e2t, FILE* &f_i2t) {
379   if (f_tdata == NULL) {
380 	//first call, create the files
381 	 f_tdata = rc_fwopen("t_data");
382 	 fprintf(f_tdata, "t_id\tchr\tstrand\tstart\tend\tt_name\tnum_exons\tlength\tgene_id\tgene_name\tcov\tFPKM\n");
383 	 f_edata = rc_fwopen("e_data");
384     fprintf(f_edata, "e_id\tchr\tstrand\tstart\tend\trcount\tucount\tmrcount\tcov\tcov_sd\tmcov\tmcov_sd\n");
385 	 f_idata = rc_fwopen("i_data");
386     fprintf(f_idata, "i_id\tchr\tstrand\tstart\tend\trcount\tucount\tmrcount\n");
387     f_e2t = rc_fwopen("e2t");
388     fprintf(f_e2t,  "e_id\tt_id\n");
389     f_i2t = rc_fwopen("i2t");
390     fprintf(f_i2t,  "i_id\tt_id\n");
391   }
392 }
393