1 //#include "tablemaker.h"
2 #include "rlink.h"
3 #include <numeric>
4
5 extern GStr ballgown_dir;
6
rc_cov_inc(int i)7 int rc_cov_inc(int i) {
8 return ++i;
9 }
10
keepGuide(GffObj * t,GPVec<RC_TData> * rc_tdata,GPVec<RC_Feature> * rc_edata,GPVec<RC_Feature> * rc_idata)11 void BundleData::keepGuide(GffObj* t, GPVec<RC_TData>* rc_tdata,
12 GPVec<RC_Feature>* rc_edata, GPVec<RC_Feature>* rc_idata) {
13 if (rc_data==NULL) {
14 rc_init(t, rc_tdata, rc_edata, rc_idata);
15 }
16 keepguides.Add(t);
17 t->udata=(int)rc_data->addTranscript(*t);
18 }
19
20 struct COvlSorter {
operator ()COvlSorter21 bool operator() (pair<int, const RC_Feature*> i,
22 pair<int, const RC_Feature*> j) {
23 return (i.first>j.first); //sort in decreasing order of overlap length
24 }
25 } OvlSorter;
26
rc_updateExonCounts(const RC_ExonOvl & exonovl,int nh)27 void rc_updateExonCounts(const RC_ExonOvl& exonovl, int nh) {
28 //this only gets read overlaps > 5bp and otherwise filtered in evalReadAln()
29 exonovl.feature->rcount++;
30 if (nh>1) {
31 exonovl.feature->mrcount += (1.0/nh);
32 exonovl.feature->movlcount += ((double)exonovl.ovlen/nh);
33 }
34 else { // nh<=1
35 exonovl.feature->mrcount++;
36 exonovl.feature->movlcount += exonovl.ovlen;
37 exonovl.feature->ucount++;
38 }
39 }
40
evalReadAln(GReadAlnData & alndata,char & xstrand)41 bool BundleData::evalReadAln(GReadAlnData& alndata, char& xstrand) {
42 //GBamRecord& brec, char& strand, int nh) {
43 if (rc_data==NULL) {
44 return false; //no ref transcripts available for this reads' region
45 }
46 GBamRecord& brec=*(alndata.brec);
47 int mate_pos=brec.mate_start();
48 int nh=alndata.nh;
49 if ((int)brec.end<rc_data->lmin || (int)brec.start>rc_data->rmax) {
50 return false; //hit outside coverage area
51 }
52 if (rc_data->g_exons.Count()==0 || rc_data->g_tdata.Count()==0)
53 return false; //nothing to do without transcripts
54 //check this read alignment against ref exons and introns
55 char strandbits=0;
56 bool result=false;
57 bool is_in_guide=true; // exons and junctions are in reference transcripts but they might be in different guides
58 for (int i=0;i<brec.exons.Count();i++) {
59 if (ballgown)
60 rc_data->updateCov(xstrand, nh, brec.exons[i].start, brec.exons[i].len());
61 GArray<RC_ExonOvl> exonOverlaps(true, true); //overlaps sorted by decreasing length
62 if (rc_data->findOvlExons(exonOverlaps, brec.exons[i].start,
63 brec.exons[i].end, xstrand, mate_pos)) {
64 result=true;
65 int max_ovl=exonOverlaps[0].ovlen;
66 if(is_in_guide && (uint)max_ovl<brec.exons[i].len()) is_in_guide=false;
67 //alndata.g_exonovls.Add(new GVec<RC_ExonOvl>(exonOverlaps));
68 for (int k=0;k<exonOverlaps.Count();++k) {
69 //if (exonOverlaps[k].ovlen < 5) break; //ignore very short overlaps
70 if (k && (exonOverlaps[k].mate_ovl < exonOverlaps[0].mate_ovl
71 || exonOverlaps[k].ovlen+5<max_ovl) )
72 break; //ignore further overlaps after a mate matched or if they are shorter than max_overlap-5
73 if (exonOverlaps[k].feature->strand=='+') strandbits |= 0x01;
74 else if (exonOverlaps[k].feature->strand=='-') strandbits |= 0x02;
75 //TODO: perhaps we could use a better approach for non-overlapping ref exons
76 // spanned by this same read alignment
77 //counting this overlap for multiple exons if it's similarly large
78 //(in the shared region of overlapping exons)
79 rc_updateExonCounts(exonOverlaps[k], nh);
80 }
81 } //ref exon overlaps
82 if (i>0) { //intron processing
83 int j_l=brec.exons[i-1].end+1;
84 int j_r=brec.exons[i].start-1;
85 RC_Feature* ri=rc_data->findIntron(j_l, j_r, xstrand);
86 alndata.juncs.Add(new CJunction(j_l, j_r)); //don't set strand, etc. for now
87 if (ri) { //update guide intron counts
88 ri->rcount++;
89 ri->mrcount += (nh > 1) ? (1.0/nh) : 1;
90 if (nh==1) ri->ucount++;
91 alndata.juncs.Last()->guide_match=1;
92 }
93 else is_in_guide=false;
94
95 } //intron processing
96 }
97 if (xstrand=='.' && strandbits && strandbits<3) {
98 xstrand = (strandbits==1) ? '+' : '-';
99 }
100
101 if(!mergeMode && is_in_guide) alndata.in_guide=true;
102
103 return result;
104 }
105
rc_fwopen(const char * fname)106 FILE* rc_fwopen(const char* fname) {
107 if (strcmp(fname,"-")==0) return stdout;
108 GStr fpath(ballgown_dir); //always ends with '/'
109 //fpath += '.';
110 fpath += fname;
111 //fpath += "/";
112 //fpath += fname;
113 fpath += ".ctab";
114 FILE* fh=fopen(fpath.chars(), "w");
115 if (fh==NULL) {
116 fprintf(stderr, "Error: cannot create file %s\n",
117 fpath.chars());
118 exit(1);
119 }
120 return fh;
121 }
122
rc_frenopen(const char * fname)123 FILE* rc_frenopen(const char* fname) {
124 //if (strcmp(fname,"-")==0) return stdout;
125 GStr fpath(ballgown_dir);
126 //fpath += '.';
127 fpath += fname;
128 //fpath += "/";
129 //fpath += fname;
130 fpath += ".ctab";
131 GStr fren(fpath);
132 fren += ".tmp";
133 //rename fpath to fren and open fren for reading
134 if (rename(fpath.chars(), fren.chars())!=0) {
135 GError("Error: cannot rename %s to %s!\n", fpath.chars(), fren.chars());
136 }
137 FILE* fh=fopen(fren.chars(), "r");
138 if (fh==NULL) {
139 GError("Error: cannot open file %s\n", fren.chars());
140 }
141 return fh;
142 }
143
rc_frendel(const char * fname)144 void rc_frendel(const char* fname) {
145 GStr fpath(ballgown_dir);
146 //fpath += '.';
147 fpath += fname;
148 fpath += ".ctab";
149 fpath += ".tmp";
150 if (remove(fpath.chars())!=0) {
151 GMessage("Warning: could not remove file %s!\n",fpath.chars());
152 }
153 }
154
rc_write_f2t(FILE * fh,map<uint,set<uint>> & f2t)155 void rc_write_f2t(FILE* fh, map<uint, set<uint> >& f2t) {
156 for (map<uint, set<uint> >::iterator m=f2t.begin(); m!=f2t.end(); ++m) {
157 uint f_id=(*m).first;
158 set<uint>& tset = (*m).second;
159 for (set<uint>::iterator it=tset.begin();it!=tset.end();++it) {
160 uint t_id = *it;
161 fprintf(fh, "%u\t%u\n", f_id, t_id);
162 }
163 }
164 fflush(fh);
165 }
166
167
rc_update_exons(RC_BundleData & rc)168 void rc_update_exons(RC_BundleData& rc) {
169 //update stdev etc. for all exons in bundle
170 for (int f=0;f<rc.g_exons.Count(); ++f) {
171 RC_Feature& exon = *(rc.g_exons[f]);
172 //assert( exon.l >= rc.lmin );
173 int L=exon.l-rc.lmin;
174 int xlen=exon.r-exon.l+1;
175 if (exon.l < rc.lmin) {
176 //shouldn't be here, bundle read-counting boundaries should be based on exons!
177 if (exon.r<rc.lmin) continue;
178 xlen-=(rc.lmin-exon.l+1);
179 L=0;
180 }
181 if (rc.rmax<exon.r) {
182 if (exon.l>rc.rmax) continue; //should never happen
183 xlen-=(exon.r-rc.rmax+1);
184 }
185 int R=L+xlen;
186 vector<int>::iterator xcov_begin;
187 vector<int>::iterator xcov_end;
188 vector<float>::iterator xmcov_begin;
189 vector<float>::iterator xmcov_end;
190 if (exon.strand=='+' || exon.strand=='.') {
191 xcov_begin = rc.f_cov.begin()+L;
192 xcov_end = rc.f_cov.begin()+R;
193 xmcov_begin = rc.f_mcov.begin()+L;
194 xmcov_end = rc.f_mcov.begin()+R;
195 } else {
196 xcov_begin = rc.r_cov.begin()+L;
197 xcov_end = rc.r_cov.begin()+R;
198 xmcov_begin = rc.r_mcov.begin()+L;
199 xmcov_end = rc.r_mcov.begin()+R;
200 }
201
202 double avg = (double)accumulate(xcov_begin, xcov_end, 0) / xlen;
203 vector<double> diff(xlen);
204 transform(xcov_begin, xcov_end, diff.begin(),
205 bind2nd( minus<double>(), avg));
206 double sq_sum = inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
207 double stdev = sqrt(sq_sum / xlen);
208
209 double mavg = (double)accumulate(xmcov_begin, xmcov_end, 0) / xlen;
210 vector<double> mdiff(xlen);
211 transform(xmcov_begin, xmcov_end, mdiff.begin(),
212 bind2nd( minus<double>(), mavg));
213 sq_sum = inner_product(mdiff.begin(), mdiff.end(), mdiff.begin(), 0.0);
214 double mstdev = sqrt(sq_sum / xlen);
215 exon.avg=avg;
216 exon.stdev=stdev;
217 exon.mavg=mavg;
218 exon.mstdev=mstdev;
219 } //for each exon in bundle
220
221
222 }
223
224
rc_write_RCfeature(GPVec<RC_TData> & rcdata,GPVec<RC_Feature> & features,FILE * & fdata,FILE * & f2t,bool is_exon=false)225 void rc_write_RCfeature( GPVec<RC_TData>& rcdata, GPVec<RC_Feature>& features, FILE*& fdata, FILE*& f2t,
226 bool is_exon=false) {
227 for (int i=0;i<features.Count();++i) {
228 RC_Feature& f=*(features[i]);
229 const char* ref_name=rcdata[f.t_ids[0]-1]->ref_t->getGSeqName();
230 if (is_exon) {
231 fprintf(fdata, "%u\t%s\t%c\t%d\t%d\t%d\t%d\t%.2f\t%.4f\t%.4f\t%.4f\t%.4f\n",
232 f.id, ref_name, f.strand, f.l, f.r, f.rcount,
233 f.ucount, f.mrcount, f.avg, f.stdev, f.mavg, f.mstdev);
234 }
235 else { //introns
236 fprintf(fdata,"%u\t%s\t%c\t%d\t%d\t%d\t%d\t%.2f\n",f.id, ref_name,
237 f.strand, f.l, f.r, f.rcount, f.ucount, f.mrcount);
238 }
239 // f2t -------
240 for (int t=0;t<f.t_ids.Count();++t)
241 fprintf(f2t, "%u\t%u\n", f.id, f.t_ids[t]);
242 } //for each feature
243 fclose(fdata);
244 fclose(f2t);
245 }
246
247
248 /*void rc_write_counts(const char* refname, BundleData& bundle) {
249 RC_BundleData& rc = *bundle.rc_data;
250 //if (rc.exons.size()==0) return;
251 *
252 */
rc_writeRC(GPVec<RC_TData> & RC_data,GPVec<RC_Feature> & RC_exons,GPVec<RC_Feature> & RC_introns,FILE * & f_tdata,FILE * & f_edata,FILE * & f_idata,FILE * & f_e2t,FILE * & f_i2t)253 void rc_writeRC(GPVec<RC_TData>& RC_data,
254 GPVec<RC_Feature>& RC_exons,
255 GPVec<RC_Feature>& RC_introns,
256 FILE* &f_tdata, FILE* &f_edata, FILE* &f_idata,
257 FILE* &f_e2t, FILE* &f_i2t) {
258
259 for (int t=0;t<RC_data.Count();++t) {
260 //File: t_data.ctab
261 //t_id tname chr strand start end num_exons gene_id gene_name cufflinks_cov cufflinks_fpkm
262 RC_TData& sd=*RC_data[t];
263 const char* refname = sd.ref_t->getGSeqName();
264 const char* genename= sd.ref_t->getGeneName();
265 const char* geneID= sd.ref_t->getGeneID();
266 if (genename==NULL) genename=".";
267 if (geneID==NULL) geneID=".";
268
269 fprintf(f_tdata, "%u\t%s\t%c\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%f\t%f\n",
270 sd.t_id, refname, sd.ref_t->strand, sd.l, sd.r, sd.ref_t->getID(),
271 sd.t_exons.Count(), sd.eff_len, geneID,
272 genename, sd.cov, sd.fpkm);
273 }//for each transcript
274 //fflush(f_tdata);
275 fclose(f_tdata);
276
277 //File: e_data.ctab
278 //e_id chr gstart gend rcount ucount mrcount
279 rc_write_RCfeature(RC_data, RC_exons, f_edata, f_e2t, true);
280 //File: i_data.ctab
281 //i_id chr gstart gend rcount ucount mrcount
282 rc_write_RCfeature(RC_data, RC_introns, f_idata, f_i2t);
283 }
284
rc_addFeatures(uint & c_e_id,GList<RC_Feature> & exonSet,GPVec<RC_Feature> & exonTable,uint & c_i_id,GList<RC_Feature> & intronSet,GPVec<RC_Feature> & intronTable)285 void RC_TData::rc_addFeatures(uint& c_e_id, GList<RC_Feature>& exonSet, GPVec<RC_Feature>& exonTable,
286 uint& c_i_id, GList<RC_Feature>& intronSet, GPVec<RC_Feature>& intronTable) {
287 GASSERT(ref_t);
288 GffObj& m = *(ref_t);
289 int ecache_idx = exonSet.Count()-1;
290 int icache_idx = intronSet.Count()-1;
291 //int ecache_idx = e_idx_cache>=0 ? e_idx_cache : exonSet.Count()-1;
292 //int icache_idx = i_idx_cache>=0 ? i_idx_cache : intronSet.Count()-1;
293 for (int i = 0; i < m.exons.Count(); ++i) {
294 addFeature((int)m.exons[i]->start, (int)m.exons[i]->end, t_exons, c_e_id, exonSet, exonTable, ecache_idx);
295 //if (i==0) e_idx_cache=ecache_idx;
296 if (i>0) { //store intron
297 //if (i==1) i_idx_cache=icache_idx;
298 addFeature(m.exons[i-1]->end+1, m.exons[i]->start-1, t_introns, c_i_id,
299 intronSet, intronTable, icache_idx);
300 } //for each intron
301 } //for each exon
302 }
303
addFeature(int fl,int fr,GPVec<RC_Feature> & fvec,uint & f_id,GList<RC_Feature> & fset,GPVec<RC_Feature> & fdata,int & cache_idx)304 void RC_TData::addFeature(int fl, int fr, GPVec<RC_Feature>& fvec,
305 uint& f_id, GList<RC_Feature>& fset,
306 GPVec<RC_Feature>& fdata, int& cache_idx) {
307 //f_id is the largest f_id inserted so far in fset
308 bool add_new = true;
309 RC_Feature* newseg=new RC_Feature(fl, fr, ref_t->strand, 0, this->t_id);
310 //RC_Feature* newfeature=NULL;
311 int fit=cache_idx<0 ? fset.Count()-1 : cache_idx;
312 int fp_id=-1;
313 if (fset.Count()>0) {
314 if (*newseg < *(fset[fit])) {
315 bool eq=false;
316 while (*newseg < *(fset[fit]) || (eq = (*newseg==*(fset[fit])))) {
317 if (eq) {
318 add_new = false;
319 fp_id = fset[fit]->id; //fset[fit]->id;
320 break;
321 }
322 //newseg< fset[fit]
323 --fit;
324 if (fit<0) break; //newseg should be inserted at 0
325 } //while newseg<fset[fit]
326 if (add_new) ++fit;
327 // newseg < fset[fit+1]
328 //we'll insert newseg at position fit+1
329 }
330 else { //newseg >= *fset[fit]
331 bool eq=false;
332 while (*(fset[fit]) < *newseg || (eq = (*newseg==*(fset[fit])))) {
333 if (eq) {
334 add_new = false;
335 fp_id = fset[fit]->id;
336 break;
337 }
338 ++fit;
339 if (fit==fset.Count()) {
340 //newseg should be appended to the list
341 break;
342 }
343 }
344 //if (fit<=fset.Count() && !add_new) {
345 //fset[fit-1] < newseg < fset[fit]
346 //we'll insert newseg at position fit
347 //}
348 }
349 } //check existing set
350 if (add_new) { //did not see this feature before
351 newseg->id = ++f_id;
352 if (fit<0) fit = fset.Add(newseg);
353 else fset.sortInsert(fit, newseg);
354 if (fit<0) {
355 GError("Error: feature %d-%d (%c) already in feature set!\n",
356 newseg->l, newseg->r, newseg->strand);
357 }
358 cache_idx=fit;
359 fp_id=fdata.Add(newseg)+1;
360
361 #ifdef DEBUG
362 if (fdata.Count()!=(int)f_id) {
363 GMessage("Error: fdata.Count=%d, f_id=%d\n", fdata.Count(), f_id);
364 }
365 #endif
366 GASSERT((uint)fdata.Count()==f_id);
367 }
368 else { //feature seen before, update its parent list
369 fdata[fp_id-1]->t_ids.Add(this->t_id);
370 delete newseg;
371 }
372 //fvec.push_back(newseg);
373 GASSERT(fdata[fp_id-1]->id==(uint)fp_id);
374 fvec.Add(fdata[fp_id-1]);
375 }
376
Ballgown_setupFiles(FILE * & f_tdata,FILE * & f_edata,FILE * & f_idata,FILE * & f_e2t,FILE * & f_i2t)377 void Ballgown_setupFiles(FILE* &f_tdata, FILE* &f_edata, FILE* &f_idata,
378 FILE* &f_e2t, FILE* &f_i2t) {
379 if (f_tdata == NULL) {
380 //first call, create the files
381 f_tdata = rc_fwopen("t_data");
382 fprintf(f_tdata, "t_id\tchr\tstrand\tstart\tend\tt_name\tnum_exons\tlength\tgene_id\tgene_name\tcov\tFPKM\n");
383 f_edata = rc_fwopen("e_data");
384 fprintf(f_edata, "e_id\tchr\tstrand\tstart\tend\trcount\tucount\tmrcount\tcov\tcov_sd\tmcov\tmcov_sd\n");
385 f_idata = rc_fwopen("i_data");
386 fprintf(f_idata, "i_id\tchr\tstrand\tstart\tend\trcount\tucount\tmrcount\n");
387 f_e2t = rc_fwopen("e2t");
388 fprintf(f_e2t, "e_id\tt_id\n");
389 f_i2t = rc_fwopen("i2t");
390 fprintf(f_i2t, "i_id\tt_id\n");
391 }
392 }
393