1 /*
2 * abundances.cpp
3 * cufflinks
4 *
5 * Created by Cole Trapnell on 4/27/09.
6 * Copyright 2009 Cole Trapnell. All rights reserved.
7 *
8 * NOTE: some of the code in this file was derived from (Eriksson et al, 2008)
9 */
10
11
12 //#define BOOST_MATH_INSTRUMENT 1
13 //#include <iostream>
14 //#include <iomanip>
15
16 #include "abundances.h"
17 #include <numeric>
18 #include <limits>
19 #include <algorithm>
20
21 #include <boost/numeric/ublas/vector.hpp>
22 #include <boost/numeric/ublas/vector_proxy.hpp>
23 #include <boost/numeric/ublas/matrix.hpp>
24 #include <boost/numeric/ublas/triangular.hpp>
25
26 //#define BOOST_UBLAS_TYPE_CHECK 0
27 #include <boost/numeric/ublas/lu.hpp>
28
29 #include <boost/numeric/ublas/io.hpp>
30
31 #include <boost/random/mersenne_twister.hpp>
32 #include <boost/random/normal_distribution.hpp>
33 #include <boost/random/uniform_int.hpp>
34 #include <boost/random/variate_generator.hpp>
35 #include <boost/math/constants/constants.hpp>
36 #include <boost/math/tools/roots.hpp>
37 #include <complex>
38 #include <boost/math/tools/tuple.hpp>
39
40 #include <boost/math/distributions/chi_squared.hpp>
41
42 #include "filters.h"
43 #include "replicates.h"
44 #include "sampling.h"
45 #include "jensen_shannon.h"
46 #include "rounding.h"
47 #include "clustering.h"
48
49
50 #include "negative_binomial_distribution.h"
51
52
53 #include <Eigen/Dense>
54 //using Eigen::MatrixXd;
55
56 //#ifdef __USE_ISOC99
57 ///* INFINITY and NAN are defined by the ISO C99 standard */
58 //#else
59 //double my_infinity(void) {
60 // double zero = 0.0;
61 // return 1.0/zero;
62 //}
63 //double my_nan(void) {
64 // double zero = 0.0;
65 // return zero/zero;
66 //}
67 //#define INFINITY my_infinity()
68 //#define NAN my_nan()
69 //#endif
70
71 #define EulerGamma 0.57721566490153286060651209008240243104215933593992
72
73 /* The digamma function is the derivative of gammaln.
74
75 Reference:
76 J Bernardo,
77 Psi ( Digamma ) Function,
78 Algorithm AS 103,
79 Applied Statistics,
80 Volume 25, Number 3, pages 315-317, 1976.
81
82 From http://www.psc.edu/~burkardt/src/dirichlet/dirichlet.f
83 (with modifications for negative numbers and extra precision)
84 */
digamma(double x)85 double digamma(double x)
86 {
87 double neginf = -INFINITY;
88 static const double c = 12,
89 d1 = -EulerGamma,
90 d2 = 1.6449340668482264365, /* pi^2/6 */
91 s = 1e-6,
92 s3 = 1./12,
93 s4 = 1./120,
94 s5 = 1./252,
95 s6 = 1./240,
96 s7 = 1./132;
97 /*static const double s8 = 691/32760.0, s9 = 1/12.0, s10 = 3617/8160.0;*/
98 double result;
99 #if 0
100 static double cache_x = 0;
101 static int hits = 0, total = 0;
102 total++;
103 if(x == cache_x) {
104 hits++;
105 }
106 if(total % 1000 == 1) {
107 printf("hits = %d, total = %d, hits/total = %g\n", hits, total,
108 ((double)hits)/total);
109 }
110 cache_x = x;
111 #endif
112 if( x==1.0 )
113 return d1;
114
115 /* Illegal arguments */
116 if((x == neginf) || isnan(x)) {
117 return NAN;
118 }
119 /* Singularities */
120 if((x <= 0) && (floor(x) == x)) {
121 return neginf;
122 }
123 /* Negative values */
124 /* Use the reflection formula (Jeffrey 11.1.6):
125 * digamma(-x) = digamma(x+1) + pi*cot(pi*x)
126 *
127 * This is related to the identity
128 * digamma(-x) = digamma(x+1) - digamma(z) + digamma(1-z)
129 * where z is the fractional part of x
130 * For example:
131 * digamma(-3.1) = 1/3.1 + 1/2.1 + 1/1.1 + 1/0.1 + digamma(1-0.1)
132 * = digamma(4.1) - digamma(0.1) + digamma(1-0.1)
133 * Then we use
134 * digamma(1-z) - digamma(z) = pi*cot(pi*z)
135 */
136 if(x < 0) {
137 return digamma(1-x) + M_PI/tan(-M_PI*x);
138 }
139 /* Use Taylor series if argument <= S */
140 if(x <= s) return d1 - 1/x + d2*x;
141 /* Reduce to digamma(X + N) where (X + N) >= C */
142 result = 0;
143 while(x < c) {
144 result -= 1/x;
145 x++;
146 }
147 /* Use de Moivre's expansion if argument >= C */
148 /* This expansion can be computed in Maple via asympt(Psi(x),x) */
149 if(x >= c) {
150 double r = 1/x, t;
151 result += log(x) - 0.5*r;
152 r *= r;
153 #if 0
154 result -= r * (s3 - r * (s4 - r * (s5 - r * (s6 - r * s7))));
155 #else
156 /* this version for lame compilers */
157 t = (s5 - r * (s6 - r * s7));
158 result -= r * (s3 - r * (s4 - r * t));
159 #endif
160 }
161 return result;
162 }
163
164
165 /* The trigamma function is the derivative of the digamma function.
166
167 Reference:
168
169 B Schneider,
170 Trigamma Function,
171 Algorithm AS 121,
172 Applied Statistics,
173 Volume 27, Number 1, page 97-99, 1978.
174
175 From http://www.psc.edu/~burkardt/src/dirichlet/dirichlet.f
176 (with modification for negative arguments and extra precision)
177 */
trigamma(double x)178 double trigamma(double x)
179 {
180 double neginf = -INFINITY,
181 small = 1e-4,
182 large = 8,
183 c = 1.6449340668482264365, /* pi^2/6 = Zeta(2) */
184 c1 = -2.404113806319188570799476, /* -2 Zeta(3) */
185 b2 = 1./6,
186 b4 = -1./30,
187 b6 = 1./42,
188 b8 = -1./30,
189 b10 = 5./66;
190 double result;
191 /* Illegal arguments */
192 if((x == neginf) || isnan(x)) {
193 return NAN;
194 }
195 /* Singularities */
196 if((x <= 0) && (floor(x) == x)) {
197 return neginf;
198 }
199 /* Negative values */
200 /* Use the derivative of the digamma reflection formula:
201 * -trigamma(-x) = trigamma(x+1) - (pi*csc(pi*x))^2
202 */
203 if(x < 0) {
204 result = M_PI/sin(-M_PI*x);
205 return -trigamma(1-x) + result*result;
206 }
207 /* Use Taylor series if argument <= small */
208 if(x <= small) {
209 return 1/(x*x) + c + c1*x;
210 }
211 result = 0;
212 /* Reduce to trigamma(x+n) where ( X + N ) >= B */
213 while(x < large) {
214 result += 1/(x*x);
215 x++;
216 }
217 /* Apply asymptotic formula when X >= B */
218 /* This expansion can be computed in Maple via asympt(Psi(1,x),x) */
219 if(x >= large) {
220 double r = 1/(x*x), t;
221 #if 0
222 result += 0.5*r + (1 + r*(b2 + r*(b4 + r*(b6 + r*(b8 + r*b10)))))/x;
223 #else
224 t = (b4 + r*(b6 + r*(b8 + r*b10)));
225 result += 0.5*r + (1 + r*(b2 + r*t))/x;
226 #endif
227 }
228 return result;
229 }
230
231 // Returns the log likelihood of the negative binomial with a given r and the mle value of p
negbin_log_likelihood(const vector<double> & samples,long double r,long double p)232 long double negbin_log_likelihood(const vector<double>& samples, long double r, long double p)
233 {
234 if (samples.empty() || r == 0 || p == 0)
235 {
236 return 1.0;
237 }
238
239 long double T1 = 0;
240 for (size_t i = 0; i < samples.size(); ++i)
241 {
242 T1 += lgamma(samples[i] + r);
243 }
244
245 long double T2 = 0;
246 for (size_t i = 0; i < samples.size(); ++i)
247 {
248 T2 += lgamma(samples[i] + 1);
249 }
250
251 long double T3 = samples.size() * lgamma(r);
252 long double T4 = samples.size() * r * logl(1 - p);
253
254 long double T5 = 0;
255 for (size_t i = 0; i < samples.size(); ++i)
256 {
257 T5 += samples[i] * logl(p);
258 }
259
260 assert ((isnan(T1) || isnan(T2) || isnan(T3) || isnan(T4) || isnan(T5)) == false);
261 long double log_likelihood = T1 - T2 - T3 + T4 + T5;
262 return log_likelihood;
263 }
264
265
266 // Returns the log likelihood of the negative binomial with a given r and the mle value of p
poisson_log_likelihood(const vector<double> & samples,long double lambda)267 long double poisson_log_likelihood(const vector<double>& samples, long double lambda)
268 {
269 if (samples.empty() || lambda == 0.0)
270 {
271 return 1.0;
272 }
273
274 long double T1 = 0;
275 for (size_t i = 0; i < samples.size(); ++i)
276 {
277 T1 += samples[i] * logl(lambda);
278 }
279
280 long double T2 = samples.size() * lambda;
281
282 assert ((isnan(T1) || isnan(T2)) == false);
283 long double log_likelihood = T1 - T2;
284 return log_likelihood;
285 }
286
287
288
289 // Returns the log likelihood of the negative binomial with a given r and the mle value of p
negbin_log_likelihood_helper(const vector<double> & samples,long double r)290 long double negbin_log_likelihood_helper(const vector<double>& samples, long double r)
291 {
292 long double p = 0;
293 long double mean_count = accumulate(samples.begin(), samples.end(), 0.0);
294
295 if (r == 0 || mean_count == 0)
296 {
297 fprintf(stderr, "Error: r must be > 0!\n");
298 return 0;
299 }
300
301 if (samples.empty() == false)
302 mean_count /= samples.size();
303
304 p = mean_count / (r + mean_count);
305
306 long double T1 = 0;
307 for (size_t i = 0; i < samples.size(); ++i)
308 {
309 T1 += lgamma(samples[i] + r);
310 }
311
312 long double T2 = 0;
313 for (size_t i = 0; i < samples.size(); ++i)
314 {
315 T2 += lgamma(samples[i] + 1);
316 }
317
318 long double T3 = samples.size() * lgamma(r);
319 long double T4 = samples.size() * r * log(1 - p);
320
321 long double T5 = 0;
322 for (size_t i = 0; i < samples.size(); ++i)
323 {
324 T5 += samples[i] * log(p);
325 }
326
327 assert ((isnan(T1) || isnan(T2) || isnan(T3) || isnan(T4) || isnan(T5)) == false);
328 long double log_likelihood = T1 - T2 - T3 + T4 + T5;
329 return log_likelihood;
330 }
331
332
333 // Returns the log likelihood of the negative binomial with a given r and the mle value of p
negbin_log_likelihood_prime_helper(const vector<double> & samples,long double r)334 long double negbin_log_likelihood_prime_helper(const vector<double>& samples, long double r)
335 {
336 long double T1 = 0;
337 for (size_t i = 0; i < samples.size(); ++i)
338 {
339 T1 += digamma(samples[i] + r);
340 }
341
342 long double T2 = samples.size() * digamma(r);
343
344 long double mean_count = accumulate(samples.begin(), samples.end(), 0.0);
345 if (samples.empty() == false)
346 mean_count /= samples.size();
347
348 long double T3 = samples.size() * log(r / (r + mean_count));
349
350 assert ((isnan(T1) || isnan(T2) || isnan(T3)) == false);
351
352 long double log_likelihood_prime = T1 - T2 + T3;
353 return log_likelihood_prime;
354 }
355
356
357 struct negbin_ll_functor
358 {
negbin_ll_functornegbin_ll_functor359 negbin_ll_functor(const vector<double>& count_samples) : samples(count_samples){}
operator ()negbin_ll_functor360 boost::math::tuple<long double, long double> operator()(long double r)
361 {
362 long double llk = negbin_log_likelihood_helper(samples, r);
363 long double llk_d = negbin_log_likelihood_prime_helper(samples, r);
364 return boost::math::make_tuple(llk, llk_d);
365 }
366 private:
367 const vector<double>& samples;
368 };
369
370
fit_negbin_dist(const vector<double> samples,double & r,double & p)371 bool fit_negbin_dist(const vector<double> samples, double& r, double& p)
372 {
373
374 if (samples.size() == 0)
375 {
376 r = 0;
377 p = 0;
378 return true;
379 }
380
381 long double guess = accumulate(samples.begin(), samples.end(), 0.0);
382 if (samples.empty() == false)
383 guess /= samples.size();
384
385 if (guess == 0)
386 {
387 r = 0;
388 p = 0;
389 return true;
390 }
391
392 long double min = 0;
393 long double max = *std::max_element(samples.begin(), samples.end());
394 max *= max;
395 int digits = std::numeric_limits<double>::digits; // Maximum possible binary digits accuracy for type T.
396
397 boost::uintmax_t max_iters = 50;
398
399 r = boost::math::tools::newton_raphson_iterate(negbin_ll_functor(samples), guess, min, max, digits, max_iters);
400
401 long double mean_count = accumulate(samples.begin(), samples.end(), 0.0);
402 if (samples.empty() == false)
403 mean_count /= samples.size();
404
405 p = mean_count / (r + mean_count);
406
407 //fprintf(stderr, "r = %lg, p = %lg, max_r = %Lg\n", r, p, max);
408
409 if (isnan(r) || isnan(p))
410 {
411 fprintf(stderr, "warning: negative binomial parameters are Nan!\n");
412 return false;
413 }
414 return true;
415 }
416
417
418 //#define USE_LOG_CACHE
419
compute_compatibilities(const vector<boost::shared_ptr<Abundance>> & transcripts,const vector<MateHit> & alignments,vector<vector<char>> & compatibilities)420 void compute_compatibilities(const vector<boost::shared_ptr<Abundance> >& transcripts,
421 const vector<MateHit>& alignments,
422 vector<vector<char> >& compatibilities)
423 {
424 int M = alignments.size();
425 int N = transcripts.size();
426
427 vector<Scaffold> alignment_scaffs;
428
429 for (size_t i = 0; i < alignments.size(); ++i)
430 {
431 const MateHit& hit = alignments[i];
432 alignment_scaffs.push_back(Scaffold(hit));
433 }
434
435 for (int j = 0; j < N; ++j)
436 {
437 boost::shared_ptr<Scaffold> transfrag_j = transcripts[j]->transfrag();
438 for (int i = 0; i < M; ++i)
439 {
440 if (transfrag_j->contains(alignment_scaffs[i])
441 && Scaffold::compatible(*transfrag_j, alignment_scaffs[i]))
442 {
443 compatibilities[j][i] = 1;
444 }
445 }
446 }
447 }
448
AbundanceGroup(const vector<boost::shared_ptr<Abundance>> & abundances,const ublas::matrix<double> & gamma_covariance,const ublas::matrix<double> & iterated_exp_count_covariance,const ublas::matrix<double> & count_covariance,const ublas::matrix<double> & fpkm_covariance,const set<boost::shared_ptr<ReadGroupProperties const>> & rg_props)449 AbundanceGroup::AbundanceGroup(const vector<boost::shared_ptr<Abundance> >& abundances,
450 const ublas::matrix<double>& gamma_covariance,
451 const ublas::matrix<double>& iterated_exp_count_covariance,
452 const ublas::matrix<double>& count_covariance,
453 const ublas::matrix<double>& fpkm_covariance,
454 const set<boost::shared_ptr<ReadGroupProperties const> >& rg_props) :
455 _abundances(abundances),
456 _iterated_exp_count_covariance(iterated_exp_count_covariance),
457 _count_covariance(count_covariance),
458 _fpkm_covariance(fpkm_covariance),
459 _gamma_covariance(gamma_covariance),
460 _salient_frags(0.0),
461 _total_frags(0.0),
462 _read_group_props(rg_props)
463 {
464 // Calling calculate_FPKM_covariance() also estimates cross-replicate
465 // count variances
466 // calculate_FPKM_covariance();
467 double fpkm_var = 0.0;
468 for (size_t i = 0; i < _fpkm_covariance.size1(); ++i)
469 {
470 for (size_t j = 0; j < _fpkm_covariance.size2(); ++j)
471 {
472 assert (!isnan(_fpkm_covariance(i,j)) && !isinf(_fpkm_covariance(i,j)));
473 fpkm_var += _fpkm_covariance(i,j);
474 }
475 }
476
477 _FPKM_variance = fpkm_var;
478
479 if (FPKM() > 0 && final_est_run && library_type != "transfrags")
480 {
481
482 ublas::matrix<double> test = _fpkm_covariance;
483 double ret = cholesky_factorize(test);
484 if (ret != 0 || (_FPKM_variance < 0 && status() == NUMERIC_OK))
485 {
486 //fprintf(stderr, "Warning: total FPKM covariance is not positive definite (ret = %lg)!\n", ret);
487 for (size_t j = 0; j < _abundances.size(); ++j)
488 {
489 _abundances[j]->status(NUMERIC_FAIL);
490 }
491 }
492
493 if(!(FPKM() == 0 || fpkm_var > 0 || status() != NUMERIC_OK))
494 {
495 //cerr << _count_covariance << endl;
496 //cerr << _fpkm_covariance << endl;
497 }
498
499 // assert (FPKM() == 0 || fpkm_var > 0 || status() != NUMERIC_OK);
500 }
501
502 for (size_t i = 0; i < _abundances.size(); ++i)
503 {
504 if (_fpkm_samples.empty())
505 {
506 _fpkm_samples = vector<double>(_abundances[i]->fpkm_samples().size(), 0);
507 _member_fpkm_samples = vector<Eigen::VectorXd>(_abundances[i]->fpkm_samples().size(), Eigen::VectorXd::Zero(_abundances.size()));
508 }
509 for (size_t j = 0; j < _abundances[i]->fpkm_samples().size(); ++j)
510 {
511 _fpkm_samples[j] += _abundances[i]->fpkm_samples()[j];
512 _member_fpkm_samples[j](i) = _abundances[i]->fpkm_samples()[j];
513 }
514 }
515
516 calculate_conf_intervals();
517
518 if (no_js_tests == false && _read_group_props.size() >= min_reps_for_js_test)
519 {
520 calculate_kappas();
521 }
522 }
523
status() const524 AbundanceStatus AbundanceGroup::status() const
525 {
526 bool has_lowdata_member = false;
527 bool has_ok_member = false;
528 BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
529 {
530 if (ab->status() == NUMERIC_FAIL)
531 {
532 return NUMERIC_FAIL;
533 }
534 else if (ab->status() == NUMERIC_LOW_DATA)
535 {
536 has_lowdata_member = true;
537 //return NUMERIC_LOW_DATA;
538 }
539 else if (ab->status() == NUMERIC_HI_DATA)
540 {
541 return NUMERIC_HI_DATA;
542 }
543 else if (ab->status() == NUMERIC_OK)
544 {
545 has_ok_member = true;
546 }
547 }
548
549 if (has_ok_member == false)
550 return NUMERIC_LOW_DATA;
551
552 return NUMERIC_OK;
553 }
554
FPKM_variance(double v)555 void TranscriptAbundance::FPKM_variance(double v)
556 {
557 assert (v >= 0);
558 assert(!isnan(v));
559 _FPKM_variance = v;
560 }
561
has_member_with_status(AbundanceStatus member_status) const562 bool AbundanceGroup::has_member_with_status(AbundanceStatus member_status) const
563 {
564 BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
565 {
566 if (ab->status() == member_status)
567 {
568 return true;
569 }
570 }
571 return false;
572 }
573
num_fragments() const574 double AbundanceGroup::num_fragments() const
575 {
576 double num_f = 0;
577
578 BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
579 {
580 num_f += ab->num_fragments();
581 }
582 assert (!isnan(num_f));
583 return num_f;
584 }
585
num_fragments_by_replicate() const586 CountPerReplicateTable AbundanceGroup::num_fragments_by_replicate() const
587 {
588 CountPerReplicateTable cpr;
589
590 BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
591 {
592 if (cpr.empty())
593 {
594 cpr = ab->num_fragments_by_replicate();
595 }
596 else
597 {
598 CountPerReplicateTable ab_cpr = ab->num_fragments_by_replicate();
599 for (CountPerReplicateTable::const_iterator itr = ab_cpr.begin();
600 itr != ab_cpr.end();
601 ++itr)
602 {
603 CountPerReplicateTable::iterator cpr_itr = cpr.find(itr->first);
604 assert (cpr_itr != cpr.end());
605 cpr_itr->second += itr->second;
606 }
607 }
608 }
609
610 //assert (cpr.empty() != false);
611 return cpr;
612 }
613
FPKM_by_replicate() const614 FPKMPerReplicateTable AbundanceGroup::FPKM_by_replicate() const
615 {
616 FPKMPerReplicateTable fpr;
617
618 BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
619 {
620 FPKMPerReplicateTable ab_fpr = ab->FPKM_by_replicate();
621
622 for (FPKMPerReplicateTable::const_iterator itr = ab_fpr.begin();
623 itr != ab_fpr.end();
624 ++itr)
625 {
626 FPKMPerReplicateTable::iterator fpr_itr = fpr.find(itr->first);
627 if (fpr_itr != fpr.end())
628 fpr_itr->second += itr->second;
629 else
630 fpr[itr->first] = itr->second;
631 }
632 }
633
634 //assert (cpr.empty() != false);
635 return fpr;
636 }
637
status_by_replicate() const638 StatusPerReplicateTable AbundanceGroup::status_by_replicate() const
639 {
640 StatusPerReplicateTable fpr;
641
642 BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
643 {
644 if (fpr.empty())
645 {
646 fpr = ab->status_by_replicate();
647 }
648 else
649 {
650 StatusPerReplicateTable ab_fpr = ab->status_by_replicate();
651 for (StatusPerReplicateTable::const_iterator itr = ab_fpr.begin();
652 itr != ab_fpr.end();
653 ++itr)
654 {
655 StatusPerReplicateTable::iterator fpr_itr = fpr.find(itr->first);
656 assert (fpr_itr != fpr.end());
657
658 AbundanceStatus s = itr->second;
659
660 if (s == NUMERIC_FAIL)
661 {
662 fpr_itr->second = NUMERIC_FAIL;
663 }
664 else if (s == NUMERIC_LOW_DATA && (fpr_itr->second != NUMERIC_HI_DATA && fpr_itr->second != NUMERIC_FAIL && fpr_itr->second != NUMERIC_OK))
665 {
666 fpr_itr->second = NUMERIC_LOW_DATA;
667 }
668 else if (s == NUMERIC_HI_DATA)
669 {
670 fpr_itr->second = NUMERIC_HI_DATA;
671 }
672 else if (s == NUMERIC_OK && (fpr_itr->second != NUMERIC_HI_DATA && fpr_itr->second != NUMERIC_FAIL))
673 {
674 fpr_itr->second = NUMERIC_OK;
675 }
676 }
677 }
678 }
679
680 //assert (cpr.empty() != false);
681 return fpr;
682 }
683
mass_variance() const684 double AbundanceGroup::mass_variance() const
685 {
686 double mass_var = 0;
687
688 BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
689 {
690 mass_var += ab->mass_variance();
691 }
692 return mass_var;
693 }
694
695 // This tracks the final modeled variance in the assigned counts.
num_fragment_var() const696 double AbundanceGroup::num_fragment_var() const
697 {
698 double frag_var = 0.0;
699 for (size_t i = 0; i < _abundances.size(); ++i)
700 {
701 for (size_t j = 0; j < _abundances.size(); ++j)
702 {
703 frag_var += _count_covariance(i,j);
704 }
705 }
706 return frag_var;
707 }
708
709 // This tracks the final modeled variance in the assigned counts.
num_fragment_uncertainty_var() const710 double AbundanceGroup::num_fragment_uncertainty_var() const
711 {
712 double frag_var = 0.0;
713 for (size_t i = 0; i < _abundances.size(); ++i)
714 {
715 for (size_t j = 0; j < _abundances.size(); ++j)
716 {
717 frag_var += _iterated_exp_count_covariance(i,j);
718 }
719 }
720 return frag_var;
721 }
722
FPKM() const723 double AbundanceGroup::FPKM() const
724 {
725 double fpkm = 0;
726
727 BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
728 {
729 fpkm += ab->FPKM();
730 }
731
732 return fpkm;
733 }
734
gamma() const735 double AbundanceGroup::gamma() const
736 {
737 double gamma = 0;
738
739 BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
740 {
741 gamma += ab->gamma();
742 }
743
744 return gamma;
745 }
746
clear_non_serialized_data()747 void TranscriptAbundance::clear_non_serialized_data()
748 {
749 _fpkm_samples.clear();
750 std::vector<double>().swap(_fpkm_samples);
751
752 if (_cond_probs)
753 {
754 _cond_probs->clear();
755 std::vector<double>().swap(*_cond_probs);
756 }
757
758 if (_transfrag)
759 {
760 _transfrag->clear_hits();
761 _transfrag = boost::shared_ptr<Scaffold>();
762 }
763 }
764
clear_non_serialized_data()765 void AbundanceGroup::clear_non_serialized_data()
766 {
767
768 for (size_t i = 0; i < _abundances.size(); ++i)
769 {
770 _abundances[i]->clear_non_serialized_data();
771 }
772
773 _fpkm_samples.clear();
774 std::vector<double>().swap(_fpkm_samples);
775 _member_fpkm_samples.clear();
776 std::vector<Eigen::VectorXd>().swap(_member_fpkm_samples);
777 _assigned_count_samples.clear();
778 std::vector<Eigen::VectorXd>().swap(_assigned_count_samples);
779 }
780
filter_group(const vector<bool> & to_keep,AbundanceGroup & filtered_group) const781 void AbundanceGroup::filter_group(const vector<bool>& to_keep,
782 AbundanceGroup& filtered_group) const
783 {
784 //filtered_group = AbundanceGroup();
785
786 assert (to_keep.size() == _abundances.size());
787
788 size_t num_kept = 0;
789 BOOST_FOREACH(bool keeper, to_keep)
790 {
791 num_kept += keeper;
792 }
793
794 ublas::matrix<double> new_cov = ublas::zero_matrix<double>(num_kept,num_kept);
795 ublas::matrix<double> new_iterated_em_count_cov = ublas::zero_matrix<double>(num_kept,num_kept);
796 ublas::matrix<double> new_count_cov = ublas::zero_matrix<double>(num_kept,num_kept);
797 ublas::matrix<double> new_fpkm_cov = ublas::zero_matrix<double>(num_kept,num_kept);
798
799 vector<boost::shared_ptr<Abundance> > new_ab;
800
801 //vector<vector<double> > new_fpkm_samples(_fpkm_samples.size(), vector<double>(num_kept, 0));
802
803 // rebuild covariance matrix and abundance vector after filtration
804
805 size_t next_cov_row = 0;
806 for (size_t i = 0; i < _abundances.size(); ++i)
807 {
808 if (to_keep[i])
809 {
810 new_ab.push_back(_abundances[i]);
811 size_t next_cov_col = 0;
812 for (size_t j = 0; j < _abundances.size(); ++j)
813 {
814 if (to_keep[j])
815 {
816 new_cov(next_cov_row,next_cov_col) = _gamma_covariance(i, j);
817 new_iterated_em_count_cov(next_cov_row,next_cov_col) = _iterated_exp_count_covariance(i, j);
818 new_count_cov(next_cov_row,next_cov_col) = _count_covariance(i, j);
819 new_fpkm_cov(next_cov_row,next_cov_col) = _fpkm_covariance(i, j);
820 next_cov_col++;
821 }
822 }
823 next_cov_row++;
824 }
825 }
826
827
828 // size_t curr_abundance_idx = 0;
829 // for (size_t i = 0; i < _abundances.size(); ++i)
830 // {
831 // if (to_keep[i])
832 // {
833 // for (size_t j = 0; j < _fpkm_samples.size(); ++j)
834 // {
835 // new_fpkm_samples[j][curr_abundance_idx] = _fpkm_samples[j][i];
836 // }
837 // curr_abundance_idx++;
838 // }
839 //
840 // }
841
842
843
844 filtered_group = AbundanceGroup(new_ab,
845 new_cov,
846 new_iterated_em_count_cov,
847 new_count_cov,
848 new_fpkm_cov,
849 _read_group_props);
850 //assert (filtered_group.FPKM() == 0 || new_fpkm_samples.size() > 0);
851
852 filtered_group.description(_description);
853 }
854
get_transfrags(vector<boost::shared_ptr<Abundance>> & transfrags) const855 void AbundanceGroup::get_transfrags(vector<boost::shared_ptr<Abundance> >& transfrags) const
856 {
857 transfrags.clear();
858 BOOST_FOREACH(boost::shared_ptr<Abundance> pA, _abundances)
859 {
860 boost::shared_ptr<Scaffold> pS = pA->transfrag();
861 if (pS)
862 {
863 transfrags.push_back(pA);
864 }
865 }
866 }
867
gene_id() const868 set<string> AbundanceGroup::gene_id() const
869 {
870 set<string> s;
871
872 BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
873 {
874 set<string> sub = pA->gene_id();
875 s.insert(sub.begin(), sub.end());
876 }
877
878 return s;
879 }
880
gene_name() const881 set<string> AbundanceGroup::gene_name() const
882 {
883 set<string> s;
884
885 BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
886 {
887 set<string> sub = pA->gene_name();
888 s.insert(sub.begin(), sub.end());
889 }
890
891 return s;
892 }
893
894
tss_id() const895 set<string> AbundanceGroup::tss_id() const
896 {
897 set<string> s;
898
899 BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
900 {
901 set<string> sub = pA->tss_id();
902 s.insert(sub.begin(), sub.end());
903 }
904
905 return s;
906 }
907
protein_id() const908 set<string> AbundanceGroup::protein_id() const
909 {
910 set<string> s;
911
912 BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
913 {
914 set<string> sub = pA->protein_id();
915 s.insert(sub.begin(), sub.end());
916 }
917
918 return s;
919 }
920
locus_tag() const921 const string& AbundanceGroup::locus_tag() const
922 {
923 static string default_locus_tag = "-";
924 const string* pLast = NULL;
925 BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
926 {
927 if (pLast)
928 {
929 if (pA->locus_tag() != *pLast)
930 {
931 assert (false);
932 return default_locus_tag;
933 }
934 }
935 pLast = &(pA->locus_tag());
936 }
937 if (pLast)
938 {
939 return *pLast;
940 }
941 //assert (false);
942 return default_locus_tag;
943 }
944
reference_tag() const945 const string& AbundanceGroup::reference_tag() const
946 {
947 static string default_reference_tag = "-";
948 const string* pLast = NULL;
949 BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
950 {
951 if (pLast)
952 {
953 if (pA->reference_tag() != *pLast)
954 {
955 assert (false);
956 return default_reference_tag;
957 }
958 }
959 pLast = &(pA->reference_tag());
960 }
961 if (pLast)
962 {
963 return *pLast;
964 }
965 //assert (false);
966 return default_reference_tag;
967 }
968
effective_length() const969 double AbundanceGroup::effective_length() const
970 {
971 double eff_len = 0.0;
972 double group_fpkm = FPKM();
973 if (group_fpkm == 0)
974 return 0;
975 BOOST_FOREACH (boost::shared_ptr<Abundance> ab, _abundances)
976 {
977 eff_len += (ab->effective_length() * (ab->FPKM() / group_fpkm));
978 }
979 return eff_len;
980 }
981
982 //void AbundanceGroup::collect_read_group_props()
983 //{
984 // size_t M = alignments.size();
985 //
986 // for (size_t i = 0; i < M; ++i)
987 // {
988 // if (!alignments[i].left_alignment())
989 // continue;
990 // boost::shared_ptr<ReadGroupProperties const> rg_props = alignments[i].read_group_props();
991 //
992 // _read_group_props.insert(rg_props;
993 // }
994 //}
995
collect_per_replicate_mass(const vector<MateHit> & alignments,vector<boost::shared_ptr<Abundance>> & transcripts)996 void AbundanceGroup::collect_per_replicate_mass(const vector<MateHit>& alignments,
997 vector<boost::shared_ptr<Abundance> >& transcripts)
998 {
999 size_t M = alignments.size();
1000 size_t N = transcripts.size();
1001
1002 //_count_per_replicate.clear();
1003
1004 for (map<boost::shared_ptr<ReadGroupProperties const>, double>::iterator itr = _count_per_replicate.begin();
1005 itr != _count_per_replicate.end();
1006 ++itr)
1007 {
1008 itr->second = 0.0;
1009 }
1010
1011 if (transcripts.empty())
1012 return;
1013
1014 //map<boost::shared_ptr<ReadGroupProperties const>, double> count_per_replicate;
1015
1016 vector<boost::shared_ptr<Abundance> > mapped_transcripts; // This collects the transcripts that have alignments mapping to them
1017 compute_cond_probs_and_effective_lengths(alignments, transcripts, mapped_transcripts);
1018
1019 for (size_t i = 0; i < M; ++i)
1020 {
1021 if (!alignments[i].left_alignment())
1022 continue;
1023
1024 bool mapped = false;
1025 for (size_t j = 0; j < N; ++j)
1026 {
1027 if (_abundances[j]->cond_probs()->at(i) > 0)
1028 {
1029 mapped = true;
1030 break;
1031 }
1032 }
1033 if (mapped)
1034 {
1035 boost::shared_ptr<ReadGroupProperties const> rg_props = alignments[i].read_group_props();
1036 //assert (parent != NULL);
1037 pair<map<boost::shared_ptr<ReadGroupProperties const>, double>::iterator, bool> inserted;
1038 inserted = _count_per_replicate.insert(make_pair(rg_props, 0.0));
1039 _read_group_props.insert(rg_props);
1040
1041 // these are the *internally* scaled masses.
1042 double more_mass = alignments[i].collapse_mass();
1043 double curr_mass = inserted.first->second;
1044 assert (isnan(more_mass) == false);
1045 inserted.first->second += more_mass;
1046 }
1047 }
1048 }
1049
calculate_locus_scaled_mass_and_variance(const vector<boost::shared_ptr<Abundance>> & transcripts)1050 void AbundanceGroup::calculate_locus_scaled_mass_and_variance(const vector<boost::shared_ptr<Abundance> >& transcripts)
1051 {
1052 size_t N = transcripts.size();
1053
1054 if (transcripts.empty())
1055 return;
1056
1057 double avg_X_g = 0.0;
1058 double avg_mass_fraction = 0.0;
1059
1060 // as long as all the read groups share the same dispersion model (currently true)
1061 // then all the variances from each read group will be the same, so this
1062 // averaging step isn't strictly necessary. Computing it this way is simply
1063 // convenient.
1064 vector<double> avg_mass_variances(N, 0.0);
1065
1066 double external_scale_factor = -1.0;
1067 for (map<boost::shared_ptr<ReadGroupProperties const>, double>::iterator itr = _count_per_replicate.begin();
1068 itr != _count_per_replicate.end();
1069 ++itr)
1070 {
1071 boost::shared_ptr<ReadGroupProperties const> rg_props = itr->first;
1072
1073 if (external_scale_factor < 0)
1074 {
1075 external_scale_factor = rg_props->external_scale_factor();
1076 }
1077 else
1078 {
1079 assert (external_scale_factor == rg_props->external_scale_factor());
1080 }
1081
1082 // Since the _count_per_replicate table stores internally scaled
1083 // fragment counts, we need to scale the fragment counts up so we
1084 // can compare between conditions, rather than just between replicates
1085 // of this condition.
1086 double scaled_mass = itr->second;
1087 double scaled_total_mass = rg_props->normalized_map_mass();
1088 avg_X_g += scaled_mass;
1089 boost::shared_ptr<MassDispersionModel const> disperser = rg_props->mass_dispersion_model();
1090 for (size_t j = 0; j < N; ++j)
1091 {
1092 double scaled_variance;
1093 //scaled_variance = disperser->scale_mass_variance(scaled_mass * _abundances[j]->gamma());
1094 scaled_variance = _abundances[j]->gamma() * disperser->scale_mass_variance(scaled_mass);
1095 avg_mass_variances[j] += scaled_variance;
1096 }
1097 assert (disperser->scale_mass_variance(scaled_mass) != 0 || scaled_mass == 0);
1098
1099 //assert (scaled_total_mass != 0.0);
1100 avg_mass_fraction += (scaled_mass / scaled_total_mass);
1101 }
1102
1103 double num_replicates = _count_per_replicate.size();
1104
1105 if (num_replicates)
1106 {
1107 avg_X_g /= num_replicates;
1108 avg_mass_fraction /= num_replicates;
1109 for (size_t j = 0; j < N; ++j)
1110 {
1111 avg_mass_variances[j] /= num_replicates;
1112 }
1113 }
1114
1115 for (size_t j = 0; j < _abundances.size(); ++j)
1116 {
1117 _abundances[j]->num_fragments(_abundances[j]->gamma() * avg_X_g);
1118
1119 double j_avg_mass_fraction = _abundances[j]->gamma() * avg_mass_fraction;
1120
1121 _abundances[j]->mass_variance(avg_mass_variances[j]);
1122
1123 if (j_avg_mass_fraction > 0)
1124 {
1125 double FPKM = j_avg_mass_fraction * 1000000000/ _abundances[j]->effective_length();
1126 FPKM *= 1.0 / external_scale_factor;
1127
1128 _abundances[j]->FPKM(FPKM);
1129 }
1130 else
1131 {
1132 _abundances[j]->FPKM(0);
1133 _abundances[j]->mass_variance(0);
1134 }
1135 }
1136
1137 }
1138
1139 int total_cond_prob_calls = 0;
collapse_equivalent_hits(const vector<MateHit> & alignments,vector<boost::shared_ptr<Abundance>> & transcripts,vector<MateHit> & nr_alignments,vector<double> & log_conv_factors,bool require_overlap=true)1140 void collapse_equivalent_hits(const vector<MateHit>& alignments,
1141 vector<boost::shared_ptr<Abundance> >& transcripts,
1142 vector<MateHit>& nr_alignments,
1143 vector<double>& log_conv_factors,
1144 bool require_overlap = true)
1145 {
1146 int N = transcripts.size();
1147 int M = alignments.size();
1148
1149 nr_alignments.clear();
1150
1151 vector<vector<char> > compatibilities(N, vector<char>(M,0));
1152 compute_compatibilities(transcripts, alignments, compatibilities);
1153
1154 vector<vector<double> > cached_cond_probs (M, vector<double>());
1155
1156 vector<bool> replaced(M, false);
1157 int num_replaced = 0;
1158
1159 vector<BiasCorrectionHelper> bchs;
1160 for (size_t j = 0; j < N; ++j)
1161 {
1162 bchs.push_back(BiasCorrectionHelper(transcripts[j]->transfrag()));
1163 }
1164
1165 double total_mass_pre_collapse = 0.0;
1166
1167 for(int i = 0 ; i < M; ++i)
1168 {
1169 total_mass_pre_collapse += alignments[i].collapse_mass();
1170
1171 vector<double> cond_probs_i(N,0);
1172 if (replaced[i] == true)
1173 continue;
1174
1175 if (cached_cond_probs[i].empty())
1176 {
1177 for (int j = 0; j < N; ++j)
1178 {
1179 boost::shared_ptr<Scaffold> transfrag = transcripts[j]->transfrag();
1180
1181 if (compatibilities[j][i]==1)
1182 {
1183 total_cond_prob_calls++;
1184 cond_probs_i[j] = bchs[j].get_cond_prob(alignments[i]);
1185 }
1186
1187 }
1188 cached_cond_probs[i] = cond_probs_i;
1189 }
1190 else
1191 {
1192 cond_probs_i = cached_cond_probs[i];
1193 }
1194
1195 MateHit* curr_align = NULL;
1196
1197 nr_alignments.push_back(alignments[i]);
1198 curr_align = &nr_alignments.back();
1199 log_conv_factors.push_back(0);
1200
1201 if (corr_multi && alignments[i].is_multi()) // don't reduce other hits into multihits
1202 continue;
1203
1204 bool seen_olap = false;
1205
1206 for(int k = i + 1 ; k < M; ++k)
1207 {
1208 if (replaced[k] || (corr_multi && alignments[k].is_multi()) || alignments[i].read_group_props() != alignments[k].read_group_props())
1209 continue;
1210 if (require_overlap && !::overlap_in_genome(curr_align->left(), curr_align->right(),
1211 alignments[k].left(), alignments[k].right()))
1212 {
1213 if (seen_olap)
1214 break;
1215 else
1216 continue;
1217 }
1218 else
1219 {
1220 seen_olap = true;
1221 }
1222
1223 vector<double>* cond_probs_k;
1224 double last_cond_prob = -1;
1225
1226 bool equiv = true;
1227
1228 if (cached_cond_probs[k].empty())
1229 {
1230 cached_cond_probs[k] = vector<double>(N, 0.0);
1231 cond_probs_k = &cached_cond_probs[k];
1232 for (int j = 0; j < N; ++j)
1233 {
1234 boost::shared_ptr<Scaffold> transfrag = transcripts[j]->transfrag();
1235
1236 if (compatibilities[j][k]==1)
1237 {
1238 total_cond_prob_calls++;
1239 (*cond_probs_k)[j] = bchs[j].get_cond_prob(alignments[k]);
1240 }
1241 }
1242 //cached_cond_probs[k] = cond_probs_k;
1243 }
1244 else
1245 {
1246 cond_probs_k = &cached_cond_probs[k];
1247 }
1248
1249
1250 for (int j = 0; j < N; ++j)
1251 {
1252 if ((*cond_probs_k)[j] != 0 && cond_probs_i[j] != 0)
1253 {
1254 double cp_j = (*cond_probs_k)[j];
1255 double cp_i = cond_probs_i[j];
1256 double ratio = cp_j / cp_i;
1257 if (last_cond_prob == -1)
1258 {
1259 //assert(ratio < 5);
1260 last_cond_prob = ratio;
1261 }
1262 else
1263 {
1264 if (last_cond_prob != ratio)
1265 //if (abs(last_cond_prob - ratio) > 0.001)
1266 {
1267 equiv = false;
1268 break;
1269 }
1270 }
1271 }
1272 else if ((*cond_probs_k)[j] == 0 && cond_probs_i[j] == 0)
1273 {
1274 // just do nothing in this iter.
1275 // last_cond_prob = 0.0;
1276 }
1277 else
1278 {
1279 equiv = false;
1280 break;
1281 }
1282 }
1283
1284 // cond_prob_i vector is a scalar multiple of cond_prob_k, so we
1285 // can collapse k into i via the mass.
1286 if (equiv)
1287 {
1288 if (last_cond_prob > 0.0)
1289 {
1290 //assert(curr_align->read_group_props() == alignments[k].read_group_props());
1291 assert (last_cond_prob > 0);
1292 //double mass_muliplier = sqrt(last_cond_prob);
1293 double mass_multiplier = log(last_cond_prob);
1294 //assert(last_cond_prob < 5);
1295 assert (!isinf(mass_multiplier) && !isnan(mass_multiplier));
1296 log_conv_factors[log_conv_factors.size() - 1] += mass_multiplier;
1297 replaced[k] = true;
1298 cached_cond_probs[k].clear();
1299 vector<double>(cached_cond_probs[k]).swap(cached_cond_probs[k]);
1300 num_replaced++;
1301 double more_mass = alignments[k].collapse_mass();
1302 curr_align->incr_collapse_mass(more_mass);
1303 }
1304 else
1305 {
1306 replaced[k] = true;
1307 num_replaced++;
1308 cached_cond_probs[k].clear();
1309 vector<double>(cached_cond_probs[k]).swap(cached_cond_probs[k]);
1310 }
1311 }
1312
1313 }
1314 }
1315
1316 N = transcripts.size();
1317 //M = nr_alignments.size();
1318
1319 for (int j = 0; j < N; ++j)
1320 {
1321 boost::shared_ptr<Scaffold> transfrag = transcripts[j]->transfrag();
1322 vector<double>& cond_probs = *(new vector<double>(nr_alignments.size(),0));
1323
1324 BiasCorrectionHelper& bch = bchs[j];
1325
1326 size_t last_cond_prob_idx = 0;
1327 for(int i = 0 ; i < M; ++i)
1328 {
1329 if (!cached_cond_probs[i].empty())
1330 {
1331 if (compatibilities[j][i]==1)
1332 {
1333 assert (cached_cond_probs[i].size() > j);
1334 cond_probs[last_cond_prob_idx] = cached_cond_probs[i][j];
1335 }
1336 last_cond_prob_idx++;
1337 }
1338 }
1339
1340 assert (last_cond_prob_idx == nr_alignments.size());
1341
1342 transcripts[j]->effective_length(bch.get_effective_length());
1343 transcripts[j]->cond_probs(&cond_probs);
1344 }
1345
1346 double total_mass_post_collapse = 0.0;
1347 for(int i = 0 ; i < nr_alignments.size(); ++i)
1348 {
1349 total_mass_post_collapse += nr_alignments[i].collapse_mass();
1350 }
1351
1352 //assert(abs(total_mass_pre_collapse - total_mass_post_collapse) < 1);
1353
1354 if (nr_alignments.size())
1355 {
1356 verbose_msg("\nReduced %lu frags to %lu (%lf percent)\n", alignments.size(), nr_alignments.size(), 100.0 * (1 - nr_alignments.size()/(double)alignments.size()));
1357 }
1358 }
1359
collapse_equivalent_hits_helper(const vector<MateHit> & alignments,vector<boost::shared_ptr<Abundance>> & transcripts,vector<MateHit> & nr_alignments,vector<double> & log_conv_factors)1360 void collapse_equivalent_hits_helper(const vector<MateHit>& alignments,
1361 vector<boost::shared_ptr<Abundance> >& transcripts,
1362 vector<MateHit>& nr_alignments,
1363 vector<double>& log_conv_factors)
1364 {
1365 int N = transcripts.size();
1366 int M = alignments.size();
1367
1368 if (N == 1)
1369 {
1370 nr_alignments = alignments;
1371 log_conv_factors = vector<double>(M, 0.0);
1372 return;
1373 }
1374 // TODO: Remove this short cut after verifying that it doesn't really make sense
1375 // for large bundles. The collapse is almost certainly more efficient.
1376 // If there's a lot of transcripts, just use the old, overlap constrained
1377 // version of the equivalence collapse.
1378 if (N > 24)
1379 {
1380 collapse_equivalent_hits(alignments,
1381 transcripts,
1382 nr_alignments,
1383 log_conv_factors,
1384 true);
1385 return;
1386 }
1387
1388 vector<vector<const MateHit*> > compat_table(1 << N);
1389 vector<vector<char> > compatibilities(N, vector<char>(M,0));
1390 compute_compatibilities(transcripts, alignments, compatibilities);
1391
1392 for(int i = 0; i < M; ++i)
1393 {
1394 size_t compat_mask = 0;
1395 for (int j = 0; j < N; ++j)
1396 {
1397 compat_mask |= ((compatibilities[j][i] !=0) << j);
1398 }
1399 assert (compat_mask < compat_table.size());
1400 compat_table[compat_mask].push_back(&(alignments[i]));
1401 }
1402
1403 for (size_t i = 0; i < compat_table.size(); ++i)
1404 {
1405 vector<MateHit> tmp_hits;
1406 vector<MateHit> tmp_nr_hits;
1407 vector<double> tmp_log_conv_factors;
1408
1409 for (size_t j = 0; j < compat_table[i].size(); ++j)
1410 {
1411 tmp_hits.push_back(*(compat_table[i][j]));
1412 }
1413 if (tmp_hits.empty())
1414 continue;
1415 collapse_equivalent_hits(tmp_hits,
1416 transcripts,
1417 tmp_nr_hits,
1418 tmp_log_conv_factors,
1419 false);
1420 copy(tmp_nr_hits.begin(), tmp_nr_hits.end(), back_inserter(nr_alignments));
1421 copy(tmp_log_conv_factors.begin(), tmp_log_conv_factors.end(), back_inserter(log_conv_factors));
1422 }
1423 }
1424
1425
1426
1427
1428 // Sample from a multivariate normal to estimate the gamma distribution
generate_count_assignment_samples(int num_draws,const vector<double> & count_mean,const ublas::matrix<double> & count_covariance,vector<ublas::vector<double>> & assigned_count_samples)1429 bool generate_count_assignment_samples(int num_draws,
1430 const vector<double>& count_mean,
1431 const ublas::matrix<double>& count_covariance,
1432 vector<ublas::vector<double> >& assigned_count_samples)
1433 {
1434 double total_frag_counts = accumulate(count_mean.begin(), count_mean.end(), 0.0);
1435
1436 if (total_frag_counts == 0)
1437 {
1438 assigned_count_samples = vector<ublas::vector<double> > (num_draws, ublas::zero_vector<double>(count_mean.size()));
1439 return true;
1440 }
1441
1442 boost::mt19937 rng(random_seed);
1443
1444 ublas::vector<double> mle_frag_counts = ublas::zero_vector<double>(count_mean.size());
1445
1446 for (size_t j = 0; j < count_mean.size(); ++j)
1447 {
1448 mle_frag_counts(j) = count_mean[j];
1449 }
1450
1451 // cerr << "********************" << endl;
1452 // cerr << "initial MLE counts: " << mle_frag_counts << endl;
1453
1454 ublas::matrix<double> mle_count_covar = count_covariance;
1455
1456 ublas::matrix<double> epsilon = ublas::zero_matrix<double>(count_mean.size(),count_mean.size());
1457 for (size_t i = 0; i < count_mean.size(); ++i)
1458 {
1459 epsilon(i,i) = 1e-6;
1460 }
1461
1462 mle_count_covar += epsilon; // modify matrix to avoid problems during inverse
1463
1464 double ret = cholesky_factorize(mle_count_covar);
1465 if (ret != 0)
1466 {
1467 fprintf(stderr, "Warning: Iterated expectation count covariance matrix cannot be cholesky factorized!\n");
1468 //fprintf(stderr, "Warning: FPKM covariance is not positive definite (ret = %lg)!\n", ret);
1469 // for (size_t j = 0; j < _abundances.size(); ++j)
1470 // {
1471 // _abundances[j]->status(NUMERIC_FAIL);
1472 // }
1473 return false;
1474 }
1475
1476 // cerr << endl << "Cholesky factored covariance matrix: " << endl;
1477 // for (unsigned i = 0; i < _count_covariance.size1 (); ++ i)
1478 // {
1479 // ublas::matrix_row<ublas::matrix<double> > mr (mle_count_covar, i);
1480 // cerr << i << " : " << _abundances[i]->num_fragments() << " : ";
1481 // std::cerr << i << " : " << mr << std::endl;
1482 // }
1483 //cerr << "======" << endl;
1484
1485 multinormal_generator<double> generator(mle_frag_counts, mle_count_covar);
1486 //vector<Eigen::VectorXd> multinormal_samples;
1487
1488 assigned_count_samples = vector<ublas::vector<double> > (num_draws, ublas::zero_vector<double>(count_mean.size()));
1489
1490 boost::uniform_01<> uniform_dist;
1491 boost::mt19937 null_rng(random_seed);
1492 boost::variate_generator<boost::mt19937&, boost::uniform_01<> > uniform_gen(null_rng, uniform_dist);
1493
1494
1495 for (size_t assign_idx = 0; assign_idx < num_draws; ++assign_idx)
1496 {
1497
1498 boost::numeric::ublas::vector<double> random_count_assign = generator.next_rand();
1499 //cerr << random_count_assign << endl;
1500
1501 for (size_t r_idx = 0; r_idx < random_count_assign.size(); ++r_idx)
1502 {
1503 if (random_count_assign(r_idx) < 0)
1504 random_count_assign(r_idx) = 0;
1505 }
1506
1507 double total_sample_counts = accumulate(random_count_assign.begin(), random_count_assign.end(), 0.0);
1508 if (total_sample_counts > 0)
1509 random_count_assign = total_frag_counts * (random_count_assign / total_sample_counts);
1510 else
1511 random_count_assign = ublas::zero_vector<double>(count_mean.size());
1512 assigned_count_samples[assign_idx] = random_count_assign;
1513 }
1514
1515 return true;
1516 }
1517
calculate_gamma_mle_covariance(const std::map<boost::shared_ptr<ReadGroupProperties const>,boost::shared_ptr<AbundanceGroup>> & ab_group_per_replicate,ublas::vector<double> & estimated_gamma_mean,ublas::matrix<double> & estimated_gamma_covariance)1518 void calculate_gamma_mle_covariance(const std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<AbundanceGroup> >& ab_group_per_replicate,
1519 ublas::vector<double>& estimated_gamma_mean,
1520 ublas::matrix<double>& estimated_gamma_covariance)
1521 {
1522 vector<ublas::vector<double> > all_assigned_count_samples;
1523
1524 if (ab_group_per_replicate.empty())
1525 return;
1526 int num_transcripts = ab_group_per_replicate.begin()->second->abundances().size();
1527 if (num_transcripts == 1)
1528 {
1529 estimated_gamma_mean = ublas::vector<double>(1);
1530 estimated_gamma_mean(0) = 1.0;
1531
1532 estimated_gamma_covariance = ublas::matrix<double>(1,1);
1533 estimated_gamma_covariance(0,0) = 0.0;
1534 return;
1535 }
1536
1537 for(std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<AbundanceGroup> >::const_iterator itr = ab_group_per_replicate.begin();
1538 itr != ab_group_per_replicate.end();
1539 ++itr)
1540 {
1541 ublas::vector<double> count_mean = ublas::zero_vector<double>(itr->second->abundances().size());
1542 for (size_t i = 0; i < itr->second->abundances().size(); ++i)
1543 {
1544 count_mean(i) = itr->second->abundances()[i]->gamma();
1545 }
1546
1547 all_assigned_count_samples.push_back(count_mean);
1548 }
1549
1550 // for (size_t i = 0; i < all_assigned_count_samples.size(); ++i)
1551 // {
1552 // double total = accumulate(all_assigned_count_samples[i].begin(), all_assigned_count_samples[i].end(), 0.0);
1553 // if (total > 0)
1554 // all_assigned_count_samples[i] /= total;
1555 // }
1556
1557 estimated_gamma_mean = ublas::zero_vector<double>(num_transcripts);
1558 estimated_gamma_covariance = ublas::zero_matrix<double>(num_transcripts, num_transcripts);
1559
1560 for (size_t i = 0; i < all_assigned_count_samples.size(); ++i)
1561 {
1562 for (int j = 0; j < all_assigned_count_samples[i].size(); ++j)
1563 {
1564 assert (!isnan(all_assigned_count_samples[i](j)) && !isinf(all_assigned_count_samples[i](j)));
1565 }
1566
1567 estimated_gamma_mean += all_assigned_count_samples[i];
1568 //
1569 //expected_relative_abundances += relative_abundances[i];
1570 }
1571
1572 if (all_assigned_count_samples.size() > 1)
1573 {
1574 estimated_gamma_mean /= (all_assigned_count_samples.size() - 1);
1575 }
1576
1577 for (size_t i = 0; i < num_transcripts; ++i)
1578 {
1579 for (size_t j = 0; j < num_transcripts; ++j)
1580 {
1581 for (size_t k = 0 ; k < all_assigned_count_samples.size(); ++k)
1582 {
1583 double c = (all_assigned_count_samples[k](i) - estimated_gamma_mean(i)) * (all_assigned_count_samples[k](j) - estimated_gamma_mean(j));
1584 estimated_gamma_covariance(i,j) += c;
1585
1586 assert (!isinf(estimated_gamma_covariance(i,j)) && !isnan(estimated_gamma_covariance(i,j)));
1587 }
1588 }
1589 }
1590
1591 if (all_assigned_count_samples.size() > 1)
1592 {
1593 estimated_gamma_covariance /= (all_assigned_count_samples.size() - 1);
1594 }
1595 }
1596
calculate_fragment_assignment_distribution(const std::map<boost::shared_ptr<ReadGroupProperties const>,boost::shared_ptr<const AbundanceGroup>> & ab_group_per_replicate,ublas::vector<double> & estimated_gamma_mean,ublas::matrix<double> & estimated_gamma_covariance,vector<ublas::vector<double>> & all_assigned_count_samples)1597 void calculate_fragment_assignment_distribution(const std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >& ab_group_per_replicate,
1598 ublas::vector<double>& estimated_gamma_mean,
1599 ublas::matrix<double>& estimated_gamma_covariance,
1600 vector<ublas::vector<double> >& all_assigned_count_samples)
1601 {
1602
1603 all_assigned_count_samples.clear();
1604
1605 if (ab_group_per_replicate.empty())
1606 return;
1607 int num_transcripts = ab_group_per_replicate.begin()->second->abundances().size();
1608 if (num_transcripts == 1)
1609 {
1610 estimated_gamma_mean = ublas::vector<double>(1);
1611 estimated_gamma_mean(0) = 1.0;
1612
1613 estimated_gamma_covariance = ublas::matrix<double>(1,1);
1614 estimated_gamma_covariance(0,0) = 0.0;
1615 return;
1616 }
1617
1618 for(std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >::const_iterator itr = ab_group_per_replicate.begin();
1619 itr != ab_group_per_replicate.end();
1620 ++itr)
1621 {
1622 vector<double> count_mean;
1623 for (size_t i = 0; i < itr->second->abundances().size(); ++i)
1624 {
1625 count_mean.push_back(itr->second->abundances()[i]->num_fragments());
1626 }
1627
1628 ublas::matrix<double> count_covariance = itr->second->iterated_count_cov();
1629 ublas::matrix<double> mle_error = ublas::zero_matrix<double>(count_mean.size(), count_mean.size());
1630 boost::shared_ptr<const MleErrorModel> mle_model = itr->first->mle_error_model();
1631 if (mle_model != NULL)
1632 {
1633 for (size_t i = 0; i < count_mean.size(); ++i)
1634 {
1635 double mle_var = mle_model->scale_mle_variance(count_mean[i]);
1636 mle_error(i,i) = max(0.0, mle_var);
1637 }
1638 count_covariance += mle_error;
1639 // cerr << endl << "MLE error correction: " << endl;
1640 // for (unsigned i = 0; i < mle_error.size1 (); ++ i)
1641 // {
1642 // ublas::matrix_row<ublas::matrix<double> > mr (mle_error, i);
1643 // cerr << i << " : " << count_mean[i] << " : ";
1644 // std::cerr << i << " : " << mr << std::endl;
1645 // }
1646 // cerr << "======" << endl;
1647 }
1648
1649
1650 vector<ublas::vector<double> > assigned_count_samples;
1651 generate_count_assignment_samples(num_frag_assignments,
1652 count_mean,
1653 count_covariance,
1654 assigned_count_samples);
1655
1656 all_assigned_count_samples.insert(all_assigned_count_samples.end(), assigned_count_samples.begin(), assigned_count_samples.end());
1657 }
1658
1659 for (size_t i = 0; i < all_assigned_count_samples.size(); ++i)
1660 {
1661 double total = accumulate(all_assigned_count_samples[i].begin(), all_assigned_count_samples[i].end(), 0.0);
1662 if (total > 0)
1663 all_assigned_count_samples[i] /= total;
1664 //cerr << all_assigned_count_samples[i] << endl;
1665 }
1666
1667 estimated_gamma_mean = ublas::zero_vector<double>(num_transcripts);
1668 estimated_gamma_covariance = ublas::zero_matrix<double>(num_transcripts, num_transcripts);
1669
1670 for (size_t i = 0; i < all_assigned_count_samples.size(); ++i)
1671 {
1672 for (int j = 0; j < all_assigned_count_samples[i].size(); ++j)
1673 {
1674 assert (!isnan(all_assigned_count_samples[i](j)) && !isinf(all_assigned_count_samples[i](j)));
1675 }
1676
1677 estimated_gamma_mean += all_assigned_count_samples[i];
1678 //
1679 //expected_relative_abundances += relative_abundances[i];
1680 }
1681
1682 if (all_assigned_count_samples.size() > 1)
1683 {
1684 estimated_gamma_mean /= (all_assigned_count_samples.size() - 1);
1685 }
1686
1687 for (size_t i = 0; i < num_transcripts; ++i)
1688 {
1689 for (size_t j = 0; j < num_transcripts; ++j)
1690 {
1691 for (size_t k = 0 ; k < all_assigned_count_samples.size(); ++k)
1692 {
1693 double c = (all_assigned_count_samples[k](i) - estimated_gamma_mean(i)) * (all_assigned_count_samples[k](j) - estimated_gamma_mean(j));
1694 estimated_gamma_covariance(i,j) += c;
1695
1696 assert (!isinf(estimated_gamma_covariance(i,j)) && !isnan(estimated_gamma_covariance(i,j)));
1697 }
1698 }
1699 }
1700
1701 if (all_assigned_count_samples.size() > 1)
1702 {
1703 estimated_gamma_covariance /= (all_assigned_count_samples.size() - 1);
1704 }
1705
1706 // cerr << "Gamma covariance: " << endl;
1707 // for (unsigned i = 0; i < estimated_gamma_covariance.size1 (); ++ i)
1708 // {
1709 // ublas::matrix_row<ublas::matrix<double> > mr (estimated_gamma_covariance, i);
1710 // cerr << i << " : " << estimated_gamma_mean[i] << " : ";
1711 // std::cerr << i << " : " << mr << std::endl;
1712 // }
1713
1714 }
1715
1716
1717 #define PERFORM_EQUIV_COLLAPSE 1
1718
1719 //void AbundanceGroup::calculate_abundance_for_replicate(const vector<MateHit>& alignments)
1720 //{
1721 //
1722 //}
1723
1724
calculate_abundance_group_variance(const vector<boost::shared_ptr<Abundance>> & transcripts,const std::map<boost::shared_ptr<ReadGroupProperties const>,boost::shared_ptr<const AbundanceGroup>> & ab_group_per_replicate)1725 void AbundanceGroup::calculate_abundance_group_variance(const vector<boost::shared_ptr<Abundance> >& transcripts,
1726 const std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >& ab_group_per_replicate)
1727 {
1728 if (final_est_run) // Only on last estimation run
1729 {
1730 // Simulate NB draws and fragment assignment under uncertainty to sample
1731 // from the BNBs.
1732
1733 ublas::matrix<double> count_assign_covariance;
1734
1735 ublas::vector<double> estimated_gamma_mean;
1736 ublas::matrix<double> estimated_gamma_covariance;
1737
1738 vector<ublas::vector<double> > gamma_samples;
1739 calculate_fragment_assignment_distribution(ab_group_per_replicate,
1740 estimated_gamma_mean,
1741 estimated_gamma_covariance,
1742 gamma_samples);
1743 ublas::vector<double> estimated_count_mean = estimated_gamma_mean * num_fragments();
1744 ublas::matrix<double> estimated_count_covariance = estimated_gamma_covariance * num_fragments() * num_fragments();
1745
1746 //cerr << estimated_count_mean << endl;
1747 //cerr << estimated_count_covariance << endl;
1748 vector<double> frags_per_transcript;
1749 vector<double> frag_variances;
1750
1751 for (size_t i = 0; i < _abundances.size(); ++i)
1752 {
1753 assert (estimated_count_mean.size() > i);
1754 frags_per_transcript.push_back(estimated_count_mean[i]);
1755 //frags_per_transcript.push_back(_abundances[i]->num_fragments());
1756
1757 frag_variances.push_back(_abundances[i]->mass_variance());
1758 }
1759
1760 simulate_count_covariance(frags_per_transcript, frag_variances, estimated_count_covariance, transcripts, _count_covariance, _assigned_count_samples, &gamma_samples);
1761
1762 generate_fpkm_samples();
1763
1764 calculate_FPKM_covariance();
1765
1766 // Derive confidence intervals from the FPKM variance/covariance matrix
1767 calculate_conf_intervals();
1768
1769 // Calculate the inter-group relative abundances and variances
1770 if (no_js_tests == false && _read_group_props.size() >= min_reps_for_js_test)
1771 {
1772 calculate_kappas();
1773 }
1774
1775 }
1776
1777 // //cerr << _count_covariance << endl;
1778 // for (size_t i = 0; i < _abundances.size(); ++i)
1779 // {
1780 // for (size_t j = 0; j < _abundances.size(); ++j)
1781 // {
1782 // if (i != j)
1783 // {
1784 // assert(!isinf(_fpkm_covariance(i,j)) && !isnan(_fpkm_covariance(i,j)));
1785 // if (_abundances[i]->transfrag()->contains(*_abundances[j]->transfrag()) &&
1786 // Scaffold::compatible(*_abundances[i]->transfrag(),*_abundances[j]->transfrag()))
1787 // {
1788 // _abundances[j]->status(NUMERIC_LOW_DATA);
1789 // }
1790 // }
1791 // }
1792 // }
1793
1794 //assert (FPKM() == 0 || _assigned_count_samples.size() > 0);
1795
1796 //fprintf(stderr, "Total calls to get_cond_prob = %d\n", total_cond_prob_calls);
1797
1798 }
1799
calculate_abundance_for_replicate(const vector<MateHit> & alignments,bool perform_collapse)1800 void AbundanceGroup::calculate_abundance_for_replicate(const vector<MateHit>& alignments, bool perform_collapse)
1801 {
1802 vector<boost::shared_ptr<Abundance> > transcripts;
1803
1804 get_transfrags(transcripts);
1805 vector<boost::shared_ptr<Abundance> > mapped_transcripts; // This collects the transcripts that have alignments mapping to them
1806
1807 vector<MateHit> nr_alignments;
1808 vector<double> joint_mle_gammas;
1809
1810 vector<MateHit> non_equiv_alignments;
1811 vector<double> log_conv_factors;
1812
1813 if (final_est_run || corr_multi || corr_bias)
1814 {
1815 compute_cond_probs_and_effective_lengths(alignments, transcripts, mapped_transcripts);
1816 collect_per_replicate_mass(alignments, transcripts);
1817 calculate_gammas(alignments, log_conv_factors, transcripts, mapped_transcripts);
1818 for (size_t i = 0; i < _abundances.size(); ++i)
1819 {
1820 joint_mle_gammas.push_back(_abundances[i]->gamma());
1821 }
1822 }
1823
1824 calculate_iterated_exp_count_covariance(joint_mle_gammas, alignments, transcripts, _iterated_exp_count_covariance);
1825
1826 for (size_t i = 0; i < _abundances.size(); ++i)
1827 {
1828 _abundances[i]->num_fragment_uncertainty_var(_iterated_exp_count_covariance(i,i));
1829 }
1830
1831 if (corr_multi && !final_est_run)
1832 {
1833 update_multi_reads(alignments, mapped_transcripts);
1834 }
1835
1836 // Calculate the initial estimates for the number of fragments originating
1837 // from each transcript, the FPKMs, and set the NB variances
1838 calculate_locus_scaled_mass_and_variance(transcripts);
1839 }
1840
aggregate_replicate_abundances(const map<boost::shared_ptr<ReadGroupProperties const>,boost::shared_ptr<const AbundanceGroup>> & ab_group_per_replicate)1841 void AbundanceGroup::aggregate_replicate_abundances(const map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >& ab_group_per_replicate)
1842 {
1843 for (size_t i = 0; i < _abundances.size(); ++i)
1844 {
1845 CountPerReplicateTable cpr;
1846 FPKMPerReplicateTable fpr;
1847 StatusPerReplicateTable spr;
1848
1849 double avg_fpkm = 0.0;
1850 double avg_num_frags = 0.0;
1851 double avg_gamma = 0.0;
1852 double avg_mass_variance = 0.0;
1853 double avg_effective_length = 0.0;
1854
1855 map<AbundanceStatus, int> status_table;
1856 status_table[NUMERIC_OK] = 0;
1857 status_table[NUMERIC_LOW_DATA] = 0;
1858 status_table[NUMERIC_FAIL] = 0;
1859 status_table[NUMERIC_HI_DATA] = 0;
1860
1861 for (std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >::const_iterator itr = ab_group_per_replicate.begin();
1862 itr != ab_group_per_replicate.end();
1863 ++itr)
1864 {
1865 status_table[itr->second->abundances()[i]->status()] += 1;
1866 }
1867
1868 for (std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >::const_iterator itr = ab_group_per_replicate.begin();
1869 itr != ab_group_per_replicate.end();
1870 ++itr)
1871 {
1872 const vector<boost::shared_ptr<Abundance> >& sc_ab = itr->second->abundances();
1873 assert(itr->second->abundances().size() == _abundances.size());
1874 cpr[itr->first] = itr->second->abundances()[i]->num_fragments();
1875 //fprintf(stderr, "FPKM = %lg\n", itr->second->abundances()[i]->FPKM());
1876 fpr[itr->first] = itr->second->abundances()[i]->FPKM();
1877 spr[itr->first] = itr->second->abundances()[i]->status();
1878
1879 /*
1880 if (itr->second->abundances()[i]->status() == NUMERIC_OK)
1881 {
1882 avg_fpkm += itr->second->abundances()[i]->FPKM() / (double)status_table[NUMERIC_OK];
1883 avg_num_frags += itr->second->abundances()[i]->num_fragments() / (double)status_table[NUMERIC_OK];
1884 avg_gamma += itr->second->abundances()[i]->gamma() / (double)status_table[NUMERIC_OK];
1885 avg_mass_variance += itr->second->abundances()[i]->mass_variance() / (double)status_table[NUMERIC_OK];
1886 avg_effective_length += itr->second->abundances()[i]->effective_length() / (double)status_table[NUMERIC_OK];
1887 }
1888 */
1889 avg_fpkm += itr->second->abundances()[i]->FPKM() / (double)ab_group_per_replicate.size();
1890 avg_num_frags += itr->second->abundances()[i]->num_fragments() / (double)ab_group_per_replicate.size();
1891 avg_gamma += itr->second->abundances()[i]->gamma() / (double)ab_group_per_replicate.size();
1892 avg_mass_variance += itr->second->abundances()[i]->mass_variance() / (double)ab_group_per_replicate.size();
1893 avg_effective_length += itr->second->abundances()[i]->effective_length() / (double)ab_group_per_replicate.size();
1894
1895 }
1896
1897 _abundances[i]->FPKM(avg_fpkm);
1898 _abundances[i]->gamma(avg_gamma);
1899 _abundances[i]->num_fragments(avg_num_frags);
1900 _abundances[i]->mass_variance(avg_mass_variance);
1901 _abundances[i]->effective_length(avg_effective_length);
1902
1903
1904 // if there was at least one good replicate, set the status to OK. The reduced power will be reflected
1905 // during testing
1906 if (status_table[NUMERIC_OK] >= 1)
1907 {
1908 _abundances[i]->status(NUMERIC_OK);
1909 }
1910 else
1911 {
1912 if (status_table[NUMERIC_LOW_DATA] >= status_table[NUMERIC_FAIL])
1913 {
1914 _abundances[i]->status(NUMERIC_LOW_DATA);
1915 }
1916 else if (status_table[NUMERIC_HI_DATA] >= status_table[NUMERIC_FAIL])
1917 {
1918 _abundances[i]->status(NUMERIC_HI_DATA);
1919 }
1920 // else if (status_table[NUMERIC_HI_DATA] >= status_table[NUMERIC_LOW_DATA]) // not sure this ever happens in practice
1921 // {
1922 // _abundances[i]->status(NUMERIC_FAIL);
1923 // }
1924 else
1925 {
1926 _abundances[i]->status(NUMERIC_FAIL);
1927 }
1928 }
1929
1930 _abundances[i]->num_fragments_by_replicate(cpr);
1931 _abundances[i]->FPKM_by_replicate(fpr);
1932 _abundances[i]->status_by_replicate(spr);
1933 }
1934 }
1935
calculate_abundance(const vector<MateHit> & alignments,bool perform_collapse,bool calculate_variance)1936 void AbundanceGroup::calculate_abundance(const vector<MateHit>& alignments, bool perform_collapse, bool calculate_variance)
1937 {
1938 vector<boost::shared_ptr<Abundance> > transcripts;
1939
1940 get_transfrags(transcripts);
1941
1942 map<boost::shared_ptr<ReadGroupProperties const >, vector<MateHit> > alignments_per_read_group;
1943
1944 for(std::set<boost::shared_ptr<ReadGroupProperties const > >::iterator itr = _read_group_props.begin();
1945 itr != _read_group_props.end();
1946 ++itr)
1947 {
1948 alignments_per_read_group[*itr] = vector<MateHit>();
1949
1950 vector<MateHit>& rep_hits = alignments_per_read_group[*itr];
1951
1952 for (size_t i = 0; i < alignments.size(); ++i)
1953 {
1954 if (alignments[i].read_group_props() == *itr)
1955 {
1956 rep_hits.push_back(alignments[i]);
1957 }
1958 }
1959 }
1960
1961 collect_per_replicate_mass(alignments, transcripts);
1962
1963 std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<AbundanceGroup> > ab_group_per_replicate;
1964
1965 calculate_per_replicate_abundances(transcripts,
1966 alignments_per_read_group,
1967 ab_group_per_replicate);
1968
1969 std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> > const_ab_group_per_replicate;
1970
1971 for (std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<AbundanceGroup> >::iterator itr = ab_group_per_replicate.begin();
1972 itr != ab_group_per_replicate.end(); ++itr)
1973 {
1974 const_ab_group_per_replicate[itr->first] = itr->second;
1975 }
1976
1977 aggregate_replicate_abundances(const_ab_group_per_replicate);
1978
1979 if (calculate_variance)
1980 {
1981 calculate_abundance_group_variance(transcripts, const_ab_group_per_replicate);
1982 }
1983 }
1984
update_multi_reads(const vector<MateHit> & alignments,vector<boost::shared_ptr<Abundance>> transcripts)1985 void AbundanceGroup::update_multi_reads(const vector<MateHit>& alignments, vector<boost::shared_ptr<Abundance> > transcripts)
1986 {
1987 size_t M = alignments.size();
1988 size_t N = transcripts.size();
1989
1990 if (transcripts.empty())
1991 return;
1992
1993 for (size_t i = 0; i < M; ++i)
1994 {
1995 if (alignments[i].is_multi())
1996 {
1997 double expr = 0.0;
1998 for (size_t j = 0; j < N; ++j)
1999 {
2000 expr += _abundances[j]->cond_probs()->at(i) * _abundances[j]->FPKM() * _abundances[j]->effective_length();
2001 }
2002 alignments[i].read_group_props()->multi_read_table()->add_expr(alignments[i], expr);
2003 }
2004 }
2005 }
2006
2007
solve_beta(long double A,long double B,long double C)2008 long double solve_beta(long double A, long double B, long double C)
2009 {
2010 long double a = -C/B;
2011 long double b = (A + 4*A*C/(B*B) - (4*C/B));
2012 long double c = -A + B - 5*A*A*C/(B*B*B) + 10*A*C/(B*B) - 5*C/B;
2013 long double d = 2*A*A*A*C/(B*B*B*B) - 6*A*A*C/(B*B*B) + 6*A*C/(B*B) - 2*C/B;
2014 complex<long double> q((3*a*c - b*b)/(a*a*9.0));
2015 complex<long double> r((9.0*a*c*b - 27.0*a*a*d - 2.0*b*b*b)/(a*a*a*54.0));
2016 complex<long double> s1 = std::pow((r + std::sqrt(q*q*q + r*r)),complex<long double>(1/3.0));
2017 complex<long double> s2 = std::pow((r - std::sqrt(q*q*q + r*r)),complex<long double>(1/3.0));
2018 complex<long double> R1 = s1 + s2 - complex<long double>(b/(a*3.0));
2019 complex<long double> R2 = -(s1+s2)/complex<long double>(2.0) - complex<long double>(b/(a*3.0)) + (s1-s2) * complex<long double>(0, sqrtl(3.0)/2.0);
2020 complex<long double> R3 = -(s1+s2)/complex<long double>(2.0) - complex<long double>(b/(a*3.0)) - (s1-s2) * complex<long double>(0, sqrtl(3.0)/2.0);
2021
2022 vector<long double> roots;
2023 if (R1.imag() == 0)
2024 roots.push_back(R1.real());
2025 if (R2.imag() == 0)
2026 roots.push_back(R2.real());
2027 if (R3.imag() == 0)
2028 roots.push_back(R3.real());
2029 sort(roots.begin(), roots.end());
2030
2031 if (roots.empty())
2032 return 0;
2033
2034 long double root = roots.back();
2035 return root;
2036 }
2037
2038
2039 // This function takes the point estimate of the number of fragments from
2040 // a transcript, the iterated expection count matrix, and the locus level
2041 // cross replicate variance, and calculates the transcript-level cross-replicate
2042 // count variance
estimate_count_variance(long double & variance,double gamma_t,double psi_t_count_var,double X_g,double V_X_g_t,double l_t)2043 bool estimate_count_variance(long double& variance,
2044 double gamma_t,
2045 double psi_t_count_var,
2046 double X_g,
2047 double V_X_g_t,
2048 double l_t)
2049 {
2050 if (l_t == 0)
2051 {
2052 return true;
2053 }
2054
2055 long double A = X_g * gamma_t;
2056
2057 long double B = V_X_g_t;
2058
2059 long double C = psi_t_count_var;
2060
2061 variance = 0.0;
2062 bool numeric_ok = true;
2063
2064 long double dispersion = V_X_g_t - (X_g * gamma_t);
2065
2066 if (psi_t_count_var < 0)
2067 {
2068 //fprintf (stderr, "Warning: psi_t is negative! (psi_t = %lf)\n", psi_t);
2069 psi_t_count_var = 0;
2070 }
2071 assert (psi_t_count_var >= 0);
2072
2073 // we multiply A with the constants here to make things work out
2074 // at the end of the routine when we multiply by the square of those
2075 // constants
2076 long double poisson_variance = A + psi_t_count_var;
2077 long double alpha = 0.0;
2078 long double beta = 0.0;
2079 long double bnb_mean = 0.0;
2080 long double r = 0.0;
2081
2082 if (dispersion < -1 || abs(dispersion) < 1)
2083 {
2084 // default to poisson dispersion
2085 variance = poisson_variance;
2086 }
2087 else // there's some detectable overdispersion here, use mixture of negative binomials
2088 {
2089 if (psi_t_count_var < 1)
2090 {
2091 // default to regular negative binomial case.
2092 variance = V_X_g_t;
2093 }
2094 else
2095 {
2096 r = ceil((A * A) / (B - A));
2097
2098 if (r < 0)
2099 {
2100 numeric_ok = false;
2101 }
2102
2103 // exact cubic
2104 beta = solve_beta(A,B,C);
2105 alpha = 1.0 - (A/(A-B)) * beta;
2106 //fprintf(stdout, "%Lg\t%Lg\t%Lg\n",A,alpha,beta);
2107 //if (beta <= 2 || alpha <= 1)
2108 if (alpha <= 2)
2109 {
2110 //printf ("Warning: beta for is %Lg\n", beta);
2111 numeric_ok = false;
2112 variance = V_X_g_t;
2113 }
2114 else
2115 {
2116 bnb_mean = r * beta / (alpha - 1.0);
2117 variance = r * (alpha + r - 1.0) * beta * (alpha + beta - 1);
2118 variance /= (alpha - 2.0) * (alpha - 1.0) * (alpha - 1.0);
2119 }
2120 if (variance < 0)
2121 {
2122 numeric_ok = false;
2123 variance = V_X_g_t;
2124 }
2125
2126 if (variance == 0 && A != 0)
2127 {
2128 variance = poisson_variance;
2129 }
2130
2131 assert (!numeric_ok || variance >= poisson_variance);
2132 assert (!numeric_ok || variance >= V_X_g_t);
2133
2134 if (variance < poisson_variance)
2135 variance = poisson_variance;
2136
2137 if (variance < V_X_g_t)
2138 variance = V_X_g_t;
2139
2140 //assert (abs(FPKM - mean) < 1e-3);
2141 }
2142 }
2143
2144 if (variance < 0)
2145 variance = 0;
2146
2147 variance = ceil(variance);
2148
2149 assert (!numeric_ok || (!isinf(variance) && !isnan(variance)));
2150 assert (!numeric_ok || variance != 0 || A == 0);
2151 return numeric_ok;
2152 }
2153
2154 // This version of the function draws directly from the iterated expectation count covariance matrix
2155 // and treats the betas as normally distributed (which is a good approximation), allowing good capture
2156 // of their covariance without resorting to lots of expensive fragment level sampling.
simulate_count_covariance(const vector<double> & num_fragments,const vector<double> & frag_variances,const ublas::matrix<double> & iterated_exp_count_covariance,const vector<boost::shared_ptr<Abundance>> & transcripts,ublas::matrix<double> & count_covariance,vector<Eigen::VectorXd> & assigned_count_samples,vector<ublas::vector<double>> * gamma_samples=NULL)2157 bool simulate_count_covariance(const vector<double>& num_fragments,
2158 const vector<double>& frag_variances,
2159 const ublas::matrix<double>& iterated_exp_count_covariance,
2160 const vector<boost::shared_ptr<Abundance> >& transcripts,
2161 ublas::matrix<double>& count_covariance,
2162 vector<Eigen::VectorXd>& assigned_count_samples,
2163 vector<ublas::vector<double> >* gamma_samples = NULL)
2164 {
2165 count_covariance = ublas::zero_matrix<double>(transcripts.size(), transcripts.size());
2166
2167 if (frag_variances.size() <= 1)
2168 {
2169 count_covariance(0,0) = frag_variances[0];
2170 //return;
2171 }
2172
2173 double total_frag_counts = accumulate(num_fragments.begin(), num_fragments.end(), 0.0);
2174 double total_var = accumulate(frag_variances.begin(), frag_variances.end(), 0.0);
2175
2176 if (total_frag_counts == 0)
2177 {
2178 count_covariance = ublas::zero_matrix<double>(transcripts.size(), transcripts.size());
2179
2180 size_t num_frags = num_frag_assignments;
2181 if (gamma_samples && gamma_samples->empty() == false)
2182 num_frags = gamma_samples->size();
2183
2184 assigned_count_samples = vector<Eigen::VectorXd> (num_frag_count_draws * num_frags, Eigen::VectorXd::Zero(transcripts.size()));
2185 return true;
2186 }
2187
2188 vector<ublas::vector<double> > assigned_gamma_samples;
2189
2190 boost::mt19937 rng(random_seed);
2191
2192 if (gamma_samples == NULL || gamma_samples->empty())
2193 {
2194 vector<Eigen::VectorXd > generated_counts (num_frag_count_draws, Eigen::VectorXd::Zero(transcripts.size()));
2195 ublas::vector<double> mle_frag_counts = ublas::zero_vector<double>(transcripts.size());
2196
2197 for (size_t j = 0; j < transcripts.size(); ++j)
2198 {
2199 mle_frag_counts(j) = num_fragments[j];
2200 }
2201
2202 // cerr << "********************" << endl;
2203 // cerr << "initial MLE counts: " << mle_frag_counts << endl;
2204
2205 ublas::matrix<double> mle_count_covar = iterated_exp_count_covariance;
2206
2207 ublas::matrix<double> epsilon = ublas::zero_matrix<double>(transcripts.size(),transcripts.size());
2208 for (size_t i = 0; i < transcripts.size(); ++i)
2209 {
2210 epsilon(i,i) = 1e-6;
2211 }
2212
2213 mle_count_covar += epsilon; // modify matrix to avoid problems during inverse
2214
2215 double ret = cholesky_factorize(mle_count_covar);
2216 if (ret != 0)
2217 {
2218 fprintf(stderr, "Warning: Iterated expectation count covariance matrix cannot be cholesky factorized!\n");
2219 //fprintf(stderr, "Warning: FPKM covariance is not positive definite (ret = %lg)!\n", ret);
2220 // for (size_t j = 0; j < _abundances.size(); ++j)
2221 // {
2222 // _abundances[j]->status(NUMERIC_FAIL);
2223 // }
2224 return false;
2225 }
2226
2227 multinormal_generator<double> generator(mle_frag_counts, mle_count_covar);
2228
2229 for (size_t assign_idx = 0; assign_idx < num_frag_assignments; ++assign_idx)
2230 {
2231 boost::numeric::ublas::vector<double> random_count_assign;
2232 double total_sample_counts = 0;
2233 do {
2234 random_count_assign = generator.next_rand();
2235 //cerr << random_count_assign << endl;
2236
2237 for (size_t r_idx = 0; r_idx < random_count_assign.size(); ++r_idx)
2238 {
2239 if (random_count_assign(r_idx) < 0)
2240 random_count_assign(r_idx) = 0;
2241 }
2242
2243 total_sample_counts = accumulate(random_count_assign.begin(), random_count_assign.end(), 0.0);
2244 if (total_sample_counts > 0)
2245 random_count_assign = total_frag_counts * (random_count_assign / total_sample_counts);
2246 else
2247 random_count_assign = boost::numeric::ublas::zero_vector<double>(transcripts.size());
2248 } while(total_sample_counts <= 0);
2249
2250 assigned_gamma_samples.push_back(random_count_assign);
2251 }
2252 }
2253 else
2254 {
2255 for (size_t assign_idx = 0; assign_idx < gamma_samples->size(); ++assign_idx)
2256 {
2257 boost::numeric::ublas::vector<double> random_count_assign = total_frag_counts * (*gamma_samples)[assign_idx];
2258 assigned_gamma_samples.push_back(random_count_assign);
2259 }
2260 }
2261
2262 assigned_count_samples = vector<Eigen::VectorXd> (num_frag_count_draws * assigned_gamma_samples.size(), Eigen::VectorXd::Zero(transcripts.size()));
2263
2264 Eigen::VectorXd expected_generated_counts = Eigen::VectorXd::Zero(transcripts.size());
2265
2266
2267 boost::uniform_01<> uniform_dist;
2268 boost::mt19937 null_rng(random_seed);
2269 boost::variate_generator<boost::mt19937&, boost::uniform_01<> > uniform_gen(null_rng, uniform_dist);
2270
2271
2272 for (size_t assign_idx = 0; assign_idx < assigned_gamma_samples.size(); ++assign_idx)
2273 {
2274 boost::numeric::ublas::vector<double>& random_count_assign = assigned_gamma_samples[assign_idx];
2275
2276 //cerr << random_count_assign << endl;
2277 for (size_t gen_idx = 0; gen_idx < num_frag_count_draws; ++gen_idx)
2278 {
2279 Eigen::VectorXd generated_and_assigned_counts = Eigen::VectorXd::Zero(transcripts.size());
2280
2281 for (size_t j = 0; j < transcripts.size(); ++j)
2282 {
2283 double r = random_count_assign(j);
2284
2285 //r = r < 1 ? 1 : r;
2286
2287 if (r > 0)
2288 {
2289 //double fit_var = _abundances[j]->mass_variance();
2290
2291 double fit_var = total_var * (random_count_assign(j) / total_frag_counts);
2292 double frags = random_count_assign(j);
2293
2294 if (fit_var - frags > 1e-1)
2295 {
2296 r *= r;
2297 double over_disp_scale = fit_var - frags;
2298 r /= over_disp_scale;
2299
2300 double after_decimal = r - (long)r;
2301 //fprintf( stderr, "after decimal = %lg\n", after_decimal);
2302 if (uniform_gen() < after_decimal)
2303 r = floor(r);
2304 else
2305 r = ceil(r);
2306
2307 if (r == 0)
2308 {
2309 generated_and_assigned_counts(j) = 0;
2310 continue;
2311 }
2312
2313 //double p = _abundances[j]->num_fragments() / fit_var;
2314 double p = r / (r + frags);
2315
2316 negative_binomial_distribution<int, double> nb_j(r, p);
2317
2318 generated_and_assigned_counts(j) = nb_j(rng);
2319
2320 }
2321 else
2322 {
2323 double after_decimal = r - (long)r;
2324 //fprintf( stderr, "after decimal = %lg\n", after_decimal);
2325 if (uniform_gen() < after_decimal)
2326 r = floor(r);
2327 else
2328 r = ceil(r);
2329
2330 if (r == 0)
2331 {
2332 generated_and_assigned_counts(j) = 0;
2333 continue;
2334 }
2335
2336 boost::random::poisson_distribution<int, double> nb_j(r);
2337
2338 generated_and_assigned_counts(j) = nb_j(rng);
2339
2340 }
2341 }
2342 else
2343 {
2344 generated_and_assigned_counts(j) = 0;
2345 }
2346 }
2347 //cerr << " assigned count sample: " << generated_and_assigned_counts.transpose() << endl;
2348 assigned_count_samples[assign_idx*num_frag_count_draws + gen_idx] = generated_and_assigned_counts;
2349 }
2350 }
2351
2352 Eigen::VectorXd expected_counts = Eigen::VectorXd::Zero(transcripts.size());
2353 Eigen::VectorXd expected_relative_abundances = Eigen::VectorXd::Zero(transcripts.size());
2354
2355 for (size_t i = 0; i < assigned_count_samples.size(); ++i)
2356 {
2357 for (int j = 0; j < assigned_count_samples[i].size(); ++j)
2358 {
2359 assert (!isnan(assigned_count_samples[i](j)) && !isinf(assigned_count_samples[i](j)));
2360 }
2361
2362
2363 expected_counts += assigned_count_samples[i];
2364 //
2365 //expected_relative_abundances += relative_abundances[i];
2366 }
2367 if (assigned_count_samples.size() > 0)
2368 {
2369 expected_counts /= assigned_count_samples.size();
2370 //expected_generated_counts /= assigned_counts.size();
2371 //expected_relative_abundances /= assigned_counts.size();
2372 }
2373
2374 // if (num_frag_assignments > 0)
2375 // {
2376 // expected_generated_counts /= num_frag_assignments;
2377 // //expected_relative_abundances /= assigned_counts.size();
2378 // }
2379
2380
2381 // cerr << "======" << endl;
2382 // cerr << "updated expected counts #1: " << endl;
2383 // std::cerr << expected_counts << std::endl;
2384 // cerr << "updated expected generated counts #1: " << endl;
2385 // std::cerr << expected_generated_counts << std::endl;
2386 // cerr << "======" << endl;
2387
2388 for (size_t i = 0; i < transcripts.size(); ++i)
2389 {
2390 for (size_t j = 0; j < transcripts.size(); ++j)
2391 {
2392 for (size_t k = 0 ; k < assigned_count_samples.size(); ++k)
2393 {
2394 double c = (assigned_count_samples[k](i) - expected_counts(i)) * (assigned_count_samples[k](j) - expected_counts(j));
2395 count_covariance(i,j) += c;
2396
2397 assert (!isinf(count_covariance(i,j)) && !isnan(count_covariance(i,j)));
2398 //double r = (relative_abundances[k](i) - expected_relative_abundances(i)) * (relative_abundances[k](j) - expected_relative_abundances(j));
2399 //_kappa_covariance(i,j) +=
2400 }
2401 }
2402 }
2403
2404 if (assigned_count_samples.empty() == false)
2405 count_covariance /= assigned_count_samples.size();
2406
2407 for (size_t i = 0; i < transcripts.size(); ++i)
2408 {
2409 // Make sure we aren't below the fit for the single isoform case
2410 if (count_covariance(i,i) < ceil(frag_variances[i]))
2411 {
2412 //fprintf(stderr, "Counts for %d (var = %lg) are underdispersed, reverting to fitted variance model (%lg)\n", i, _count_covariance(i,i), ceil(_abundances[i]->mass_variance()));
2413 count_covariance(i,i) = ceil(frag_variances[i]);
2414 assert (!isinf(count_covariance(i,i)) && !isnan(count_covariance(i,i)));
2415 }
2416
2417 // Check that we aren't below what the Poisson model says we ought to be at
2418 if (count_covariance(i,i) < ceil(num_fragments[i] + iterated_exp_count_covariance(i,i)))
2419 {
2420 //fprintf(stderr, "Counts for %d (var = %lg) are underdispersed, reverting to additive variance model (%lg)\n", i, _count_covariance(i,i), ceil(_abundances[i]->num_fragments() + _iterated_exp_count_covariance(i,i)));
2421 count_covariance(i,i) = ceil(num_fragments[i] + iterated_exp_count_covariance(i,i));
2422 assert (!isinf(count_covariance(i,i)) && !isnan(count_covariance(i,i)));
2423 }
2424 for (size_t j = 0; j < transcripts.size(); ++j)
2425 {
2426 assert (!isinf(count_covariance(i,j)) && !isnan(count_covariance(i,j)));
2427 }
2428 }
2429
2430 return true;
2431 }
2432
2433
generate_fpkm_samples()2434 void AbundanceGroup::generate_fpkm_samples()
2435 {
2436 double external_scale_factor = -1.0;
2437
2438 double M = 0;
2439
2440 for (set<boost::shared_ptr<ReadGroupProperties const> >::iterator itr = _read_group_props.begin();
2441 itr != _read_group_props.end();
2442 ++itr)
2443 {
2444 boost::shared_ptr<ReadGroupProperties const> rg_props = *itr;
2445 M += rg_props->normalized_map_mass();
2446
2447 if (external_scale_factor < 0)
2448 {
2449 external_scale_factor = rg_props->external_scale_factor();
2450 }
2451 else
2452 {
2453 assert (external_scale_factor == rg_props->external_scale_factor());
2454 }
2455 }
2456
2457 M /= _read_group_props.size();
2458
2459 // set up individual vectors of FPKM samples for each abundance object in this group.
2460 vector<vector<double> > fpkm_sample_vectors(_abundances.size());
2461 vector<double> group_sum_fpkm_samples;
2462
2463 vector<double> fpkm_means(_abundances.size(), 0);
2464
2465 for(size_t i = 0; i < _assigned_count_samples.size(); ++i)
2466 {
2467 const Eigen::VectorXd sample = _assigned_count_samples[i];
2468 double total_fpkm = 0;
2469
2470 for (size_t j = 0; j < sample.size(); ++j)
2471 {
2472 double fpkm_sample = sample[j] / M;
2473
2474 if (_abundances[j]->effective_length() > 0)
2475 {
2476 fpkm_sample *= 1000000000;
2477 fpkm_sample /= _abundances[j]->effective_length();
2478 fpkm_sample /= external_scale_factor;
2479 //double standard_fpkm = _abundances[j]->FPKM();
2480 //fprintf(stderr, "count = %lg, fpkm = %lg, standard fpkm = %lg\n", sample[j], fpkm_sample, standard_fpkm);
2481 }
2482 else
2483 {
2484 fpkm_sample = 0;
2485 }
2486
2487 assert (isnan(fpkm_sample) == false);
2488
2489 fpkm_sample_vectors[j].push_back(fpkm_sample);
2490 fpkm_means[j] += fpkm_sample;
2491 total_fpkm += fpkm_sample;
2492 }
2493
2494 if (effective_length() > 0)
2495 {
2496 group_sum_fpkm_samples.push_back(total_fpkm);
2497 }
2498 else
2499 {
2500 group_sum_fpkm_samples.push_back(0);
2501 }
2502 }
2503
2504 for (size_t i = 0; i < _abundances.size(); ++i)
2505 {
2506 fpkm_means[i] /= _assigned_count_samples.size();
2507 _abundances[i]->fpkm_samples(fpkm_sample_vectors[i]);
2508 //fprintf(stderr, "standard fpkm = %lg, sample mean = %lg\n", _abundances[i]->FPKM(), fpkm_means[i]);
2509 }
2510
2511 vector<Eigen::VectorXd> mem_fpkm_samples;
2512 for (size_t i = 0; i < fpkm_sample_vectors.size(); ++i)
2513 {
2514 Eigen::VectorXd sample(fpkm_sample_vectors[i].size());
2515 mem_fpkm_samples.push_back(sample);
2516 for (size_t j = 0; j < fpkm_sample_vectors[i].size(); ++j)
2517 {
2518 mem_fpkm_samples[i](j) = fpkm_sample_vectors[i][j];
2519 }
2520 }
2521
2522 member_fpkm_samples(mem_fpkm_samples);
2523
2524 fpkm_samples(group_sum_fpkm_samples);
2525
2526 assert (group_sum_fpkm_samples.empty() == false);
2527 }
2528
calculate_FPKM_covariance()2529 void AbundanceGroup::calculate_FPKM_covariance()
2530 {
2531 if (effective_length() == 0)
2532 {
2533 _fpkm_covariance = ublas::zero_matrix<double>(_abundances.size(), _abundances.size());
2534 return;
2535 }
2536
2537 //estimate_count_covariance();
2538
2539 long double total_var = 0.0;
2540
2541 double abundance_weighted_length = 0.0;
2542 double total_abundance = 0.0;
2543
2544 double external_scale_factor = -1.0;
2545
2546 double M = 0;
2547
2548 for (map<boost::shared_ptr<ReadGroupProperties const>, double>::iterator itr = _count_per_replicate.begin();
2549 itr != _count_per_replicate.end();
2550 ++itr)
2551 {
2552 boost::shared_ptr<ReadGroupProperties const> rg_props = itr->first;
2553 M += rg_props->normalized_map_mass();
2554
2555 if (external_scale_factor < 0)
2556 {
2557 external_scale_factor = rg_props->external_scale_factor();
2558 }
2559 else
2560 {
2561 assert (external_scale_factor == rg_props->external_scale_factor());
2562 }
2563 }
2564
2565 M /= _count_per_replicate.size();
2566
2567 for (size_t j = 0; j < _abundances.size(); ++j)
2568 {
2569 abundance_weighted_length += _abundances[j]->effective_length() * _abundances[j]->FPKM();
2570 total_abundance += _abundances[j]->FPKM();
2571
2572 for (size_t i = 0; i < _abundances.size(); ++i)
2573 {
2574 _fpkm_covariance(i,j) = _count_covariance(i,j);
2575
2576 // FPKMs need to be on the external scale, so we can compare them
2577 // between conditions. Counts are internally scaled up until here.
2578 _fpkm_covariance(i,j) *= 1.0 / (external_scale_factor * external_scale_factor);
2579 assert (!isinf(_count_covariance(i,j)) && !isnan(_fpkm_covariance(i,j)));
2580
2581 long double length_i = _abundances[i]->effective_length();
2582 long double length_j = _abundances[j]->effective_length();
2583 assert (!isinf(length_i) && !isnan(length_i));
2584 assert (!isinf(length_j) && !isnan(length_j));
2585 if (length_i > 0 && length_j > 0 & M > 0)
2586 {
2587 _fpkm_covariance(i,j) *=
2588 ((1000000000.0 / (length_j *M)))*((1000000000.0 / (length_i *M)));
2589 assert (!isinf(_fpkm_covariance(i,j)) && !isnan(_fpkm_covariance(i,j)));
2590 }
2591 else
2592 {
2593 _fpkm_covariance(i,j) = 0.0;
2594 }
2595
2596 if (i == j)
2597 {
2598 double fpkm = _abundances[i]->FPKM();
2599 double fpkm_var = _fpkm_covariance(i,j);
2600 // if (fpkm > 0 && fpkm_var == 0 )
2601 // {
2602 // cerr << _count_covariance << endl;
2603 // }
2604 assert (fpkm == 0 || fpkm_var > 0 || _abundances[i]->status() != NUMERIC_OK);
2605 assert (!isinf(fpkm_var) && !isnan(fpkm_var));
2606 _abundances[i]->FPKM_variance(fpkm_var);
2607 _abundances[i]->num_fragment_var(_count_covariance(i,j));
2608
2609 }
2610
2611 total_var += _fpkm_covariance(i,j);
2612
2613 assert (!isinf(_fpkm_covariance(i,j)) && !isnan(_fpkm_covariance(i,j)));
2614 }
2615 }
2616
2617 _FPKM_variance = total_var;
2618
2619 if (final_est_run && library_type != "transfrags")
2620 {
2621 ublas::matrix<double> test = _fpkm_covariance;
2622 double ret = cholesky_factorize(test);
2623 if (ret != 0 || (_FPKM_variance < 0 && status() == NUMERIC_OK))
2624 {
2625 //fprintf(stderr, "Warning: FPKM covariance is not positive definite (ret = %lg)!\n", ret);
2626 for (size_t j = 0; j < _abundances.size(); ++j)
2627 {
2628 _abundances[j]->status(NUMERIC_FAIL);
2629 }
2630 }
2631
2632 assert (FPKM() == 0 || _FPKM_variance > 0 || status() != NUMERIC_OK);
2633 }
2634
2635 // cerr << "FPKM covariance: " << endl;
2636 // for (unsigned i = 0; i < _fpkm_covariance.size1 (); ++ i)
2637 // {
2638 // ublas::matrix_row<ublas::matrix<double> > mr (_fpkm_covariance, i);
2639 // cerr << i << " : " << _abundances[i]->FPKM() << " : ";
2640 // std::cerr << i << " : " << mr << std::endl;
2641 // }
2642
2643 assert (!isinf(_FPKM_variance) && !isnan(_FPKM_variance));
2644 }
2645
calculate_conf_intervals()2646 void AbundanceGroup::calculate_conf_intervals()
2647 {
2648 // We only really ever call this function for primary abundance groups
2649 // (i.e. the transcript groups and read bundles with which we calculate
2650 // transcript MLE expression levels. Genes, TSS groups, etc get broken
2651 // off of primary bundles, so we should not call this function on those
2652 // secondary groups. The group splitting code needs to manage the task
2653 // of splitting up all the variout covariance matrices we're calculating
2654 // here.
2655 if (status() == NUMERIC_OK)
2656 {
2657 // This will compute the transcript level FPKM confidence intervals
2658 for (size_t j = 0; j < _abundances.size(); ++j)
2659 {
2660 long double fpkm_var = _abundances[j]->FPKM_variance();
2661 double FPKM_hi = 0.0;
2662 double FPKM_lo = 0.0;
2663 if (_abundances[j]->status() != NUMERIC_FAIL)
2664 {
2665 FPKM_hi = _abundances[j]->FPKM() + 2 * sqrt(fpkm_var);
2666 FPKM_lo = max(0.0, (double)(_abundances[j]->FPKM() - 2 * sqrt(fpkm_var)));
2667 if (!(FPKM_lo <= _abundances[j]->FPKM() && _abundances[j]->FPKM() <= FPKM_hi))
2668 {
2669 //fprintf(stderr, "Error: confidence intervals are illegal! var = %Lg, fpkm = %lg, lo = %lg, hi %lg, status = %d\n", fpkm_var, _abundances[j]->FPKM(), FPKM_lo, FPKM_hi, _abundances[j]->status());
2670 }
2671 assert (FPKM_lo <= _abundances[j]->FPKM() && _abundances[j]->FPKM() <= FPKM_hi);
2672 ConfidenceInterval conf(FPKM_lo, FPKM_hi);
2673 _abundances[j]->FPKM_conf(conf);
2674 //_abundances[j]->FPKM_variance(fpkm_var);
2675 }
2676 else
2677 {
2678 // we shouldn't be able to get here
2679 assert(false);
2680 // TODO: nothing to do here?
2681 }
2682 }
2683
2684 // Now build a confidence interval for the whole abundance group
2685 double group_fpkm = FPKM();
2686 if (group_fpkm > 0.0)
2687 {
2688 double FPKM_hi = FPKM() + 2 * sqrt(FPKM_variance());
2689 double FPKM_lo = max(0.0, FPKM() - 2 * sqrt(FPKM_variance()));
2690 ConfidenceInterval conf(FPKM_lo, FPKM_hi);
2691 FPKM_conf(conf);
2692 }
2693 else
2694 {
2695 _FPKM_variance = 0.0;
2696 ConfidenceInterval conf(0.0, 0.0);
2697 FPKM_conf(conf);
2698 }
2699 }
2700 else
2701 {
2702 double sum_transfrag_FPKM_hi = 0;
2703 double max_fpkm = 0.0;
2704 //double min_fpkm = 1e100;
2705
2706 double avg_X_g = 0.0;
2707 double avg_mass_fraction = 0.0;
2708
2709 int N = _abundances.size();
2710
2711 vector<double> avg_mass_variances(_abundances.size(), 0.0);
2712
2713 for (map<boost::shared_ptr<ReadGroupProperties const>, double>::iterator itr = _count_per_replicate.begin();
2714 itr != _count_per_replicate.end();
2715 ++itr)
2716 {
2717 boost::shared_ptr<ReadGroupProperties const> rg_props = itr->first;
2718 double scaled_mass = itr->second;
2719 double scaled_total_mass = rg_props->normalized_map_mass();
2720 avg_X_g += scaled_mass;
2721 boost::shared_ptr<MassDispersionModel const> disperser = rg_props->mass_dispersion_model();
2722 for (size_t j = 0; j < N; ++j)
2723 {
2724 double scaled_variance;
2725 //scaled_variance = disperser->scale_mass_variance(scaled_mass * _abundances[j]->gamma());
2726 scaled_variance = _abundances[j]->gamma() * disperser->scale_mass_variance(scaled_mass);
2727 avg_mass_variances[j] += scaled_variance;
2728 }
2729 assert (disperser->scale_mass_variance(scaled_mass) != 0 || scaled_mass == 0);
2730
2731 //assert (scaled_total_mass != 0.0);
2732 avg_mass_fraction += (scaled_mass / scaled_total_mass);
2733 }
2734
2735 double num_replicates = _count_per_replicate.size();
2736
2737 if (num_replicates)
2738 {
2739 avg_X_g /= num_replicates;
2740 avg_mass_fraction /= num_replicates;
2741 for (size_t j = 0; j < N; ++j)
2742 {
2743 avg_mass_variances[j] /= num_replicates;
2744 }
2745 }
2746
2747 BOOST_FOREACH(boost::shared_ptr<Abundance> pA, _abundances)
2748 {
2749 double FPKM_hi;
2750 double FPKM_lo;
2751 if (pA->effective_length() > 0 && avg_mass_fraction > 0)
2752 {
2753 double norm_frag_density = 1000000000;
2754 norm_frag_density /= pA->effective_length();
2755
2756 norm_frag_density *= avg_mass_fraction;
2757 double fpkm_high = norm_frag_density;
2758
2759 double total_mass = (num_fragments() / avg_mass_fraction);
2760 double fpkm_constant = 1000000000 / pA->effective_length() / total_mass;
2761 double var_fpkm = mass_variance() * (fpkm_constant * fpkm_constant);
2762
2763 FPKM_hi = fpkm_high + 2 * sqrt(var_fpkm);
2764 FPKM_lo = 0.0;
2765 ConfidenceInterval conf(FPKM_lo, FPKM_hi);
2766 //assert (FPKM_lo <= pA->FPKM() && pA->FPKM() <= FPKM_hi);
2767 pA->FPKM_conf(conf);
2768 //pA->FPKM_variance(var_fpkm);
2769 max_fpkm = max(sum_transfrag_FPKM_hi, FPKM_hi);
2770 }
2771 else
2772 {
2773 FPKM_hi = 0.0;
2774 FPKM_lo = 0.0;
2775 ConfidenceInterval conf(0.0, 0.0);
2776 pA->FPKM_conf(conf);
2777 //pA->FPKM_variance(0.0);
2778 }
2779
2780 }
2781 // In the case of a numeric failure, the groups error bars need to be
2782 // set such that
2783 FPKM_conf(ConfidenceInterval(0.0, max_fpkm + 2 * sqrt(FPKM_variance())));
2784 }
2785 }
2786
2787
2788 /*
2789 void AbundanceGroup::calculate_conf_intervals()
2790 {
2791 for (size_t j = 0; j < _abundances.size(); ++j)
2792 {
2793 double FPKM_hi = 0.0;
2794 double FPKM_lo = numeric_limits<double>::max();
2795 const vector<double> ab_j_samples = _abundances[j]->fpkm_samples();
2796 vector<pair<double, double> > fpkm_samples;
2797 double target_fpkm = _abundances[j]->FPKM();
2798 for (size_t i = 0; i < ab_j_samples.size(); ++i)
2799 fpkm_samples.push_back(make_pair(abs(ab_j_samples[i] - _abundances[j]->FPKM()), ab_j_samples[i]));
2800
2801 sort(fpkm_samples.begin(), fpkm_samples.end());
2802
2803 for (size_t i = 0; i < 0.95*fpkm_samples.size(); ++i)
2804 {
2805 if (FPKM_lo > fpkm_samples[i].second)
2806 FPKM_lo = fpkm_samples[i].second;
2807 if (FPKM_hi < fpkm_samples[i].second)
2808 FPKM_hi = fpkm_samples[i].second;
2809 }
2810
2811 if (fpkm_samples.size() > 0 && FPKM_lo > target_fpkm)
2812 {
2813 fprintf(stderr, "Warning: transcript confidence interval lower bound is > FPKM\n");
2814 }
2815
2816 if (fpkm_samples.size() > 0 && FPKM_hi < target_fpkm)
2817 {
2818 fprintf(stderr, "Warning: transcript confidence interval upper bound is < FPKM\n");
2819 }
2820
2821 ConfidenceInterval conf(FPKM_lo, FPKM_hi);
2822 _abundances[j]->FPKM_conf(conf);
2823 }
2824
2825 double FPKM_hi = 0.0;
2826 double FPKM_lo = numeric_limits<double>::max();
2827 const vector<double>& ab_j_samples = _fpkm_samples;
2828 vector<pair<double, double> > fpkm_samples;
2829 for (size_t i = 0; i < ab_j_samples.size(); ++i)
2830 fpkm_samples.push_back(make_pair(abs(ab_j_samples[i] - FPKM()), ab_j_samples[i]));
2831
2832 sort(fpkm_samples.begin(), fpkm_samples.end());
2833
2834 for (size_t i = 0; i < 0.95*fpkm_samples.size(); ++i)
2835 {
2836 if (FPKM_lo > fpkm_samples[i].second)
2837 FPKM_lo = fpkm_samples[i].second;
2838 if (FPKM_hi < fpkm_samples[i].second)
2839 FPKM_hi = fpkm_samples[i].second;
2840 }
2841
2842 double target_fpkm = FPKM();
2843
2844 if (fpkm_samples.size() > 0 && FPKM_lo > target_fpkm)
2845 {
2846 fprintf(stderr, "Warning: group confidence interval lower bound is > FPKM\n");
2847 }
2848
2849 if (fpkm_samples.size() > 0 && FPKM_hi < target_fpkm)
2850 {
2851 fprintf(stderr, "Warning: group confidence interval upper bound is < FPKM\n");
2852 }
2853
2854 ConfidenceInterval conf(FPKM_lo, FPKM_hi);
2855 FPKM_conf(conf);
2856
2857 return;
2858 }
2859
2860 */
compute_cond_probs_and_effective_lengths(const vector<MateHit> & alignments,vector<boost::shared_ptr<Abundance>> & transcripts,vector<boost::shared_ptr<Abundance>> & mapped_transcripts)2861 void compute_cond_probs_and_effective_lengths(const vector<MateHit>& alignments,
2862 vector<boost::shared_ptr<Abundance> >& transcripts,
2863 vector<boost::shared_ptr<Abundance> >& mapped_transcripts)
2864 {
2865 int N = transcripts.size();
2866 int M = alignments.size();
2867
2868 vector<vector<char> > compatibilities(N, vector<char>(M,0));
2869 compute_compatibilities(transcripts, alignments, compatibilities);
2870
2871 for (int j = 0; j < N; ++j)
2872 {
2873 boost::shared_ptr<Scaffold> transfrag = transcripts[j]->transfrag();
2874 vector<double>& cond_probs = *(new vector<double>(M,0));
2875
2876 BiasCorrectionHelper bch(transfrag);
2877
2878 for(int i = 0 ; i < M; ++i)
2879 {
2880 if (compatibilities[j][i]==1)
2881 {
2882 total_cond_prob_calls++;
2883 cond_probs[i] = bch.get_cond_prob(alignments[i]);
2884 }
2885 }
2886
2887 transcripts[j]->effective_length(bch.get_effective_length());
2888 transcripts[j]->cond_probs(&cond_probs);
2889
2890 if (bch.is_mapped())
2891 mapped_transcripts.push_back(transcripts[j]);
2892 }
2893 }
2894
2895
trace(const ublas::matrix<double> & m)2896 double trace(const ublas::matrix<double>& m)
2897 {
2898
2899 double t = 0.0;
2900 for (size_t i = 0.0; i < m.size1(); ++i)
2901 {
2902 t += m(i,i);
2903 }
2904
2905 return t;
2906 }
2907
2908
2909 // FIXME: This function doesn't really need to copy the transcripts out of
2910 // the cluster. Needs refactoring
calculate_gammas(const vector<MateHit> & nr_alignments,const vector<double> & log_conv_factors,const vector<boost::shared_ptr<Abundance>> & transcripts,const vector<boost::shared_ptr<Abundance>> & mapped_transcripts)2911 bool AbundanceGroup::calculate_gammas(const vector<MateHit>& nr_alignments,
2912 const vector<double>& log_conv_factors,
2913 const vector<boost::shared_ptr<Abundance> >& transcripts,
2914 const vector<boost::shared_ptr<Abundance> >& mapped_transcripts)
2915 {
2916 if (mapped_transcripts.empty())
2917 {
2918 //gammas = vector<double>(transfrags.size(), 0.0);
2919 BOOST_FOREACH (boost::shared_ptr<Abundance> ab, _abundances)
2920 {
2921 ab->gamma(0);
2922 }
2923 _gamma_covariance = ublas::zero_matrix<double>(transcripts.size(),
2924 transcripts.size());
2925 _count_covariance = ublas::zero_matrix<double>(transcripts.size(),
2926 transcripts.size());
2927 _iterated_exp_count_covariance = ublas::zero_matrix<double>(transcripts.size(),
2928 transcripts.size());
2929 _fpkm_covariance = ublas::zero_matrix<double>(transcripts.size(),
2930 transcripts.size());
2931
2932 return true;
2933 }
2934
2935 vector<double> gammas;
2936
2937 verbose_msg( "Calculating intial MLE\n");
2938
2939 AbundanceStatus mle_success = gamma_mle(mapped_transcripts,
2940 nr_alignments,
2941 log_conv_factors,
2942 gammas);
2943
2944 verbose_msg( "Tossing likely garbage isoforms\n");
2945
2946 for (size_t i = 0; i < gammas.size(); ++i)
2947 {
2948 if (isnan(gammas[i]))
2949 {
2950 verbose_msg("Warning: isoform abundance is NaN!\n");
2951 }
2952 }
2953
2954 double locus_mass = 0.0;
2955
2956 for (size_t i = 0; i < nr_alignments.size(); ++i)
2957 {
2958 const MateHit& alignment = nr_alignments[i];
2959 locus_mass += alignment.collapse_mass();
2960 }
2961
2962 vector<boost::shared_ptr<Abundance> > filtered_transcripts = mapped_transcripts;
2963 vector<double> filtered_gammas = gammas;
2964 filter_junk_isoforms(filtered_transcripts, filtered_gammas, mapped_transcripts, locus_mass);
2965
2966 if (filtered_transcripts.empty())
2967 {
2968 //gammas = vector<double>(transfrags.size(), 0.0);
2969 BOOST_FOREACH (boost::shared_ptr<Abundance> ab, _abundances)
2970 {
2971 ab->gamma(0);
2972 }
2973 _gamma_covariance = ublas::zero_matrix<double>(transcripts.size(),
2974 transcripts.size());
2975 _count_covariance = ublas::zero_matrix<double>(transcripts.size(),
2976 transcripts.size());
2977 _iterated_exp_count_covariance = ublas::zero_matrix<double>(transcripts.size(),
2978 transcripts.size());
2979 _fpkm_covariance = ublas::zero_matrix<double>(transcripts.size(),
2980 transcripts.size());
2981
2982 return true;
2983 }
2984
2985 if (filtered_transcripts.size() != mapped_transcripts.size())
2986 {
2987 filtered_gammas.clear();
2988
2989 verbose_msg( "Revising MLE\n");
2990
2991 mle_success = gamma_mle(filtered_transcripts,
2992 nr_alignments,
2993 log_conv_factors,
2994 filtered_gammas);
2995 }
2996
2997 for (size_t i = 0; i < filtered_gammas.size(); ++i)
2998 {
2999 if (isnan(filtered_gammas[i]))
3000 {
3001 verbose_msg("Warning: isoform abundance is NaN!\n");
3002 }
3003 }
3004
3005 size_t N = transcripts.size();
3006
3007 set<boost::shared_ptr<ReadGroupProperties const> > rg_props;
3008 for (size_t i = 0; i < nr_alignments.size(); ++i)
3009 {
3010 rg_props.insert(nr_alignments[i].read_group_props());
3011 }
3012
3013 AbundanceStatus map_success = NUMERIC_OK;
3014 if (final_est_run) // Only on last estimation run.
3015 {
3016 ublas::vector<double> gamma_mle(filtered_gammas.size());
3017 std::copy(filtered_gammas.begin(), filtered_gammas.end(), gamma_mle.begin());
3018 }
3019
3020 for (size_t i = 0; i < filtered_gammas.size(); ++i)
3021 {
3022 if (isnan(gammas[i]))
3023 {
3024 verbose_msg( "Warning: isoform abundance is NaN!\n");
3025 map_success = NUMERIC_FAIL;
3026 }
3027 }
3028
3029 // Now we need to fill in zeros for the isoforms we filtered out of the
3030 // MLE/MAP calculation
3031 vector<double> updated_gammas = vector<double>(N, 0.0);
3032
3033
3034 ublas::matrix<double> updated_gamma_cov;
3035 updated_gamma_cov = ublas::zero_matrix<double>(N, N);
3036
3037 ublas::matrix<double> updated_count_cov;
3038 updated_count_cov = ublas::zero_matrix<double>(N, N);
3039 ublas::matrix<double> updated_iterated_exp_count_cov;
3040 updated_iterated_exp_count_cov = ublas::zero_matrix<double>(N, N);
3041 ublas::matrix<double> updated_fpkm_cov;
3042 updated_fpkm_cov = ublas::zero_matrix<double>(N, N);
3043
3044 size_t cfs = 0;
3045 boost::shared_ptr<Scaffold> curr_filtered_scaff = filtered_transcripts[cfs]->transfrag();
3046 StructurallyEqualScaffolds se;
3047 vector<size_t> scaff_present(N, N);
3048
3049 for (size_t i = 0; i < N; ++i)
3050 {
3051 boost::shared_ptr<Scaffold> scaff_i = transcripts[i]->transfrag();
3052 if (cfs < filtered_transcripts.size())
3053 {
3054 curr_filtered_scaff = filtered_transcripts[cfs]->transfrag();
3055 if (se(scaff_i, curr_filtered_scaff))
3056 {
3057 scaff_present[i] = cfs;
3058 cfs++;
3059 }
3060 }
3061 }
3062
3063 for (size_t i = 0; i < N; ++i)
3064 {
3065 if (scaff_present[i] != N)
3066 {
3067 // then scaffolds[i] has a non-zero abundance, we need to fill
3068 // that in along with relevant cells from the covariance matrix
3069 updated_gammas[i] = filtered_gammas[scaff_present[i]];
3070 //cerr << updated_gammas[i] << ",";
3071
3072 for (size_t j = 0; j < N; ++j)
3073 {
3074 if (scaff_present[j] != N)
3075 {
3076 updated_gamma_cov(i,j) = _gamma_covariance(scaff_present[i],
3077 scaff_present[j]);
3078
3079 updated_iterated_exp_count_cov(i,j) = _iterated_exp_count_covariance(scaff_present[i],
3080 scaff_present[j]);
3081 // Should still be empty but let's do these for consistency:
3082 updated_count_cov(i,j) = _count_covariance(scaff_present[i],
3083 scaff_present[j]);
3084 updated_fpkm_cov(i,j) = _fpkm_covariance(scaff_present[i],
3085 scaff_present[j]);
3086 assert (!isinf(updated_gamma_cov(i,j)));
3087 assert (!isnan(updated_gamma_cov(i,j)));
3088 }
3089 }
3090 }
3091 }
3092
3093 //cerr << endl;
3094
3095 AbundanceStatus numeric_status = NUMERIC_OK;
3096 if (mle_success == NUMERIC_LOW_DATA)
3097 {
3098 numeric_status = NUMERIC_LOW_DATA;
3099 }
3100 else if (mle_success == NUMERIC_FAIL)
3101 {
3102 numeric_status = NUMERIC_FAIL;
3103 }
3104 else
3105 {
3106 assert (mle_success == NUMERIC_OK);
3107 if (map_success == NUMERIC_FAIL)
3108 {
3109 numeric_status = NUMERIC_FAIL;
3110 }
3111 else if (map_success == NUMERIC_LOW_DATA)
3112 {
3113 numeric_status = NUMERIC_LOW_DATA;
3114 }
3115 // otherwise, we're cool.
3116 }
3117
3118
3119
3120 // All scaffolds that go in get abundances, but those that get "filtered"
3121 // from the calculation get zeros.
3122 //gammas = updated_gammas;
3123 for (size_t i = 0; i < _abundances.size(); ++i)
3124 {
3125 _abundances[i]->gamma(updated_gammas[i]);
3126 _abundances[i]->status(numeric_status);
3127 }
3128 _gamma_covariance = updated_gamma_cov;
3129 _count_covariance = updated_count_cov;
3130 _iterated_exp_count_covariance = updated_iterated_exp_count_cov;
3131 _fpkm_covariance = updated_fpkm_cov;
3132
3133 return (status() == NUMERIC_OK);
3134 }
3135
calculate_assignment_probs(const Eigen::VectorXd & alignment_multiplicities,const Eigen::MatrixXd & transcript_cond_probs,const Eigen::VectorXd & proposed_gammas,Eigen::MatrixXd & assignment_probs)3136 void calculate_assignment_probs(const Eigen::VectorXd& alignment_multiplicities,
3137 const Eigen::MatrixXd& transcript_cond_probs,
3138 const Eigen::VectorXd& proposed_gammas,
3139 Eigen::MatrixXd& assignment_probs)
3140 {
3141 // vector<double> u(nr_alignments.size());
3142 // for (size_t i = 0; i < alignment_multiplicity.size(); ++i)
3143 // {
3144 // u[i] = nr_alignments[i].collapse_mass();
3145 // }
3146
3147 //ublas::vector<double> total_cond_prob = ublas::prod(proposed_gammas,transcript_cond_probs);
3148 Eigen::VectorXd total_cond_prob = proposed_gammas.transpose() * transcript_cond_probs ;
3149
3150 // Compute the marginal conditional probability for each fragment against each isoform
3151 //ublas::matrix<double> marg_cond_prob = ublas::zero_matrix<double>(transcript_cond_probs.size1(), transcript_cond_probs.size2());
3152 Eigen::MatrixXd marg_cond_prob(transcript_cond_probs.rows(), transcript_cond_probs.cols());
3153
3154 for (size_t i = 0; i < alignment_multiplicities.size(); ++i)
3155 {
3156 marg_cond_prob.array().col(i) = proposed_gammas.array() * transcript_cond_probs.array().col(i);
3157
3158 if (total_cond_prob(i) > 0)
3159 {
3160 marg_cond_prob.array().col(i) /= total_cond_prob(i);
3161 //column(marg_cond_prob,i) /= total_cond_prob(i);
3162 }
3163 }
3164
3165 assignment_probs = marg_cond_prob;
3166 }
3167
calculate_average_assignment_probs(const Eigen::VectorXd & alignment_multiplicities,const Eigen::MatrixXd & transcript_cond_probs,const Eigen::VectorXd & proposed_gammas,Eigen::MatrixXd & assignment_probs)3168 void calculate_average_assignment_probs(const Eigen::VectorXd& alignment_multiplicities,
3169 const Eigen::MatrixXd& transcript_cond_probs,
3170 const Eigen::VectorXd& proposed_gammas,
3171 Eigen::MatrixXd& assignment_probs)
3172 {
3173 //ublas::vector<double> total_cond_prob = ublas::prod(proposed_gammas,transcript_cond_probs);
3174 Eigen::VectorXd total_cond_prob = proposed_gammas.transpose() * transcript_cond_probs ;
3175
3176 // Compute the marginal conditional probability for each fragment against each isoform
3177 //ublas::matrix<double> marg_cond_prob = ublas::zero_matrix<double>(transcript_cond_probs.size1(), transcript_cond_probs.size2());
3178 Eigen::MatrixXd marg_cond_prob(transcript_cond_probs.rows(), transcript_cond_probs.cols());
3179
3180 for (size_t i = 0; i < alignment_multiplicities.size(); ++i)
3181 {
3182 marg_cond_prob.array().col(i) = proposed_gammas.array() * transcript_cond_probs.array().col(i);
3183
3184 if (total_cond_prob(i) > 0)
3185 {
3186 marg_cond_prob.array().col(i) /= total_cond_prob(i);
3187 //column(marg_cond_prob,i) /= total_cond_prob(i);
3188 }
3189 }
3190
3191 assignment_probs = Eigen::MatrixXd::Zero(proposed_gammas.size(), proposed_gammas.size());
3192
3193 vector<double> num_compatible(proposed_gammas.size(), 0);
3194
3195 for (size_t i = 0; i < alignment_multiplicities.size(); ++i)
3196 {
3197 for (size_t j = 0; j < proposed_gammas.size(); ++j)
3198 {
3199 if (marg_cond_prob(j,i) > 0)
3200 {
3201 assignment_probs.col(j) += marg_cond_prob.col(i) * alignment_multiplicities[i];
3202 num_compatible[j] += alignment_multiplicities[i];
3203 }
3204 }
3205 }
3206
3207 // cerr << "marg cond prob:" << endl;
3208 // cerr << marg_cond_prob << endl;
3209
3210 for (size_t j = 0; j < proposed_gammas.size(); ++j)
3211 {
3212 if (num_compatible[j] > 0)
3213 assignment_probs.col(j) /= num_compatible[j];
3214 }
3215
3216 // cerr << "multiplicities:" << endl;
3217 // cerr << alignment_multiplicities << endl;
3218 // cerr << "avg matrix:" << endl;
3219 // cerr << assignment_probs << endl;
3220 //assignment_probs = marg_cond_prob;
3221 }
3222
3223
3224
calculate_iterated_exp_count_covariance(const vector<double> & gammas,const vector<MateHit> & nr_alignments,const vector<boost::shared_ptr<Abundance>> & transcripts,ublas::matrix<double> & count_covariance)3225 void calculate_iterated_exp_count_covariance(const vector<double>& gammas,
3226 const vector<MateHit>& nr_alignments,
3227 const vector<boost::shared_ptr<Abundance> >& transcripts,
3228 ublas::matrix<double>& count_covariance)
3229 {
3230 // Now calculate the _iterated_exp_count_covariance matrix via iterated expectation
3231 vector<vector<double> > cond_probs(transcripts.size(), vector<double>());
3232 for(size_t j = 0; j < transcripts.size(); ++j)
3233 {
3234 cond_probs[j]= *(transcripts[j]->cond_probs());
3235 }
3236
3237 vector<double> u(nr_alignments.size());
3238 for (size_t i = 0; i < nr_alignments.size(); ++i)
3239 {
3240 u[i] = nr_alignments[i].collapse_mass();
3241 }
3242
3243 count_covariance = ublas::zero_matrix<double>(transcripts.size(), transcripts.size());
3244
3245 ublas::vector<double> total_cond_prob = ublas::zero_vector<double>(nr_alignments.size());
3246
3247 for (size_t i = 0; i < nr_alignments.size(); ++i)
3248 {
3249 // the replicate gamma mles might not be available, if one of the
3250 // replicates returned an error, we'll consider all to be unreliable
3251 for (size_t j = 0; j < cond_probs.size(); ++j)
3252 {
3253 if (cond_probs[j][i] > 0)
3254 {
3255 total_cond_prob(i) += gammas[j] * cond_probs[j][i];
3256 assert (!isnan(total_cond_prob(i) && ! isinf(total_cond_prob(i))));
3257 }
3258 }
3259 }
3260
3261 // Compute the marginal conditional probability for each fragment against each isoform
3262 ublas::matrix<double> marg_cond_prob = ublas::zero_matrix<double>(transcripts.size(), nr_alignments.size());
3263
3264 for (size_t i = 0; i < nr_alignments.size(); ++i)
3265 {
3266 // the replicate gamma mles might not be available, if one of the
3267 // replicates returned an error, we'll consider all to be unreliable
3268 for (size_t j = 0; j < cond_probs.size(); ++j)
3269 {
3270 if (total_cond_prob(i))
3271 {
3272 if (cond_probs[j][i] > 0)
3273 {
3274 marg_cond_prob(j,i) = (gammas[j] * cond_probs[j][i])/total_cond_prob(i);
3275 }
3276 }
3277 }
3278 }
3279
3280 double total_var = 0.0;
3281
3282 ublas::vector<double> expected_counts = ublas::zero_vector<double>(cond_probs.size());
3283
3284 //iterate over fragments
3285 for (size_t i = 0; i < marg_cond_prob.size2(); ++i)
3286 {
3287
3288 // iterate over transcripts
3289 for (size_t j = 0; j < marg_cond_prob.size1(); ++j)
3290 {
3291 double c_j_i = marg_cond_prob(j,i);
3292
3293 expected_counts(j) += u[i] * marg_cond_prob(j,i);
3294
3295 //if (c_j_i == 0 || c_j_i == 1.0)
3296 // continue;
3297
3298 for (size_t k = 0; k < marg_cond_prob.size1(); ++k)
3299 {
3300 double c_k_i = marg_cond_prob(k,i);
3301 //if (c_k_i == 0 || c_k_i == 1.0)
3302 // continue;
3303
3304 if (j == k)
3305 {
3306 if (c_k_i != 0 && c_k_i != 1.0)
3307 {
3308 double var = u[i] * c_k_i * (1.0 - c_k_i);
3309 count_covariance(k,k) += var;
3310 assert (var >= 0);
3311 assert (!isnan(var) && !isinf(var));
3312 total_var += var;
3313 }
3314 }
3315 else
3316 {
3317 if (c_k_i != 0 && c_k_i != 1.0 &&
3318 c_j_i != 0 && c_j_i != 1.0)
3319 {
3320 double covar = -u[i] * c_k_i * c_j_i;
3321 assert (covar <= 0);
3322 assert (!isnan(covar) && !isinf(covar));
3323 count_covariance(k,j) += covar;
3324 }
3325 }
3326 }
3327 }
3328
3329 }
3330 //cerr << "expected counts" << endl;
3331 //cerr << expected_counts << endl;
3332
3333 // // take care of little rounding errors
3334 // for (size_t i = 0; i < _iterated_exp_count_covariance.size1(); ++i)
3335 // {
3336 // for (size_t j = 0; j < _iterated_exp_count_covariance.size2(); ++j)
3337 // {
3338 // if (i == j)
3339 // {
3340 // double c = _iterated_exp_count_covariance(i,j);
3341 // if (c < 0)
3342 // _iterated_exp_count_covariance(i,j) = 0;
3343 // //assert(c >= 0);
3344 // }
3345 // else
3346 // {
3347 // double c = _iterated_exp_count_covariance(i,j);
3348 // if (c > 0)
3349 // _iterated_exp_count_covariance(i,j) = 0;
3350 // //assert(c <= 0);
3351 // }
3352 // }
3353 // _abundances[i]->num_fragment_uncertainty_var(_iterated_exp_count_covariance(i,i));
3354 // }
3355 }
3356
calculate_kappas()3357 void AbundanceGroup::calculate_kappas()
3358 {
3359 size_t num_members = _abundances.size();
3360 _kappa_covariance = ublas::matrix<double>(num_members,
3361 num_members);
3362 //cerr << gamma_cov <<endl;
3363
3364 assert (_gamma_covariance.size1() == num_members);
3365 assert (_gamma_covariance.size2() == num_members);
3366
3367 //tss_group.sub_quants = vector<QuantGroup>(isos_in_tss);
3368
3369 double S_FPKM = 0.0;
3370 BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
3371 {
3372 if (pA->effective_length() > 0)
3373 {
3374 S_FPKM += pA->FPKM();
3375 }
3376 }
3377
3378 //fprintf (stderr, "*********\n");
3379 BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
3380 {
3381 if (S_FPKM > 0)
3382 {
3383 pA->kappa(pA->FPKM() / S_FPKM);
3384 }
3385 else
3386 {
3387 pA->kappa(0);
3388 }
3389 }
3390
3391 for (size_t k = 0; k < num_members; ++k)
3392 {
3393 for (size_t m = 0; m < num_members; ++m)
3394 {
3395 double L = _abundances[k]->effective_length() *
3396 _abundances[m]->effective_length();
3397 if (L == 0.0)
3398 {
3399 _kappa_covariance(k,m) = 0.0;
3400 }
3401 else if (m == k)
3402 {
3403 // Use the modeled count variance here instead
3404 double kappa_var;
3405 if (S_FPKM)
3406 {
3407 kappa_var = _fpkm_covariance(k,k) / (S_FPKM * S_FPKM);
3408 }
3409 else
3410 {
3411 kappa_var = 0.0;
3412 }
3413
3414 if (isnan(kappa_var) || isinf(kappa_var)) // to protect against underflow
3415 kappa_var = 0;
3416 _kappa_covariance(k,m) = kappa_var;
3417 }
3418 else
3419 {
3420 double kappa_covar;
3421 if (S_FPKM)
3422 {
3423 kappa_covar = _fpkm_covariance(k,m) / (S_FPKM * S_FPKM);
3424 }
3425 else
3426 {
3427 kappa_covar = 0.0;
3428 }
3429 _kappa_covariance(k,m) = kappa_covar;
3430 }
3431 }
3432 }
3433
3434
3435 // size_t num_members = _abundances.size();
3436 // _kappa_covariance = ublas::zero_matrix<double>(num_members,
3437 // num_members);
3438 // if (FPKM() == 0)
3439 // {
3440 // for (size_t k = 0; k < num_members; ++k)
3441 // {
3442 // _abundances[k]->kappa(0);
3443 // }
3444 // return;
3445 // }
3446 //
3447
3448 // FIXME:
3449 size_t num_count_draws = _assigned_count_samples.size();
3450 vector<Eigen::VectorXd > relative_abundances (num_count_draws, Eigen::VectorXd::Zero(num_members));
3451
3452 // We'll use the effective lengths to transform counts into relative abundances,
3453 // and then use that to calculate the kappa variances and covariances.
3454 Eigen::VectorXd effective_length_recip = Eigen::VectorXd::Zero(_abundances.size());
3455 for (size_t i = 0; i < _abundances.size(); ++i)
3456 {
3457 if (_abundances[i]->effective_length() > 0)
3458 effective_length_recip(i) = 1.0 / _abundances[i]->effective_length();
3459 }
3460
3461 for (size_t i = 0; i < num_count_draws; ++i)
3462 {
3463
3464 Eigen::VectorXd relative_abundance = effective_length_recip.array() * _assigned_count_samples[i].array();
3465 double total = relative_abundance.sum();
3466 if (total > 0)
3467 relative_abundance /= total;
3468 //cerr << relative_abundance.transpose() << endl;
3469 relative_abundances[i] = relative_abundance;
3470 }
3471 }
3472
get_alignments_from_scaffolds(const vector<boost::shared_ptr<Abundance>> & abundances,vector<MateHit> & alignments)3473 void get_alignments_from_scaffolds(const vector<boost::shared_ptr<Abundance> >& abundances,
3474 vector<MateHit>& alignments)
3475 {
3476 set<const MateHit*> hits_in_gene_set;
3477
3478 BOOST_FOREACH(boost::shared_ptr<Abundance> pA, abundances)
3479 {
3480 boost::shared_ptr<Scaffold> pS = pA->transfrag();
3481 assert (pS);
3482 hits_in_gene_set.insert(pS->mate_hits().begin(),
3483 pS->mate_hits().end());
3484 }
3485
3486 for(set<const MateHit*>::iterator itr = hits_in_gene_set.begin();
3487 itr != hits_in_gene_set.end();
3488 ++itr)
3489 {
3490 alignments.push_back(**itr);
3491 }
3492
3493 sort(alignments.begin(), alignments.end(), mate_hit_lt);
3494 }
3495
3496 //void round(Eigen::VectorXd& p) {
3497 //
3498 // double KILLP = 0; // kill all probabilities below this
3499 //
3500 // for (size_t i = 0; i < p.size(); ++i) {
3501 // if (p(i) < KILLP)
3502 // p(i) = 0;
3503 // }
3504 //}
3505
Estep(int N,int M,const Eigen::VectorXd & p,Eigen::MatrixXd & U,const Eigen::MatrixXd & cond_probs,const Eigen::VectorXd & alignment_multiplicities)3506 void Estep (int N,
3507 int M,
3508 const Eigen::VectorXd& p,
3509 Eigen::MatrixXd& U,
3510 const Eigen::MatrixXd& cond_probs,
3511 const Eigen::VectorXd& alignment_multiplicities) {
3512 // given p, fills U with expected frequencies
3513 int i,j;
3514
3515 // Eigen::VectorXd frag_prob_sums = Eigen::VectorXd::Zero(M);
3516 //
3517 // for (j = 0; j < N; ++j)
3518 // {
3519 // for (i = 0; i < M; ++i)
3520 // {
3521 // frag_prob_sums(i) += cond_probs(j,i) * p(j);
3522 // }
3523 // }
3524 //
3525 // for (i = 0; i < M; ++i)
3526 // {
3527 // frag_prob_sums(i) = frag_prob_sums(i) ? (1.0 / frag_prob_sums(i)) : 0.0;
3528 // }
3529 //
3530 // for (j = 0; j < N; ++j)
3531 // {
3532 // for (i = 0; i < M; ++i)
3533 // {
3534 // double ProbY = frag_prob_sums(i);
3535 // double exp_i_j = alignment_multiplicities(i) * cond_probs(j,i) * p(j) * ProbY;
3536 // U(j,i) = exp_i_j;
3537 // }
3538 // }
3539
3540 Eigen::VectorXd frag_prob_sums;// = Eigen::VectorXd::Zero(M);
3541
3542 // for (j = 0; j < N; ++j)
3543 // {
3544 // for (i = 0; i < M; ++i)
3545 // {
3546 // frag_prob_sums(i) += cond_probs(j,i) * p(j);
3547 // assert (!isnan(cond_probs(j,i)) && !isinf(cond_probs(j,i)));
3548 // }
3549 // }
3550
3551 frag_prob_sums = p.transpose() * cond_probs;
3552
3553 for (i = 0; i < M; ++i)
3554 {
3555 assert (!isnan(frag_prob_sums(i)) && !isinf(frag_prob_sums(i)));
3556 //double x = frag_prob_sums(i);
3557 frag_prob_sums(i) = frag_prob_sums(i) ? (1.0 / frag_prob_sums(i)) : 0.0;
3558 if (isnan(frag_prob_sums(i)) || isinf(frag_prob_sums(i)))
3559 {
3560 frag_prob_sums(i) = 0; // protect against overflow/underflow
3561 }
3562 }
3563
3564 Eigen::VectorXd x = frag_prob_sums.array() * alignment_multiplicities.array();
3565 // Eigen::MatrixXd y = p.transpose() * cond_probs;
3566 // Eigen::MatrixXd UU = y.array() * x.array();
3567
3568 // cerr << UU <<endl;
3569 // for (j = 0; j < N; ++j)
3570 // {
3571 // for (i = 0; i < M; ++i)
3572 // {
3573 // //double ProbY = frag_prob_sums(i);
3574 // double exp_i_j = cond_probs(j,i) * p(j) * x(i);
3575 // U(j,i) = exp_i_j;
3576 // }
3577 // }
3578
3579 U = Eigen::MatrixXd(N,M);
3580
3581 for (i = 0; i < M; ++i)
3582 {
3583 //double ProbY = frag_prob_sums(i);
3584 //double exp_i_j = cond_probs(j,i) * p(j) * x(i);
3585 //U.r = exp_i_j;
3586 U.col(i) = cond_probs.col(i).array() * p.array();
3587 }
3588 for (j = 0; j < N; ++j)
3589 {
3590 U.array().row(j) *= x.transpose().array();
3591 }
3592
3593 //cerr << UU << endl;
3594 //cerr << "==========" << endl;
3595 //cerr << U << endl;
3596 //cerr << "**********" << endl;
3597 //Eigen::ArrayXXd a_cond_probs = cond_probs.array();
3598 //Eigen::ArrayXXd a_p = p.array();
3599 //Eigen::ArrayXXd a_U = a_p * a_cond_probs.colwise();
3600 //U = (.colwise() * p.transpose().array()).matrix();
3601 //U = cond_probs.colwise() * p.array();
3602 //U = alignment_multiplicities.array() * p.array() * cond_probs.array() * frag_prob_sums.array();
3603 //U = (frag_prob_sums.array() * alignment_multiplicities.array()).rowwise() * p.array().colwise() * cond_probs.array().colwise();
3604 }
3605
3606
Mstep(int N,int M,Eigen::VectorXd & p,const Eigen::MatrixXd & U)3607 void Mstep (int N,
3608 int M,
3609 Eigen::VectorXd& p,
3610 const Eigen::MatrixXd& U)
3611 {
3612 Eigen::VectorXd v;// = Eigen::VectorXd::Zero(N);
3613
3614 double m = 0;
3615
3616 v = U.rowwise().sum();
3617 m = v.colwise().sum()(0);
3618
3619 if (m)
3620 {
3621 p = v / m;
3622 }
3623 else
3624 {
3625 p = Eigen::VectorXd::Zero(N);
3626 }
3627 }
3628
3629
logLike(int N,int M,Eigen::VectorXd & p,const Eigen::MatrixXd & cond_prob,const Eigen::VectorXd & alignment_multiplicities,const vector<double> & log_conv_factors)3630 double logLike (int N,
3631 int M,
3632 Eigen::VectorXd& p,
3633 const Eigen::MatrixXd& cond_prob,
3634 const Eigen::VectorXd& alignment_multiplicities,
3635 const vector<double>& log_conv_factors) {
3636 //int i,j;
3637
3638 double ell = accumulate(log_conv_factors.begin(), log_conv_factors.end(), 0.0);
3639 //double Prob_Y;
3640 // for (int i= 0; i < M; i++) {
3641 // Prob_Y = 0;
3642 // for (int j= 0; j < N; j++) {
3643 // Prob_Y += cond_prob(j,i) * p(j);
3644 // }
3645 // if (Prob_Y > 0) {
3646 // ell += (alignment_multiplicities(i) * log(Prob_Y));
3647 // }
3648 // }
3649 Eigen::VectorXd Prob_Ys = p.transpose() * cond_prob;
3650 Eigen::VectorXd log_Prob_Ys = Prob_Ys.array().log();
3651 //log_Prob_Ys *= alignment_multiplicities;
3652 for (int i = 0; i < M; i++) {
3653 if (Prob_Ys(i) > 0)
3654 {
3655 ell += alignment_multiplicities(i) * log_Prob_Ys(i);
3656 }
3657 }
3658 return ell;
3659 }
3660
EM(int N,int M,Eigen::VectorXd & newP,const Eigen::MatrixXd & cond_prob,const Eigen::VectorXd & alignment_multiplicities,vector<double> const & log_conv_factors,bool & converged)3661 double EM(int N, int M,
3662 Eigen::VectorXd& newP,
3663 const Eigen::MatrixXd& cond_prob,
3664 const Eigen::VectorXd& alignment_multiplicities,
3665 vector<double> const & log_conv_factors,
3666 bool& converged)
3667 {
3668 converged = true;
3669 //double sum = 0;
3670 double newEll = 0;
3671 Eigen::VectorXd p = Eigen::VectorXd::Zero(N);
3672 Eigen::MatrixXd U = Eigen::MatrixXd::Zero(N,M);
3673 double ell = 0;
3674 int iter = 0;
3675 int j;
3676
3677 if (N == 0 || M == 0)
3678 return NUMERIC_OK;
3679
3680 for (j = 0; j < N; ++j) {
3681 //p[j] = drand48();
3682 //sum += p[j];
3683 p(j) = 1.0/(double)N;
3684 }
3685
3686
3687
3688 static const double ACCURACY = mle_accuracy; // convergence for EM
3689
3690 while (((iter <= 2) || (abs(ell - newEll) > ACCURACY)) && (iter < max_mle_iterations)) {
3691 if (iter > 0) {
3692 //round(newP);
3693 p = newP;
3694 ell = newEll;
3695 }
3696
3697 Estep(N, M, p, U, cond_prob, alignment_multiplicities); // fills U
3698 Mstep(N, M, newP,U); // fills p
3699
3700 newEll = logLike(N, M, newP, cond_prob,alignment_multiplicities, log_conv_factors);
3701
3702 //fprintf(stderr, "%d\t%lf\n", iter, newEll);
3703
3704 //printf("%.3f %.3f %.3f ", newP[0], newP[1], newP[2]);
3705 //printf("%.3f %.3f %.3f ", newP[3], newP[4], newP[5]);
3706 //printf("%.3f %.3f %.3f\n", newP[6], newP[7], newP[8]);
3707 iter++;
3708 }
3709 if (iter >= max_mle_iterations)
3710 {
3711 verbose_msg("Warning: ITERMAX reached in abundance estimation, estimation hasn't fully converged\n");
3712 converged = false;
3713 }
3714 verbose_msg("Convergence reached in %d iterations \n", iter);
3715 return newEll;
3716 }
3717
compute_fisher(const vector<boost::shared_ptr<Abundance>> & transcripts,const ublas::vector<double> & abundances,const vector<MateHit> & alignments,const vector<double> & u,boost::numeric::ublas::matrix<double> & fisher)3718 void compute_fisher(const vector<boost::shared_ptr<Abundance> >& transcripts,
3719 const ublas::vector<double>& abundances,
3720 const vector<MateHit>& alignments,
3721 const vector<double>& u,
3722 boost::numeric::ublas::matrix<double>& fisher)
3723 {
3724 int M = alignments.size();
3725 int N = transcripts.size();
3726
3727 vector<long double> denoms(M, 0.0);
3728 vector<vector<long double> > P(M,vector<long double>(N,0));
3729
3730 for (int j = 0; j < N; ++j)
3731 {
3732 const vector<double>& cond_probs_j = *(transcripts[j]->cond_probs());
3733 for (int x = 0; x < M; ++x)
3734 {
3735 if (cond_probs_j[x]==0)
3736 continue;
3737 long double alpha = 0.0;
3738 alpha = cond_probs_j[x];
3739 alpha *= abundances(j);
3740 denoms[x] += alpha;
3741 }
3742 }
3743
3744 for (int x = 0; x < M; ++x)
3745 denoms[x] *= denoms[x];
3746
3747
3748 for (int j = 0; j < N; ++j)
3749 {
3750 const vector<double>& cond_probs_j = *(transcripts[j]->cond_probs());
3751 for (int k = 0; k < N; ++k)
3752 {
3753
3754 const vector<double>& cond_probs_k = *(transcripts[k]->cond_probs());
3755
3756 for (int x = 0; x < M; ++x)
3757 {
3758 if (cond_probs_j[x]==0 && cond_probs_k[x]==0)
3759 continue;
3760
3761 assert(denoms[x] != 0.0);
3762
3763 double fisher_x_j_k = cond_probs_j[x] * cond_probs_k[x] / denoms[x];
3764
3765 fisher(j,k) += u[x] * fisher_x_j_k;
3766 }
3767 }
3768 }
3769 }
3770
compute_posterior_expectation(const vector<ublas::vector<double>> & weighted_samples,const vector<pair<size_t,double>> & sample_weights,ublas::vector<double> & expectation,long double & log_total_weight)3771 AbundanceStatus compute_posterior_expectation(const vector<ublas::vector<double> >& weighted_samples,
3772 const vector<pair<size_t, double> >& sample_weights,
3773 ublas::vector<double>& expectation,
3774 long double& log_total_weight)
3775 {
3776 ublas::vector<double> log_expectation(expectation.size());
3777 log_total_weight = 0.0;
3778
3779 // compute the weighted sum of the samples from the proposal distribution,
3780 // but keep the result in log space to avoid underflow.
3781 for (size_t i = 0; i < weighted_samples.size(); ++i)
3782 {
3783 const ublas::vector<double>& scaled_sample = weighted_samples[i];
3784 double log_weight = sample_weights[i].second;
3785 if (log_total_weight == 0.0)
3786 {
3787 log_expectation = weighted_samples[i];
3788 log_total_weight = log_weight;
3789 }
3790 else
3791 {
3792 for (size_t e = 0; e < log_expectation.size(); ++e)
3793 {
3794 log_expectation(e) = log_space_add<long double>(log_expectation[e], scaled_sample[e]);
3795 }
3796 log_total_weight = log_space_add<long double>(log_total_weight, log_weight);
3797 }
3798 }
3799
3800 if (log_total_weight == 0 || sample_weights.size() < 100)
3801 {
3802 verbose_msg("Warning: restimation failed, importance samples have zero weight.\n\tResorting to MLE and observed Fisher\n");
3803 return NUMERIC_FAIL;
3804 }
3805
3806 // compute the weighted mean, and transform back out of log space
3807 for (size_t e = 0; e < expectation.size(); ++e)
3808 {
3809 expectation(e) = (long double)log_expectation(e) - log_total_weight;
3810 expectation(e) = exp(expectation(e));
3811 }
3812
3813 for (size_t e = 0; e < expectation.size(); ++e)
3814 {
3815 if (isinf(expectation(e)) || isnan(expectation(e)))
3816 {
3817 verbose_msg("Warning: isoform abundance is NaN, restimation failed.\n\tResorting to MLE and observed Fisher.");
3818 return NUMERIC_FAIL;
3819 }
3820 }
3821
3822 for (size_t j = 0; j < expectation.size(); ++j)
3823 {
3824 if (expectation(j) < 0)
3825 expectation(j) = 0;
3826 }
3827
3828 long double m = sum(expectation);
3829
3830 if (m == 0 || isinf(m) || isnan(m))
3831 {
3832 verbose_msg("Warning: restimation failed, could not renormalize MAP estimate\n\tResorting to MLE and observed Fisher.");
3833 return NUMERIC_FAIL;
3834 }
3835
3836 for (size_t j = 0; j < expectation.size(); ++j) {
3837 expectation(j) = expectation(j) / m;
3838 }
3839 return NUMERIC_OK;
3840 }
3841
calculate_per_replicate_abundances(vector<boost::shared_ptr<Abundance>> & transcripts,const map<boost::shared_ptr<ReadGroupProperties const>,vector<MateHit>> & alignments_per_read_group,std::map<boost::shared_ptr<ReadGroupProperties const>,boost::shared_ptr<AbundanceGroup>> & ab_group_per_replicate,bool perform_collapse)3842 AbundanceStatus AbundanceGroup::calculate_per_replicate_abundances(vector<boost::shared_ptr<Abundance> >& transcripts,
3843 const map<boost::shared_ptr<ReadGroupProperties const >, vector<MateHit> >& alignments_per_read_group,
3844 std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<AbundanceGroup> >& ab_group_per_replicate,
3845 bool perform_collapse)
3846 {
3847 for(std::set<boost::shared_ptr<ReadGroupProperties const > >::iterator itr = _read_group_props.begin();
3848 itr != _read_group_props.end();
3849 ++itr)
3850 {
3851 vector<boost::shared_ptr<Abundance> > new_transcripts;
3852 BOOST_FOREACH(boost::shared_ptr<Abundance> ab, transcripts)
3853 {
3854 boost::shared_ptr<TranscriptAbundance> d = boost::static_pointer_cast<TranscriptAbundance>(ab);
3855 //new_transcripts.push_back(boost::shared_ptr<Abundance>(new TranscriptAbundance(*boost::static_pointer_cast<TranscriptAbundance>(ab))));
3856 TranscriptAbundance* pT = new TranscriptAbundance;
3857 pT->transfrag(d->transfrag());
3858 boost::shared_ptr<Abundance> ab_new(pT);
3859 ab_new->description(ab_new->description());
3860 ab_new->locus_tag("");
3861 new_transcripts.push_back(ab_new);
3862 }
3863 boost::shared_ptr<AbundanceGroup> ab_group(new AbundanceGroup(new_transcripts));
3864 std::set<boost::shared_ptr<ReadGroupProperties const > > rg_props;
3865 rg_props.insert(*itr);
3866 ab_group->init_rg_props(rg_props);
3867
3868 map<boost::shared_ptr<ReadGroupProperties const >, vector<MateHit> >::const_iterator al_itr =
3869 alignments_per_read_group.find(*itr);
3870
3871 assert(al_itr != alignments_per_read_group.end());
3872 const vector<MateHit>& rep_hits = alignments_per_read_group.find(*itr)->second;
3873
3874 vector<MateHit> nr_alignments;
3875 vector<MateHit> non_equiv_alignments;
3876 vector<double> log_conv_factors;
3877 vector<boost::shared_ptr<Abundance> > mapped_transcripts;
3878
3879 if (perform_collapse)
3880 {
3881 collapse_hits(rep_hits, nr_alignments);
3882 collapse_equivalent_hits_helper(nr_alignments, transcripts, non_equiv_alignments, log_conv_factors);
3883 assert (non_equiv_alignments.size() == log_conv_factors.size());
3884 log_conv_factors = vector<double>(nr_alignments.size(), 0);
3885 nr_alignments.clear();
3886 }
3887 else
3888 {
3889 nr_alignments = rep_hits;
3890 non_equiv_alignments = nr_alignments;
3891 log_conv_factors = vector<double>(nr_alignments.size(), 0);
3892 }
3893
3894 ab_group->calculate_abundance_for_replicate(non_equiv_alignments, false);
3895
3896 //fprintf (stderr, "FPKM = %lg\n", ab_group->FPKM());
3897 ab_group_per_replicate[*itr] = ab_group;
3898 }
3899
3900 return NUMERIC_OK;
3901 }
3902
calculate_inverse_fisher(const vector<boost::shared_ptr<Abundance>> & transcripts,const vector<MateHit> & alignments,const ublas::vector<double> & gamma_mean,ublas::matrix<double> & inverse_fisher)3903 AbundanceStatus calculate_inverse_fisher(const vector<boost::shared_ptr<Abundance> >& transcripts,
3904 const vector<MateHit>& alignments,
3905 const ublas::vector<double>& gamma_mean,
3906 ublas::matrix<double>& inverse_fisher)
3907 {
3908 // size_t N = gamma_covariance.size1();
3909
3910 // gamma_map_covariance = ublas::zero_matrix<double>(N);
3911
3912 typedef ublas::matrix<double> matrix_type;
3913 matrix_type fisher = ublas::zero_matrix<double>(gamma_mean.size(),gamma_mean.size());
3914
3915 vector<double> u(alignments.size());
3916 for (size_t i = 0; i < alignments.size(); ++i)
3917 {
3918 u[i] = alignments[i].collapse_mass();
3919 }
3920
3921 compute_fisher(transcripts,
3922 gamma_mean,
3923 alignments,
3924 u,
3925 fisher);
3926
3927 ublas::matrix<double> epsilon = ublas::zero_matrix<double>(gamma_mean.size(),gamma_mean.size());
3928 for (size_t i = 0; i < gamma_mean.size(); ++i)
3929 {
3930 epsilon(i,i) = 1e-6;
3931 }
3932
3933 fisher += epsilon; // modify matrix to avoid problems during inverse
3934
3935 ublas::matrix<double> fisher_chol = fisher;
3936
3937 double ch = cholesky_factorize(fisher_chol);
3938 if (ch != 0.0)
3939 {
3940 verbose_msg("Warning: Fisher matrix is not positive definite (bad element: %lg)\n", ch);
3941 return NUMERIC_FAIL;
3942 }
3943
3944 inverse_fisher = ublas::zero_matrix<double>(gamma_mean.size(),gamma_mean.size());
3945 bool invertible = chol_invert_matrix(fisher_chol, inverse_fisher);
3946
3947 ublas::matrix<double> test_fisher = inverse_fisher;
3948 ch = cholesky_factorize(test_fisher);
3949 if (ch != 0.0 || !invertible)
3950 {
3951 verbose_msg("Warning: Fisher matrix is not inverible\n", ch);
3952 return NUMERIC_FAIL;
3953 }
3954
3955 return NUMERIC_OK;
3956 }
3957
revise_map_mean_and_cov_estimate(double log_total_weight,const ublas::vector<double> & expectation,const vector<pair<size_t,double>> & sample_weights,const vector<ublas::vector<double>> & weighted_samples,ublas::vector<double> & gamma_map_estimate,ublas::matrix<double> & gamma_map_covariance)3958 AbundanceStatus revise_map_mean_and_cov_estimate(double log_total_weight,
3959 const ublas::vector<double>& expectation,
3960 const vector<pair<size_t, double> >& sample_weights,
3961 const vector<ublas::vector<double> >& weighted_samples,
3962 ublas::vector<double>& gamma_map_estimate,
3963 ublas::matrix<double>& gamma_map_covariance)
3964 {
3965 int N = expectation.size();
3966
3967 // revise gamma by setting it to the posterior expectation computed via the
3968 // importance sampling
3969 gamma_map_estimate = expectation;
3970
3971 // calculate the sample - mean vectors, store them in log space
3972 vector<ublas::vector<double> > sample_expectation_diffs;
3973
3974 ublas::vector<double> check_expectation = ublas::zero_vector<double>(expectation.size());
3975
3976 for (size_t j = 0; j < weighted_samples.size(); ++j)
3977 {
3978 ublas::vector<double> sample = weighted_samples[j];
3979 double log_sample_weight = sample_weights[j].second;
3980
3981 for (size_t e = 0; e < expectation.size(); ++e)
3982 {
3983 // sample is already log transformed after it was weighted, so we
3984 // need to divide by the sample weight to recover the original sample
3985 // value, then undo the log transform, then subtract the mean from it
3986 sample(e) = (exp(((long double)sample(e) - log_sample_weight)) - expectation(e));
3987 //sample(e) *= exp((log_sample_weight - log_total_weight));
3988 }
3989 //cerr << sample << endl;
3990 sample_expectation_diffs.push_back(sample);
3991 }
3992
3993 // We want to revise the covariance matrix from the samples, since we'll
3994 // need it later for the CIs.
3995 ublas::matrix<double> revised_cov = ublas::zero_matrix<double>(N,N);
3996
3997 // accumulate the contributions from the other samples (doing one cell of
3998 // covariance matrix per outer (i x j) loop iteration.
3999
4000 for (size_t j = 0; j < sample_expectation_diffs.size(); ++j)
4001 {
4002 double log_sample_weight = sample_weights[j].second;
4003 double w = exp((log_sample_weight - log_total_weight));
4004 ublas::vector<double> sample = weighted_samples[j];
4005
4006 for (size_t e = 0; e < expectation.size(); ++e)
4007 {
4008 // sample is already log transformed after it was weighted, so we
4009 // need to divide by the sample weight to recover the original sample
4010 // value, then undo the log transform, then subtract the mean from it
4011 sample(e) = exp(sample(e) - log_sample_weight);
4012 //sample(e) *= exp((log_sample_weight - log_total_weight));
4013 }
4014
4015 revised_cov += w * (outer_prod(sample,sample));
4016 }
4017
4018 revised_cov -= outer_prod(expectation,expectation);
4019
4020 //cerr << "Revised COV" << endl;
4021 //cerr << revised_cov << endl;
4022 gamma_map_covariance = revised_cov;
4023
4024 //cerr << "Revised MAP estimate: " << expectation << endl;
4025 //cerr << "Revised Covariance matrix:" << endl;
4026 //cerr << gamma_map_covariance << endl;
4027 //cerr << "*************" << endl;
4028
4029 return NUMERIC_OK;
4030 }
4031
calc_is_scale_factor(const ublas::matrix<double> & covariance_chol,double & is_scale_factor)4032 AbundanceStatus calc_is_scale_factor(const ublas::matrix<double>& covariance_chol,
4033 double& is_scale_factor)
4034 {
4035 double det = determinant(covariance_chol);
4036 is_scale_factor = pow(2.0*boost::math::constants::pi<double>(), covariance_chol.size1()/2.0);
4037 double s = sqrt(det);
4038 is_scale_factor *= s;
4039
4040 //assert (det);
4041 if (s == 0.0)
4042 {
4043 verbose_msg("Error: sqrt(det(cov)) == 0, %lf after rounding. \n", det);
4044 //cerr << covariance << endl;
4045 return NUMERIC_FAIL;
4046 }
4047 assert (s);
4048 assert (is_scale_factor);
4049 return NUMERIC_OK;
4050 }
4051
4052 template<class M, class PM>
is_identifiable(M & m,PM & pm)4053 bool is_identifiable(M &m, PM &pm)
4054 {
4055 using namespace ublas;
4056 typedef M matrix_type;
4057 typedef typename M::size_type size_type;
4058 typedef typename M::value_type value_type;
4059
4060 int singular = 0;
4061 size_type size1 = m.size1 ();
4062 size_type size2 = m.size2 ();
4063 size_type size = (std::min) (size1, size2);
4064 for (size_type i = 0; i < size; ++ i) {
4065 matrix_column<M> mci (column (m, i));
4066 matrix_row<M> mri (row (m, i));
4067 size_type i_norm_inf = i + index_norm_inf (project (mci, boost::numeric::ublas::range (i, size1)));
4068 if (m (i_norm_inf, i) != value_type/*zero*/()) {
4069 if (i_norm_inf != i) {
4070 pm (i) = i_norm_inf;
4071 row (m, i_norm_inf).swap (mri);
4072 } else {
4073 //BOOST_UBLAS_CHECK (pm (i) == i_norm_inf, external_logic ());
4074 }
4075 project (mci, boost::numeric::ublas::range (i + 1, size1)) *= value_type (1) / m (i, i);
4076 } else if (singular == 0) {
4077 singular = i + 1;
4078 }
4079 project (m, boost::numeric::ublas::range (i + 1, size1), boost::numeric::ublas::range (i + 1, size2)).minus_assign (outer_prod (project (mci, boost::numeric::ublas::range (i + 1, size1)),
4080 project (mri, boost::numeric::ublas::range (i + 1, size2))));
4081 }
4082 return singular == 0;
4083 }
4084
gamma_mle(const vector<boost::shared_ptr<Abundance>> & transcripts,const vector<MateHit> & nr_alignments,const vector<double> & log_conv_factors,vector<double> & gammas,bool check_identifiability)4085 AbundanceStatus gamma_mle(const vector<boost::shared_ptr<Abundance> >& transcripts,
4086 const vector<MateHit>& nr_alignments,
4087 const vector<double>& log_conv_factors,
4088 vector<double>& gammas,
4089 bool check_identifiability)
4090 {
4091 //gammas.clear();
4092 gammas = vector<double>(transcripts.size(), 0);
4093 if (transcripts.empty())
4094 return NUMERIC_OK;
4095
4096
4097 if (transcripts.size() == 1)
4098 {
4099 gammas = vector<double>(1,1.0);
4100 return NUMERIC_OK;
4101 }
4102
4103 size_t M = nr_alignments.size();
4104 size_t N = transcripts.size();
4105
4106 bool converged = true;
4107 bool identifiable = true;
4108
4109 if (M > 0)
4110 {
4111
4112 //vector<vector<double> > saliencies (M,vector<double>(N,0));
4113
4114
4115 //compute_saliencies(cond_probs, saliencies, saliency_weight);
4116
4117 Eigen::VectorXd prob = Eigen::VectorXd::Zero(N);
4118 Eigen::VectorXd _phint = Eigen::VectorXd::Zero(N);
4119
4120 double logL;
4121
4122 Eigen::MatrixXd cond_probs(N, M);
4123
4124 for (size_t j = 0; j < cond_probs.rows(); ++j)
4125 {
4126 for (size_t i = 0; i < cond_probs.cols(); ++i)
4127 {
4128 cond_probs(j,i) = (*(transcripts[j]->cond_probs()))[i];
4129 }
4130 }
4131
4132 if (check_identifiability)
4133 {
4134 ublas::matrix<double> compat = ublas::zero_matrix<double>(M,N);
4135
4136 for (size_t j = 0; j < N; ++j)
4137 {
4138 for (size_t i = 0; i < M; ++i)
4139 {
4140 if (cond_probs(j,i))
4141 {
4142 //compat(i,j) = cond_probs[j][i];
4143 compat(i,j) = 1;
4144 }
4145 }
4146 }
4147
4148 vector<size_t> transcripts_with_frags;
4149 for (size_t j = 0; j < N; ++j)
4150 {
4151 bool has_fragment = false;
4152 for (size_t i = 0; i < M; ++i)
4153 {
4154 if (compat(i,j))
4155 {
4156 has_fragment = true;
4157 break;
4158 }
4159 }
4160 if (has_fragment)
4161 transcripts_with_frags.push_back(j);
4162 }
4163 ublas::matrix<double> reduced_compat = ublas::zero_matrix<double>(M,transcripts_with_frags.size());
4164 for (size_t j = 0; j < transcripts_with_frags.size(); ++j)
4165 {
4166 column(reduced_compat, j) = column(compat, transcripts_with_frags[j]);
4167 }
4168
4169
4170 typedef ublas::permutation_matrix<std::size_t> pmatrix;
4171
4172 // create a permutation matrix for the LU-factorization
4173 pmatrix pm(reduced_compat.size1());
4174
4175 // cerr << compat.size2() <<endl;
4176 // perform LU-factorization
4177 identifiable = is_identifiable<ublas::matrix<double>,pmatrix>(reduced_compat,pm);
4178 }
4179
4180 Eigen::VectorXd alignment_multiplicities(M);
4181 for (size_t i = 0; i < M; ++i)
4182 {
4183 alignment_multiplicities[i] = nr_alignments[i].collapse_mass();
4184 }
4185
4186 logL = EM(N, M, prob, cond_probs, alignment_multiplicities, log_conv_factors, converged);
4187
4188 gammas = vector<double>(N, 0.0);
4189
4190 for (size_t i = 0; i < gammas.size(); ++i)
4191 {
4192 gammas[i] = prob(i);
4193 if (isnan(gammas[i]) || isinf(gammas[i]))
4194 {
4195 return NUMERIC_FAIL;
4196 }
4197 }
4198 }
4199 else
4200 {
4201 gammas = vector<double>(N, 0.0);
4202 }
4203
4204 double round_err = 0.0;
4205 double num_good = 0;
4206 BOOST_FOREACH (double& g, gammas)
4207 {
4208 if (g < min_isoform_fraction)
4209 {
4210 round_err += g;
4211 g = 0.0;
4212 }
4213 else
4214 {
4215 num_good += 1;
4216 }
4217 }
4218 BOOST_FOREACH (double& g, gammas)
4219 {
4220 if (g != 0)
4221 {
4222 g += (round_err/num_good);
4223 }
4224 }
4225
4226 if (converged && identifiable)
4227 return NUMERIC_OK;
4228 else
4229 {
4230 if (!identifiable)
4231 //return NUMERIC_LOW_DATA;
4232 return NUMERIC_OK;
4233 else
4234 return NUMERIC_FAIL;
4235 }
4236
4237 return NUMERIC_OK;
4238 }
4239
calc_isoform_fpkm_conf_intervals(double FPKM,double variance,ConfidenceInterval & FPKM_conf)4240 void calc_isoform_fpkm_conf_intervals(double FPKM,
4241 double variance,
4242 ConfidenceInterval& FPKM_conf)
4243 {
4244 double FPKM_lo = 0.0;
4245 double FPKM_hi = 0.0;
4246 FPKM_hi = FPKM + 2 * sqrt(variance);
4247 FPKM_lo = max(0.0, FPKM - 2 * sqrt(variance));
4248 FPKM_conf = ConfidenceInterval(FPKM_lo, FPKM_hi);
4249 }
4250
not_intronic(int p,vector<float> & depth_of_coverage,vector<float> & intronic_cov,float min_intra_intron_fraction,int & intronic_status)4251 bool not_intronic(int p, vector<float>& depth_of_coverage, vector<float>& intronic_cov, float min_intra_intron_fraction,
4252 int& intronic_status) {
4253 bool not_an_intron = (intronic_cov[p]==0 ||
4254 depth_of_coverage[p]/intronic_cov[p] >= min_intra_intron_fraction);
4255 if (not_an_intron) intronic_status--;
4256 else intronic_status++;
4257 return not_an_intron;
4258 }
4259
4260
compute_doc(int bundle_origin,const vector<Scaffold> & scaffolds,vector<float> & depth_of_coverage,map<pair<int,int>,float> & intron_depth_of_coverage,bool exclude_intra_intron,vector<float> * intronic_cov,vector<int> * scaff_intronic_status)4261 double compute_doc(int bundle_origin,
4262 const vector<Scaffold>& scaffolds,
4263 vector<float>& depth_of_coverage,
4264 map<pair<int, int>, float>& intron_depth_of_coverage,
4265 bool exclude_intra_intron,
4266 vector<float>* intronic_cov,
4267 vector<int>* scaff_intronic_status)
4268 {
4269 vector<int> i_status;
4270 if (scaff_intronic_status==NULL)
4271 scaff_intronic_status=&i_status;
4272 *scaff_intronic_status = vector<int>(scaffolds.size(), 0);
4273 vector<float> intronic;
4274 if (intronic_cov==NULL)
4275 intronic_cov=&intronic;
4276 *intronic_cov = vector<float>(depth_of_coverage.size(), 0);
4277 //vector<bool> intronic(depth_of_coverage.size(), false);
4278 depth_of_coverage = vector<float>(depth_of_coverage.size(), 0);
4279
4280 set<const MateHit*> hits_in_gene_set;
4281 for (size_t i = 0; i < scaffolds.size(); ++i)
4282 {
4283 hits_in_gene_set.insert(scaffolds[i].mate_hits().begin(),
4284 scaffolds[i].mate_hits().end());
4285 }
4286
4287 vector<Scaffold> hits;
4288
4289 for(set<const MateHit*>::iterator itr = hits_in_gene_set.begin();
4290 itr != hits_in_gene_set.end();
4291 ++itr)
4292 {
4293 hits.push_back(Scaffold(**itr));
4294 hits.back().fpkm((**itr).mass());
4295 }
4296
4297 /*
4298 //no need for this here, we do it below with depth_of_coverage
4299 for (size_t i = 0; i < hits.size(); ++i)
4300 {
4301 const vector<AugmentedCuffOp>& aug_ops = hits[i].augmented_ops();
4302 for (size_t j = 0; j < aug_ops.size(); ++j)
4303 {
4304 const AugmentedCuffOp& op = aug_ops[j];
4305 if (op.opcode == CUFF_INTRON)
4306 {
4307 for (int K = op.g_left(); K < op.g_right(); ++K)
4308 {
4309 intronic[K - bundle_origin] = true;
4310 }
4311 }
4312 }
4313 }
4314 */
4315 for (size_t i = 0; i < hits.size(); ++i)
4316 {
4317 const vector<AugmentedCuffOp>& aug_ops = hits[i].augmented_ops();
4318 for (size_t j = 0; j < aug_ops.size(); ++j)
4319 {
4320 const AugmentedCuffOp& op = aug_ops[j];
4321 if (op.opcode == CUFF_MATCH)
4322 {
4323 for (int K = op.g_left(); K < op.g_right(); ++K)
4324 {
4325 depth_of_coverage[K - bundle_origin] += hits[i].fpkm();
4326 }
4327 }
4328 else if (op.opcode == CUFF_INTRON)
4329 {
4330 for (int K = op.g_left(); K < op.g_right(); ++K)
4331 {
4332 (*intronic_cov)[K - bundle_origin] += hits[i].fpkm();
4333 //intronic[K - bundle_origin] = true;
4334 }
4335
4336 pair<map<pair<int,int>,float>::iterator, bool> is = intron_depth_of_coverage.insert(make_pair(make_pair(op.g_left(), op.g_right()), 0));
4337 is.first->second += hits[i].fpkm();
4338 }
4339 }
4340 }
4341
4342 vector<float> knockout(depth_of_coverage);
4343
4344 double total_doc = 0;
4345 int total_len = 0;
4346 float min_intra_intron_fraction = min(pre_mrna_fraction, min_isoform_fraction);
4347 //for (size_t i = 0; i < hits.size(); ++i)
4348 for (size_t i = 0; i < scaffolds.size(); ++i)
4349 {
4350 //const vector<AugmentedCuffOp>& aug_ops = hits[i].augmented_ops();
4351 const vector<AugmentedCuffOp>& aug_ops = scaffolds[i].augmented_ops();
4352 for (size_t j = 0; j < aug_ops.size(); ++j)
4353 {
4354 const AugmentedCuffOp& op = aug_ops[j];
4355 if (op.opcode == CUFF_MATCH)
4356 {
4357 for (int K = op.g_left(); K < op.g_right(); ++K)
4358 {
4359 //if (!exclude_intra_intron || !intronic[K - bundle_origin])
4360 if (!exclude_intra_intron ||
4361 not_intronic(K-bundle_origin, depth_of_coverage, *intronic_cov, min_intra_intron_fraction,
4362 (*scaff_intronic_status)[i]) )
4363 {
4364 total_doc += knockout[K - bundle_origin];
4365 total_len += (knockout[K - bundle_origin] != 0);
4366 knockout[K - bundle_origin] = 0;
4367 }
4368 }
4369 }
4370 }
4371 }
4372
4373 return total_doc/(double)total_len;
4374 }
4375
major_isoform_intron_doc(map<pair<int,int>,float> & intron_doc)4376 double major_isoform_intron_doc(map<pair<int, int>, float>& intron_doc)
4377 {
4378 double major_isoform_intron_doc = 0;
4379 int num_major_introns = 0;
4380 for(map<pair<int, int>, float>::const_iterator itr = intron_doc.begin();
4381 itr != intron_doc.end();
4382 ++itr)
4383 {
4384 bool heaviest = true;
4385
4386 for (map<pair<int,int>, float>::const_iterator itr2 = intron_doc.begin();
4387 itr2 != intron_doc.end();
4388 ++itr2)
4389 {
4390 if (itr != itr2 &&
4391 itr->second < itr2->second &&
4392 overlap_in_genome(itr->first.first,
4393 itr->first.second,
4394 itr2->first.first,
4395 itr2->first.second))
4396 {
4397 heaviest = false;
4398 break;
4399 }
4400 }
4401
4402 if (heaviest)
4403 {
4404 major_isoform_intron_doc += itr->second;
4405 num_major_introns++;
4406 }
4407 }
4408 if (num_major_introns)
4409 {
4410 return major_isoform_intron_doc / num_major_introns;
4411 }
4412 else
4413 {
4414 return 0.0;
4415 }
4416 }
4417
record_min_doc_for_scaffolds(int bundle_origin,const vector<Scaffold> & hits,const vector<float> & depth_of_coverage,const map<pair<int,int>,float> & intron_depth_of_coverage,vector<double> & scaff_doc)4418 void record_min_doc_for_scaffolds(int bundle_origin,
4419 const vector<Scaffold>& hits,
4420 const vector<float>& depth_of_coverage,
4421 const map<pair<int, int>, float>& intron_depth_of_coverage,
4422 vector<double>& scaff_doc)
4423 {
4424 for (size_t h = 0; h < hits.size(); ++h)
4425 {
4426 double doc = 99999999.0;
4427 if (hits[h].has_intron())
4428 doc = get_intron_doc(hits[h], intron_depth_of_coverage);
4429
4430 doc = min(doc, get_scaffold_min_doc(bundle_origin,
4431 hits[h],
4432 depth_of_coverage));
4433 scaff_doc.push_back(doc);
4434 }
4435 }
4436
record_doc_for_scaffolds(int bundle_origin,const vector<Scaffold> & hits,const vector<float> & depth_of_coverage,vector<double> & scaff_doc)4437 void record_doc_for_scaffolds(int bundle_origin,
4438 const vector<Scaffold>& hits,
4439 const vector<float>& depth_of_coverage,
4440 vector<double>& scaff_doc)
4441 {
4442 for (size_t h = 0; h < hits.size(); ++h)
4443 {
4444 double doc;
4445 doc = get_scaffold_doc(bundle_origin,
4446 hits[h],
4447 depth_of_coverage);
4448 scaff_doc.push_back(doc);
4449 }
4450 }
4451
record_doc_for_scaffolds(int bundle_origin,const vector<Scaffold> & hits,const vector<float> & depth_of_coverage,const map<pair<int,int>,float> & intron_depth_of_coverage,vector<double> & scaff_doc)4452 void record_doc_for_scaffolds(int bundle_origin,
4453 const vector<Scaffold>& hits,
4454 const vector<float>& depth_of_coverage,
4455 const map<pair<int, int>, float>& intron_depth_of_coverage,
4456 vector<double>& scaff_doc)
4457 {
4458 for (size_t h = 0; h < hits.size(); ++h)
4459 {
4460 double doc;
4461 if (hits[h].has_intron())
4462 doc = get_intron_doc(hits[h], intron_depth_of_coverage);
4463 else
4464 doc = get_scaffold_doc(bundle_origin,
4465 hits[h],
4466 depth_of_coverage);
4467 scaff_doc.push_back(doc);
4468 }
4469 }
4470
get_intron_doc(const Scaffold & s,const map<pair<int,int>,float> & intron_depth_of_coverage)4471 double get_intron_doc(const Scaffold& s,
4472 const map<pair<int, int>, float >& intron_depth_of_coverage)
4473 {
4474 const vector<AugmentedCuffOp>& aug_ops = s.augmented_ops();
4475 int num_introns = 0;
4476 double doc = 0;
4477 for (size_t j = 0; j < aug_ops.size(); ++j)
4478 {
4479 const AugmentedCuffOp& op = aug_ops[j];
4480 if (op.opcode == CUFF_INTRON)
4481 {
4482 num_introns++;
4483 pair<int,int> op_intron(op.g_left(), op.g_right());
4484 map<pair<int, int>, float >::const_iterator itr = intron_depth_of_coverage.find(op_intron);
4485 // assert (itr != intron_depth_of_coverage.end());
4486 if (itr == intron_depth_of_coverage.end())
4487 {
4488 map<pair<int, int>, float >::const_iterator zi;
4489 for (zi = intron_depth_of_coverage.begin();
4490 zi != intron_depth_of_coverage.end();
4491 ++zi)
4492 {
4493 verbose_msg( "Warning: intron not within scaffold ([%d-%d], %d)\n", zi->first.first, zi->first.second, zi->second);
4494 }
4495 }
4496
4497 doc += itr->second;
4498 }
4499 }
4500 return doc / (double)num_introns;
4501 }
4502
get_scaffold_doc(int bundle_origin,const Scaffold & s,const vector<float> & depth_of_coverage)4503 double get_scaffold_doc(int bundle_origin,
4504 const Scaffold& s,
4505 const vector<float>& depth_of_coverage)
4506 {
4507 const vector<AugmentedCuffOp>& aug_ops = s.augmented_ops();
4508 int m_len = 0;
4509 double doc = 0;
4510 for (size_t j = 0; j < aug_ops.size(); ++j)
4511 {
4512 const AugmentedCuffOp& op = aug_ops[j];
4513 if (op.opcode == CUFF_MATCH)
4514 {
4515 for (int K = op.g_left(); K < op.g_right(); ++K)
4516 {
4517 m_len++;
4518 doc += depth_of_coverage[K - bundle_origin];
4519 }
4520 }
4521 }
4522
4523 return doc/(double)m_len;
4524 }
4525
get_scaffold_min_doc(int bundle_origin,const Scaffold & s,const vector<float> & depth_of_coverage)4526 double get_scaffold_min_doc(int bundle_origin,
4527 const Scaffold& s,
4528 const vector<float>& depth_of_coverage)
4529 {
4530 const vector<AugmentedCuffOp>& aug_ops = s.augmented_ops();
4531 float min_doc = 99999999;
4532
4533 for (size_t j = 0; j < aug_ops.size(); ++j)
4534 {
4535 const AugmentedCuffOp& op = aug_ops[j];
4536 if (op.opcode == CUFF_MATCH)
4537 {
4538 for (int K = op.g_left(); K < op.g_right(); ++K)
4539 {
4540 if (min_doc > depth_of_coverage[K - bundle_origin])
4541 min_doc = depth_of_coverage[K - bundle_origin];
4542 }
4543 }
4544 }
4545
4546 return min_doc;
4547 }
4548
tss_analysis(const string & locus_tag,SampleAbundances & sample)4549 void tss_analysis(const string& locus_tag, SampleAbundances& sample)
4550 {
4551 // Cluster transcripts by start site (TSS)
4552 vector<AbundanceGroup> transcripts_by_tss;
4553
4554 ublas::matrix<double> tss_gamma_cov;
4555 ublas::matrix<double> tss_count_cov;
4556 ublas::matrix<double> tss_iterated_exp_count_cov;
4557 ublas::matrix<double> tss_fpkm_cov;
4558 vector<Eigen::VectorXd> tss_assigned_counts;
4559
4560 vector<bool> mask(sample.transcripts.abundances().size(), true);
4561 for (size_t i = 0; i < sample.transcripts.abundances().size(); ++i)
4562 {
4563 if (*(sample.transcripts.abundances()[i]->tss_id().begin()) == "")
4564 {
4565 mask[i] = false;
4566 }
4567 }
4568
4569 AbundanceGroup trans_with_tss;
4570 sample.transcripts.filter_group(mask, trans_with_tss);
4571
4572 cluster_transcripts<ConnectByAnnotatedTssId>(trans_with_tss,
4573 transcripts_by_tss,
4574 &tss_gamma_cov,
4575 &tss_iterated_exp_count_cov,
4576 &tss_count_cov,
4577 &tss_fpkm_cov);
4578
4579
4580 BOOST_FOREACH(AbundanceGroup& ab_group, transcripts_by_tss)
4581 {
4582 ab_group.locus_tag(locus_tag);
4583 set<string> tss_ids = ab_group.tss_id();
4584 assert (tss_ids.size() == 1);
4585 string desc = *(tss_ids.begin());
4586 assert (desc != "");
4587 ab_group.description(*(tss_ids.begin()));
4588 }
4589
4590 sample.primary_transcripts = transcripts_by_tss;
4591
4592 // Group TSS clusters by gene
4593 vector<boost::shared_ptr<Abundance> > primary_transcript_abundances;
4594 set<boost::shared_ptr<ReadGroupProperties const> > rg_props;
4595 BOOST_FOREACH (AbundanceGroup& ab_group, sample.primary_transcripts)
4596 {
4597 primary_transcript_abundances.push_back(boost::shared_ptr<Abundance>(new AbundanceGroup(ab_group)));
4598 rg_props.insert(ab_group.rg_props().begin(), ab_group.rg_props().end());
4599 }
4600
4601 AbundanceGroup primary_transcripts(primary_transcript_abundances,
4602 tss_gamma_cov,
4603 tss_iterated_exp_count_cov,
4604 tss_count_cov,
4605 tss_fpkm_cov,
4606 rg_props);
4607
4608 vector<AbundanceGroup> primary_transcripts_by_gene;
4609
4610 cluster_transcripts<ConnectByAnnotatedGeneId>(primary_transcripts,
4611 primary_transcripts_by_gene);
4612
4613 BOOST_FOREACH(AbundanceGroup& ab_group, primary_transcripts_by_gene)
4614 {
4615 ab_group.locus_tag(locus_tag);
4616 set<string> gene_ids = ab_group.gene_id();
4617 if (gene_ids.size() > 1)
4618 {
4619 BOOST_FOREACH (string st, gene_ids)
4620 {
4621 fprintf(stderr, "%s\n", st.c_str());
4622 }
4623 ab_group.gene_id();
4624 }
4625 assert (gene_ids.size() == 1);
4626 ab_group.description(*(gene_ids.begin()));
4627 }
4628
4629 sample.gene_primary_transcripts = primary_transcripts_by_gene;
4630 }
4631
cds_analyis(const string & locus_tag,SampleAbundances & sample)4632 void cds_analyis(const string& locus_tag, SampleAbundances& sample)
4633 {
4634 // Cluster transcripts by CDS
4635 vector<AbundanceGroup> transcripts_by_cds;
4636 ublas::matrix<double> cds_gamma_cov;
4637 ublas::matrix<double> cds_count_cov;
4638 ublas::matrix<double> cds_iterated_exp_count_cov;
4639 ublas::matrix<double> cds_fpkm_cov;
4640
4641 vector<bool> mask(sample.transcripts.abundances().size(), true);
4642 for (size_t i = 0; i < sample.transcripts.abundances().size(); ++i)
4643 {
4644 if (*(sample.transcripts.abundances()[i]->protein_id().begin()) == "")
4645 {
4646 mask[i] = false;
4647 }
4648 }
4649
4650 AbundanceGroup trans_with_p_id;
4651 sample.transcripts.filter_group(mask, trans_with_p_id);
4652
4653 cluster_transcripts<ConnectByAnnotatedProteinId>(trans_with_p_id,
4654 transcripts_by_cds,
4655 &cds_gamma_cov,
4656 &cds_iterated_exp_count_cov,
4657 &cds_count_cov,
4658 &cds_fpkm_cov);
4659
4660 BOOST_FOREACH(AbundanceGroup& ab_group, transcripts_by_cds)
4661 {
4662 ab_group.locus_tag(locus_tag);
4663 set<string> protein_ids = ab_group.protein_id();
4664 assert (protein_ids.size() == 1);
4665 string desc = *(protein_ids.begin());
4666 //if (desc != "")
4667 //{
4668 assert (desc != "");
4669 ab_group.description(*(protein_ids.begin()));
4670 //}
4671 }
4672
4673 sample.cds = transcripts_by_cds;
4674
4675 // Group the CDS clusters by gene
4676 vector<boost::shared_ptr<Abundance> > cds_abundances;
4677
4678 set<boost::shared_ptr<ReadGroupProperties const> > rg_props;
4679 BOOST_FOREACH (AbundanceGroup& ab_group, sample.cds)
4680 {
4681 //if (ab_group.description() != "")
4682 {
4683 cds_abundances.push_back(boost::shared_ptr<Abundance>(new AbundanceGroup(ab_group)));
4684 rg_props.insert(ab_group.rg_props().begin(), ab_group.rg_props().end());
4685 }
4686 }
4687 AbundanceGroup cds(cds_abundances,
4688 cds_gamma_cov,
4689 cds_iterated_exp_count_cov,
4690 cds_count_cov,
4691 cds_fpkm_cov,
4692 rg_props);
4693
4694 vector<AbundanceGroup> cds_by_gene;
4695
4696 cluster_transcripts<ConnectByAnnotatedGeneId>(cds,
4697 cds_by_gene);
4698
4699 BOOST_FOREACH(AbundanceGroup& ab_group, cds_by_gene)
4700 {
4701 ab_group.locus_tag(locus_tag);
4702 set<string> gene_ids = ab_group.gene_id();
4703 assert (gene_ids.size() == 1);
4704 ab_group.description(*(gene_ids.begin()));
4705 }
4706
4707 sample.gene_cds = cds_by_gene;
4708
4709 }
4710
sample_abundance_worker(const string & locus_tag,const set<boost::shared_ptr<ReadGroupProperties const>> & rg_props,SampleAbundances & sample,boost::shared_ptr<HitBundle> sample_bundle,bool perform_cds_analysis,bool perform_tss_analysis,bool calculate_variance)4711 void sample_abundance_worker(const string& locus_tag,
4712 const set<boost::shared_ptr<ReadGroupProperties const> >& rg_props,
4713 SampleAbundances& sample,
4714 boost::shared_ptr<HitBundle> sample_bundle,
4715 bool perform_cds_analysis,
4716 bool perform_tss_analysis,
4717 bool calculate_variance)
4718 {
4719 vector<boost::shared_ptr<Abundance> > abundances;
4720
4721 BOOST_FOREACH(boost::shared_ptr<Scaffold> s, sample_bundle->ref_scaffolds())
4722 {
4723 TranscriptAbundance* pT = new TranscriptAbundance;
4724 pT->transfrag(s);
4725 boost::shared_ptr<Abundance> ab(pT);
4726 ab->description(s->annotated_trans_id());
4727 ab->locus_tag(locus_tag);
4728 abundances.push_back(ab);
4729 }
4730
4731 sample.transcripts = AbundanceGroup(abundances);
4732
4733 sample.transcripts.init_rg_props(rg_props);
4734
4735 vector<MateHit> hits_in_cluster;
4736
4737 if (sample_bundle->hits().size() < (size_t)max_frags_per_bundle)
4738 {
4739 get_alignments_from_scaffolds(sample.transcripts.abundances(),
4740 hits_in_cluster);
4741
4742 // Compute the individual transcript FPKMs via each sample's
4743 // AbundanceGroup for this locus.
4744
4745 sample.transcripts.calculate_abundance(hits_in_cluster, true, calculate_variance);
4746 }
4747 else
4748 {
4749 BOOST_FOREACH(boost::shared_ptr<Abundance> ab, abundances)
4750 {
4751 ab->status(NUMERIC_HI_DATA);
4752
4753 CountPerReplicateTable cpr;
4754 FPKMPerReplicateTable fpr;
4755 StatusPerReplicateTable spr;
4756 for (set<boost::shared_ptr<ReadGroupProperties const> >::const_iterator itr = rg_props.begin();
4757 itr != rg_props.end();
4758 ++itr)
4759 {
4760 cpr[*itr] = 0;
4761 fpr[*itr] = 0;
4762 spr[*itr] = NUMERIC_HI_DATA;
4763 }
4764 ab->num_fragments_by_replicate(cpr);
4765 ab->FPKM_by_replicate(fpr);
4766 ab->status_by_replicate(spr);
4767 }
4768 }
4769
4770 // Cluster transcripts by gene_id
4771 vector<AbundanceGroup> transcripts_by_gene_id;
4772 cluster_transcripts<ConnectByAnnotatedGeneId>(sample.transcripts,
4773 transcripts_by_gene_id);
4774
4775 BOOST_FOREACH(AbundanceGroup& ab_group, transcripts_by_gene_id)
4776 {
4777 ab_group.locus_tag(locus_tag);
4778 set<string> gene_ids = ab_group.gene_id();
4779 assert (gene_ids.size() == 1);
4780 ab_group.description(*(gene_ids.begin()));
4781 }
4782
4783 sample.genes = transcripts_by_gene_id;
4784
4785 if (perform_cds_analysis)
4786 {
4787 cds_analyis(locus_tag, sample);
4788 }
4789
4790 if (perform_tss_analysis) {
4791 tss_analysis(locus_tag, sample);
4792 }
4793 }
4794
4795 // This function applies library size factors to pre-computed expression entries
apply_normalization_to_abundances(const map<boost::shared_ptr<const ReadGroupProperties>,boost::shared_ptr<const AbundanceGroup>> & unnormalized_ab_group_per_replicate,map<boost::shared_ptr<const ReadGroupProperties>,boost::shared_ptr<AbundanceGroup>> & normalized_ab_group_per_replicate)4796 void AbundanceGroup::apply_normalization_to_abundances(const map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<const AbundanceGroup> >& unnormalized_ab_group_per_replicate,
4797 map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<AbundanceGroup> >& normalized_ab_group_per_replicate)
4798 {
4799 for (map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<const AbundanceGroup> >::const_iterator itr = unnormalized_ab_group_per_replicate.begin();
4800 itr != unnormalized_ab_group_per_replicate.end(); ++itr)
4801 {
4802 boost::shared_ptr<AbundanceGroup> norm_ab = boost::shared_ptr<AbundanceGroup>(new AbundanceGroup(*itr->second));
4803 boost::shared_ptr<const ReadGroupProperties> rg_props = itr->first;
4804 boost::shared_ptr<const MassDispersionModel> disp_model = rg_props->mass_dispersion_model();
4805
4806 boost::shared_ptr<const ReadGroupProperties> old_rg_props = *(itr->second->rg_props().begin());
4807
4808 double fpkm_correction_factor = 1.0;
4809 fpkm_correction_factor = old_rg_props->normalized_map_mass() / rg_props->normalized_map_mass();
4810
4811 double internal_scale_factor = rg_props->internal_scale_factor();
4812
4813 double total_mass = 0.0;
4814
4815 for (size_t i = 0; i < norm_ab->_abundances.size(); ++i)
4816 {
4817 norm_ab->_abundances[i]->num_fragments(itr->second->_abundances[i]->num_fragments() / internal_scale_factor);
4818
4819 total_mass += norm_ab->_abundances[i]->num_fragments();
4820
4821 norm_ab->_abundances[i]->FPKM(fpkm_correction_factor * itr->second->_abundances[i]->FPKM() / internal_scale_factor);
4822 norm_ab->_iterated_exp_count_covariance = norm_ab->iterated_count_cov() / (internal_scale_factor*internal_scale_factor);
4823 norm_ab->_fpkm_covariance = norm_ab->_fpkm_covariance * (fpkm_correction_factor * fpkm_correction_factor)/ (internal_scale_factor*internal_scale_factor);
4824 norm_ab->_count_covariance = norm_ab->_count_covariance/ (internal_scale_factor*internal_scale_factor);
4825 }
4826
4827 double locus_mass_variance = disp_model->scale_mass_variance(total_mass);
4828
4829 for (size_t i = 0; i < norm_ab->_abundances.size(); ++i)
4830 {
4831 norm_ab->_abundances[i]->mass_variance(locus_mass_variance * norm_ab->_abundances[i]->gamma());
4832 }
4833
4834 normalized_ab_group_per_replicate[itr->first] = norm_ab;
4835 }
4836 }
4837
merge_precomputed_expression_worker(const string & locus_tag,const vector<boost::shared_ptr<PrecomputedExpressionBundleFactory>> & expression_factories,SampleAbundances & sample,boost::shared_ptr<HitBundle> sample_bundle,bool perform_cds_analysis,bool perform_tss_analysis,bool calculate_variance)4838 void merge_precomputed_expression_worker(const string& locus_tag,
4839 const vector<boost::shared_ptr<PrecomputedExpressionBundleFactory> >& expression_factories,
4840 SampleAbundances& sample,
4841 boost::shared_ptr<HitBundle> sample_bundle,
4842 bool perform_cds_analysis,
4843 bool perform_tss_analysis,
4844 bool calculate_variance)
4845 {
4846 map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<const AbundanceGroup> > unnormalized_ab_group_per_replicate;
4847 map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<AbundanceGroup> > normalized_ab_group_per_replicate;
4848 map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<const AbundanceGroup> > const_ab_group_per_replicate;
4849
4850 set<boost::shared_ptr<const ReadGroupProperties> > rg_props;
4851 for (size_t i = 0; i < expression_factories.size(); ++i)
4852 {
4853 boost::shared_ptr<PrecomputedExpressionBundleFactory> pBundleFac = expression_factories[i];
4854 boost::shared_ptr<const PrecomputedExpressionHitFactory> pHitFac = boost::dynamic_pointer_cast<const PrecomputedExpressionHitFactory> (pBundleFac->hit_factory());
4855 assert (pHitFac);
4856
4857 boost::shared_ptr<const ReadGroupProperties> rg_prop = pBundleFac->read_group_properties();
4858 rg_props.insert(rg_prop);
4859
4860 boost::shared_ptr<const AbundanceGroup> ab = pBundleFac->get_abundance_for_locus(sample_bundle->id());
4861 pBundleFac->clear_abundance_for_locus(sample_bundle->id());
4862 if (!ab)
4863 {
4864 fprintf(stderr, "Error: no bundle with id %d in precomputed expression file\n", sample_bundle->id());
4865 }
4866 else if(ab->abundances().size() != sample_bundle->ref_scaffolds().size())
4867 {
4868 fprintf(stderr, "Error: bad bundle merge %s != %s\n", ab->description().c_str(), locus_tag.c_str());
4869 }
4870 unnormalized_ab_group_per_replicate[rg_prop] = ab;
4871 }
4872
4873 AbundanceGroup::apply_normalization_to_abundances(unnormalized_ab_group_per_replicate, normalized_ab_group_per_replicate);
4874
4875 for (map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<AbundanceGroup> >::const_iterator itr = normalized_ab_group_per_replicate.begin();
4876 itr != normalized_ab_group_per_replicate.end(); ++itr)
4877 {
4878 const_ab_group_per_replicate[itr->first] = itr->second;
4879 }
4880
4881 vector<boost::shared_ptr<Abundance> > abundances;
4882
4883 BOOST_FOREACH(boost::shared_ptr<Scaffold> s, sample_bundle->ref_scaffolds())
4884 {
4885 TranscriptAbundance* pT = new TranscriptAbundance;
4886 pT->transfrag(s);
4887 boost::shared_ptr<Abundance> ab(pT);
4888 ab->description(s->annotated_trans_id());
4889 ab->locus_tag(locus_tag);
4890 abundances.push_back(ab);
4891 }
4892
4893 sample.transcripts = AbundanceGroup(abundances);
4894
4895 sample.transcripts.init_rg_props(rg_props);
4896
4897 vector<MateHit> hits_in_cluster;
4898
4899 if (sample_bundle->hits().size() < (size_t)max_frags_per_bundle)
4900 {
4901 sample.transcripts.collect_per_replicate_mass(const_ab_group_per_replicate);
4902 sample.transcripts.aggregate_replicate_abundances(const_ab_group_per_replicate);
4903 if (calculate_variance)
4904 {
4905 sample.transcripts.calculate_abundance_group_variance(abundances, const_ab_group_per_replicate);
4906 }
4907 }
4908 else // FIXME: THIS needs to do the right thing with sample.transcripts...
4909 {
4910 BOOST_FOREACH(boost::shared_ptr<Abundance> ab, abundances)
4911 {
4912 ab->status(NUMERIC_HI_DATA);
4913
4914 CountPerReplicateTable cpr;
4915 FPKMPerReplicateTable fpr;
4916 StatusPerReplicateTable spr;
4917 for (set<boost::shared_ptr<ReadGroupProperties const> >::const_iterator itr = rg_props.begin();
4918 itr != rg_props.end();
4919 ++itr)
4920 {
4921 cpr[*itr] = 0;
4922 fpr[*itr] = 0;
4923 spr[*itr] = NUMERIC_HI_DATA;
4924 }
4925 ab->num_fragments_by_replicate(cpr);
4926 ab->FPKM_by_replicate(fpr);
4927 ab->status_by_replicate(spr);
4928 }
4929 }
4930
4931 // Cluster transcripts by gene_id
4932 vector<AbundanceGroup> transcripts_by_gene_id;
4933 cluster_transcripts<ConnectByAnnotatedGeneId>(sample.transcripts,
4934 transcripts_by_gene_id);
4935
4936 BOOST_FOREACH(AbundanceGroup& ab_group, transcripts_by_gene_id)
4937 {
4938 ab_group.locus_tag(locus_tag);
4939 set<string> gene_ids = ab_group.gene_id();
4940 assert (gene_ids.size() == 1);
4941 ab_group.description(*(gene_ids.begin()));
4942 }
4943
4944 sample.genes = transcripts_by_gene_id;
4945
4946 if (perform_cds_analysis)
4947 {
4948 cds_analyis(locus_tag, sample);
4949 }
4950
4951 if (perform_tss_analysis)
4952 {
4953 tss_analysis(locus_tag, sample);
4954 }
4955 }
4956
4957
4958 //BOOST_CLASS_EXPORT(Abundance)
4959 BOOST_CLASS_EXPORT(TranscriptAbundance)
4960 BOOST_SERIALIZATION_SHARED_PTR(TranscriptAbundance);
4961 BOOST_CLASS_EXPORT(AbundanceGroup);
4962 BOOST_SERIALIZATION_SHARED_PTR(AbundanceGroup);
4963 BOOST_SERIALIZATION_ASSUME_ABSTRACT(Abundance);
4964
4965