1 /*
2  *  abundances.cpp
3  *  cufflinks
4  *
5  *  Created by Cole Trapnell on 4/27/09.
6  *  Copyright 2009 Cole Trapnell. All rights reserved.
7  *
8  *  NOTE: some of the code in this file was derived from (Eriksson et al, 2008)
9  */
10 
11 
12 //#define BOOST_MATH_INSTRUMENT 1
13 //#include <iostream>
14 //#include <iomanip>
15 
16 #include "abundances.h"
17 #include <numeric>
18 #include <limits>
19 #include <algorithm>
20 
21 #include <boost/numeric/ublas/vector.hpp>
22 #include <boost/numeric/ublas/vector_proxy.hpp>
23 #include <boost/numeric/ublas/matrix.hpp>
24 #include <boost/numeric/ublas/triangular.hpp>
25 
26 //#define BOOST_UBLAS_TYPE_CHECK 0
27 #include <boost/numeric/ublas/lu.hpp>
28 
29 #include <boost/numeric/ublas/io.hpp>
30 
31 #include <boost/random/mersenne_twister.hpp>
32 #include <boost/random/normal_distribution.hpp>
33 #include <boost/random/uniform_int.hpp>
34 #include <boost/random/variate_generator.hpp>
35 #include <boost/math/constants/constants.hpp>
36 #include <boost/math/tools/roots.hpp>
37 #include <complex>
38 #include <boost/math/tools/tuple.hpp>
39 
40 #include <boost/math/distributions/chi_squared.hpp>
41 
42 #include "filters.h"
43 #include "replicates.h"
44 #include "sampling.h"
45 #include "jensen_shannon.h"
46 #include "rounding.h"
47 #include "clustering.h"
48 
49 
50 #include "negative_binomial_distribution.h"
51 
52 
53 #include <Eigen/Dense>
54 //using Eigen::MatrixXd;
55 
56 //#ifdef  __USE_ISOC99
57 ///* INFINITY and NAN are defined by the ISO C99 standard */
58 //#else
59 //double my_infinity(void) {
60 //    double zero = 0.0;
61 //    return 1.0/zero;
62 //}
63 //double my_nan(void) {
64 //    double zero = 0.0;
65 //    return zero/zero;
66 //}
67 //#define INFINITY my_infinity()
68 //#define NAN my_nan()
69 //#endif
70 
71 #define EulerGamma 0.57721566490153286060651209008240243104215933593992
72 
73 /* The digamma function is the derivative of gammaln.
74 
75  Reference:
76  J Bernardo,
77  Psi ( Digamma ) Function,
78  Algorithm AS 103,
79  Applied Statistics,
80  Volume 25, Number 3, pages 315-317, 1976.
81 
82  From http://www.psc.edu/~burkardt/src/dirichlet/dirichlet.f
83  (with modifications for negative numbers and extra precision)
84  */
digamma(double x)85 double digamma(double x)
86 {
87     double neginf = -INFINITY;
88     static const double c = 12,
89     d1 = -EulerGamma,
90     d2 = 1.6449340668482264365, /* pi^2/6 */
91     s = 1e-6,
92     s3 = 1./12,
93     s4 = 1./120,
94     s5 = 1./252,
95     s6 = 1./240,
96     s7 = 1./132;
97     /*static const double s8 = 691/32760.0, s9 = 1/12.0, s10 = 3617/8160.0;*/
98     double result;
99 #if 0
100     static double cache_x = 0;
101     static int hits = 0, total = 0;
102     total++;
103     if(x == cache_x) {
104         hits++;
105     }
106     if(total % 1000 == 1) {
107         printf("hits = %d, total = %d, hits/total = %g\n", hits, total,
108                ((double)hits)/total);
109     }
110     cache_x = x;
111 #endif
112     if( x==1.0 )
113         return d1;
114 
115     /* Illegal arguments */
116     if((x == neginf) || isnan(x)) {
117         return NAN;
118     }
119     /* Singularities */
120     if((x <= 0) && (floor(x) == x)) {
121         return neginf;
122     }
123     /* Negative values */
124     /* Use the reflection formula (Jeffrey 11.1.6):
125      * digamma(-x) = digamma(x+1) + pi*cot(pi*x)
126      *
127      * This is related to the identity
128      * digamma(-x) = digamma(x+1) - digamma(z) + digamma(1-z)
129      * where z is the fractional part of x
130      * For example:
131      * digamma(-3.1) = 1/3.1 + 1/2.1 + 1/1.1 + 1/0.1 + digamma(1-0.1)
132      *               = digamma(4.1) - digamma(0.1) + digamma(1-0.1)
133      * Then we use
134      * digamma(1-z) - digamma(z) = pi*cot(pi*z)
135      */
136     if(x < 0) {
137         return digamma(1-x) + M_PI/tan(-M_PI*x);
138     }
139     /* Use Taylor series if argument <= S */
140     if(x <= s) return d1 - 1/x + d2*x;
141     /* Reduce to digamma(X + N) where (X + N) >= C */
142     result = 0;
143     while(x < c) {
144         result -= 1/x;
145         x++;
146     }
147     /* Use de Moivre's expansion if argument >= C */
148     /* This expansion can be computed in Maple via asympt(Psi(x),x) */
149     if(x >= c) {
150         double r = 1/x, t;
151         result += log(x) - 0.5*r;
152         r *= r;
153 #if 0
154         result -= r * (s3 - r * (s4 - r * (s5 - r * (s6 - r * s7))));
155 #else
156         /* this version for lame compilers */
157         t = (s5 - r * (s6 - r * s7));
158         result -= r * (s3 - r * (s4 - r * t));
159 #endif
160     }
161     return result;
162 }
163 
164 
165 /* The trigamma function is the derivative of the digamma function.
166 
167  Reference:
168 
169  B Schneider,
170  Trigamma Function,
171  Algorithm AS 121,
172  Applied Statistics,
173  Volume 27, Number 1, page 97-99, 1978.
174 
175  From http://www.psc.edu/~burkardt/src/dirichlet/dirichlet.f
176  (with modification for negative arguments and extra precision)
177  */
trigamma(double x)178 double trigamma(double x)
179 {
180     double neginf = -INFINITY,
181     small = 1e-4,
182     large = 8,
183     c = 1.6449340668482264365, /* pi^2/6 = Zeta(2) */
184     c1 = -2.404113806319188570799476,  /* -2 Zeta(3) */
185     b2 =  1./6,
186     b4 = -1./30,
187     b6 =  1./42,
188     b8 = -1./30,
189     b10 = 5./66;
190     double result;
191     /* Illegal arguments */
192     if((x == neginf) || isnan(x)) {
193         return NAN;
194     }
195     /* Singularities */
196     if((x <= 0) && (floor(x) == x)) {
197         return neginf;
198     }
199     /* Negative values */
200     /* Use the derivative of the digamma reflection formula:
201      * -trigamma(-x) = trigamma(x+1) - (pi*csc(pi*x))^2
202      */
203     if(x < 0) {
204         result = M_PI/sin(-M_PI*x);
205         return -trigamma(1-x) + result*result;
206     }
207     /* Use Taylor series if argument <= small */
208     if(x <= small) {
209         return 1/(x*x) + c + c1*x;
210     }
211     result = 0;
212     /* Reduce to trigamma(x+n) where ( X + N ) >= B */
213     while(x < large) {
214         result += 1/(x*x);
215         x++;
216     }
217     /* Apply asymptotic formula when X >= B */
218     /* This expansion can be computed in Maple via asympt(Psi(1,x),x) */
219     if(x >= large) {
220         double r = 1/(x*x), t;
221 #if 0
222         result += 0.5*r + (1 + r*(b2 + r*(b4 + r*(b6 + r*(b8 + r*b10)))))/x;
223 #else
224         t = (b4 + r*(b6 + r*(b8 + r*b10)));
225         result += 0.5*r + (1 + r*(b2 + r*t))/x;
226 #endif
227     }
228     return result;
229 }
230 
231 // Returns the log likelihood of the negative binomial with a given r and the mle value of p
negbin_log_likelihood(const vector<double> & samples,long double r,long double p)232 long double negbin_log_likelihood(const vector<double>& samples, long double r, long double p)
233 {
234     if (samples.empty() || r == 0 || p == 0)
235     {
236         return 1.0;
237     }
238 
239     long double T1 = 0;
240     for (size_t i = 0; i < samples.size(); ++i)
241     {
242         T1 += lgamma(samples[i] + r);
243     }
244 
245     long double T2 = 0;
246     for (size_t i = 0; i < samples.size(); ++i)
247     {
248         T2 += lgamma(samples[i] + 1);
249     }
250 
251     long double T3 = samples.size() * lgamma(r);
252     long double T4 = samples.size() * r * logl(1 - p);
253 
254     long double T5 = 0;
255     for (size_t i = 0; i < samples.size(); ++i)
256     {
257         T5 += samples[i] * logl(p);
258     }
259 
260     assert ((isnan(T1) || isnan(T2) || isnan(T3) || isnan(T4) || isnan(T5)) == false);
261     long double log_likelihood = T1 - T2 - T3 + T4 + T5;
262     return log_likelihood;
263 }
264 
265 
266 // Returns the log likelihood of the negative binomial with a given r and the mle value of p
poisson_log_likelihood(const vector<double> & samples,long double lambda)267 long double poisson_log_likelihood(const vector<double>& samples, long double lambda)
268 {
269     if (samples.empty() || lambda == 0.0)
270     {
271         return 1.0;
272     }
273 
274     long double T1 = 0;
275     for (size_t i = 0; i < samples.size(); ++i)
276     {
277         T1 += samples[i] * logl(lambda);
278     }
279 
280     long double T2 = samples.size() * lambda;
281 
282     assert ((isnan(T1) || isnan(T2)) == false);
283     long double log_likelihood = T1 - T2;
284     return log_likelihood;
285 }
286 
287 
288 
289 // Returns the log likelihood of the negative binomial with a given r and the mle value of p
negbin_log_likelihood_helper(const vector<double> & samples,long double r)290 long double negbin_log_likelihood_helper(const vector<double>& samples, long double r)
291 {
292     long double p = 0;
293     long double mean_count = accumulate(samples.begin(), samples.end(), 0.0);
294 
295     if (r == 0 || mean_count == 0)
296     {
297         fprintf(stderr, "Error: r must be > 0!\n");
298         return 0;
299     }
300 
301     if (samples.empty() == false)
302         mean_count /= samples.size();
303 
304     p = mean_count / (r + mean_count);
305 
306     long double T1 = 0;
307     for (size_t i = 0; i < samples.size(); ++i)
308     {
309         T1 += lgamma(samples[i] + r);
310     }
311 
312     long double T2 = 0;
313     for (size_t i = 0; i < samples.size(); ++i)
314     {
315         T2 += lgamma(samples[i] + 1);
316     }
317 
318     long double T3 = samples.size() * lgamma(r);
319     long double T4 = samples.size() * r * log(1 - p);
320 
321     long double T5 = 0;
322     for (size_t i = 0; i < samples.size(); ++i)
323     {
324         T5 += samples[i] * log(p);
325     }
326 
327     assert ((isnan(T1) || isnan(T2) || isnan(T3) || isnan(T4) || isnan(T5)) == false);
328     long double log_likelihood = T1 - T2 - T3 + T4 + T5;
329     return log_likelihood;
330 }
331 
332 
333 // Returns the log likelihood of the negative binomial with a given r and the mle value of p
negbin_log_likelihood_prime_helper(const vector<double> & samples,long double r)334 long double negbin_log_likelihood_prime_helper(const vector<double>& samples, long double r)
335 {
336     long double T1 = 0;
337     for (size_t i = 0; i < samples.size(); ++i)
338     {
339         T1 += digamma(samples[i] + r);
340     }
341 
342     long double T2 = samples.size() * digamma(r);
343 
344     long double mean_count = accumulate(samples.begin(), samples.end(), 0.0);
345     if (samples.empty() == false)
346         mean_count /= samples.size();
347 
348     long double T3 = samples.size() * log(r / (r + mean_count));
349 
350     assert ((isnan(T1) || isnan(T2) || isnan(T3)) == false);
351 
352     long double log_likelihood_prime = T1 - T2 + T3;
353     return log_likelihood_prime;
354 }
355 
356 
357 struct negbin_ll_functor
358 {
negbin_ll_functornegbin_ll_functor359     negbin_ll_functor(const vector<double>& count_samples) : samples(count_samples){}
operator ()negbin_ll_functor360     boost::math::tuple<long double, long double> operator()(long double r)
361     {
362         long double llk = negbin_log_likelihood_helper(samples, r);
363         long double llk_d = negbin_log_likelihood_prime_helper(samples, r);
364         return boost::math::make_tuple(llk, llk_d);
365     }
366 private:
367     const vector<double>& samples;
368 };
369 
370 
fit_negbin_dist(const vector<double> samples,double & r,double & p)371 bool fit_negbin_dist(const vector<double> samples, double& r, double& p)
372 {
373 
374     if (samples.size() == 0)
375     {
376         r = 0;
377         p = 0;
378         return true;
379     }
380 
381     long double guess = accumulate(samples.begin(), samples.end(), 0.0);
382     if (samples.empty() == false)
383         guess /= samples.size();
384 
385     if (guess == 0)
386     {
387         r = 0;
388         p = 0;
389         return true;
390     }
391 
392     long double min = 0;
393     long double max = *std::max_element(samples.begin(), samples.end());
394     max *= max;
395     int digits = std::numeric_limits<double>::digits; // Maximum possible binary digits accuracy for type T.
396 
397     boost::uintmax_t max_iters = 50;
398 
399     r = boost::math::tools::newton_raphson_iterate(negbin_ll_functor(samples), guess, min, max, digits, max_iters);
400 
401     long double mean_count = accumulate(samples.begin(), samples.end(), 0.0);
402     if (samples.empty() == false)
403         mean_count /= samples.size();
404 
405     p = mean_count / (r + mean_count);
406 
407     //fprintf(stderr, "r = %lg, p = %lg, max_r = %Lg\n", r, p, max);
408 
409     if (isnan(r) || isnan(p))
410     {
411         fprintf(stderr, "warning: negative binomial parameters are Nan!\n");
412         return false;
413     }
414     return true;
415 }
416 
417 
418 //#define USE_LOG_CACHE
419 
compute_compatibilities(const vector<boost::shared_ptr<Abundance>> & transcripts,const vector<MateHit> & alignments,vector<vector<char>> & compatibilities)420 void compute_compatibilities(const vector<boost::shared_ptr<Abundance> >& transcripts,
421                              const vector<MateHit>& alignments,
422 							 vector<vector<char> >& compatibilities)
423 {
424 	int M = alignments.size();
425 	int N = transcripts.size();
426 
427 	vector<Scaffold> alignment_scaffs;
428 
429 	for (size_t i = 0; i < alignments.size(); ++i)
430 	{
431 		const MateHit& hit = alignments[i];
432 		alignment_scaffs.push_back(Scaffold(hit));
433 	}
434 
435     for (int j = 0; j < N; ++j)
436     {
437 		boost::shared_ptr<Scaffold> transfrag_j = transcripts[j]->transfrag();
438 		for (int i = 0; i < M; ++i)
439         {
440 			if (transfrag_j->contains(alignment_scaffs[i])
441 				&& Scaffold::compatible(*transfrag_j, alignment_scaffs[i]))
442             {
443 				compatibilities[j][i] = 1;
444             }
445         }
446 	}
447 }
448 
AbundanceGroup(const vector<boost::shared_ptr<Abundance>> & abundances,const ublas::matrix<double> & gamma_covariance,const ublas::matrix<double> & iterated_exp_count_covariance,const ublas::matrix<double> & count_covariance,const ublas::matrix<double> & fpkm_covariance,const set<boost::shared_ptr<ReadGroupProperties const>> & rg_props)449 AbundanceGroup::AbundanceGroup(const vector<boost::shared_ptr<Abundance> >& abundances,
450                                const ublas::matrix<double>& gamma_covariance,
451                                const ublas::matrix<double>& iterated_exp_count_covariance,
452                                const ublas::matrix<double>& count_covariance,
453                                const ublas::matrix<double>& fpkm_covariance,
454                                const set<boost::shared_ptr<ReadGroupProperties const> >& rg_props) :
455     _abundances(abundances),
456     _iterated_exp_count_covariance(iterated_exp_count_covariance),
457     _count_covariance(count_covariance),
458     _fpkm_covariance(fpkm_covariance),
459     _gamma_covariance(gamma_covariance),
460     _salient_frags(0.0),
461     _total_frags(0.0),
462     _read_group_props(rg_props)
463 {
464     // Calling calculate_FPKM_covariance() also estimates cross-replicate
465     // count variances
466     // calculate_FPKM_covariance();
467     double fpkm_var = 0.0;
468     for (size_t i = 0; i < _fpkm_covariance.size1(); ++i)
469     {
470         for (size_t j = 0; j < _fpkm_covariance.size2(); ++j)
471         {
472             assert (!isnan(_fpkm_covariance(i,j)) && !isinf(_fpkm_covariance(i,j)));
473             fpkm_var += _fpkm_covariance(i,j);
474         }
475     }
476 
477     _FPKM_variance = fpkm_var;
478 
479     if (FPKM() > 0 && final_est_run && library_type != "transfrags")
480     {
481 
482         ublas::matrix<double> test = _fpkm_covariance;
483         double ret = cholesky_factorize(test);
484         if (ret != 0 || (_FPKM_variance < 0 && status() == NUMERIC_OK))
485         {
486             //fprintf(stderr, "Warning: total FPKM covariance is not positive definite (ret = %lg)!\n", ret);
487             for (size_t j = 0; j < _abundances.size(); ++j)
488             {
489                 _abundances[j]->status(NUMERIC_FAIL);
490             }
491         }
492 
493        if(!(FPKM() == 0 || fpkm_var > 0 || status() != NUMERIC_OK))
494        {
495            //cerr << _count_covariance << endl;
496            //cerr << _fpkm_covariance << endl;
497        }
498 
499        // assert (FPKM() == 0 || fpkm_var > 0 || status() != NUMERIC_OK);
500     }
501 
502     for (size_t i = 0; i < _abundances.size(); ++i)
503     {
504         if (_fpkm_samples.empty())
505         {
506             _fpkm_samples = vector<double>(_abundances[i]->fpkm_samples().size(), 0);
507             _member_fpkm_samples = vector<Eigen::VectorXd>(_abundances[i]->fpkm_samples().size(), Eigen::VectorXd::Zero(_abundances.size()));
508         }
509         for (size_t j = 0; j < _abundances[i]->fpkm_samples().size(); ++j)
510         {
511             _fpkm_samples[j] += _abundances[i]->fpkm_samples()[j];
512             _member_fpkm_samples[j](i) = _abundances[i]->fpkm_samples()[j];
513         }
514     }
515 
516     calculate_conf_intervals();
517 
518     if (no_js_tests == false && _read_group_props.size() >= min_reps_for_js_test)
519     {
520         calculate_kappas();
521     }
522 }
523 
status() const524 AbundanceStatus AbundanceGroup::status() const
525 {
526     bool has_lowdata_member = false;
527     bool has_ok_member = false;
528 	BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
529 	{
530 		if (ab->status() == NUMERIC_FAIL)
531 		{
532 			return NUMERIC_FAIL;
533 		}
534         else if (ab->status() == NUMERIC_LOW_DATA)
535 		{
536 			has_lowdata_member = true;
537             //return NUMERIC_LOW_DATA;
538 		}
539         else if (ab->status() == NUMERIC_HI_DATA)
540 		{
541 			return NUMERIC_HI_DATA;
542 		}
543         else if (ab->status() == NUMERIC_OK)
544 		{
545 			has_ok_member = true;
546 		}
547 	}
548 
549     if (has_ok_member == false)
550         return NUMERIC_LOW_DATA;
551 
552 	return NUMERIC_OK;
553 }
554 
FPKM_variance(double v)555 void TranscriptAbundance::FPKM_variance(double v)
556 {
557     assert (v >= 0);
558     assert(!isnan(v));
559     _FPKM_variance = v;
560 }
561 
has_member_with_status(AbundanceStatus member_status) const562 bool AbundanceGroup::has_member_with_status(AbundanceStatus member_status) const
563 {
564     BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
565 	{
566 		if (ab->status() == member_status)
567 		{
568 			return true;
569 		}
570 	}
571     return false;
572 }
573 
num_fragments() const574 double AbundanceGroup::num_fragments() const
575 {
576 	double num_f = 0;
577 
578 	BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
579 	{
580 		num_f += ab->num_fragments();
581 	}
582     assert (!isnan(num_f));
583 	return num_f;
584 }
585 
num_fragments_by_replicate() const586 CountPerReplicateTable AbundanceGroup::num_fragments_by_replicate() const
587 {
588 	CountPerReplicateTable cpr;
589 
590 	BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
591 	{
592 		if (cpr.empty())
593         {
594             cpr = ab->num_fragments_by_replicate();
595         }
596         else
597         {
598             CountPerReplicateTable ab_cpr = ab->num_fragments_by_replicate();
599             for (CountPerReplicateTable::const_iterator itr = ab_cpr.begin();
600                  itr != ab_cpr.end();
601                  ++itr)
602             {
603                 CountPerReplicateTable::iterator cpr_itr = cpr.find(itr->first);
604                 assert (cpr_itr != cpr.end());
605                 cpr_itr->second += itr->second;
606             }
607         }
608 	}
609 
610     //assert (cpr.empty() != false);
611 	return cpr;
612 }
613 
FPKM_by_replicate() const614 FPKMPerReplicateTable AbundanceGroup::FPKM_by_replicate() const
615 {
616 	FPKMPerReplicateTable fpr;
617 
618 	BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
619 	{
620         FPKMPerReplicateTable ab_fpr = ab->FPKM_by_replicate();
621 
622         for (FPKMPerReplicateTable::const_iterator itr = ab_fpr.begin();
623              itr != ab_fpr.end();
624              ++itr)
625         {
626             FPKMPerReplicateTable::iterator fpr_itr = fpr.find(itr->first);
627             if (fpr_itr != fpr.end())
628                 fpr_itr->second += itr->second;
629             else
630                 fpr[itr->first] = itr->second;
631         }
632 	}
633 
634     //assert (cpr.empty() != false);
635 	return fpr;
636 }
637 
status_by_replicate() const638 StatusPerReplicateTable AbundanceGroup::status_by_replicate() const
639 {
640 	StatusPerReplicateTable fpr;
641 
642 	BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
643 	{
644 		if (fpr.empty())
645         {
646             fpr = ab->status_by_replicate();
647         }
648         else
649         {
650             StatusPerReplicateTable ab_fpr = ab->status_by_replicate();
651             for (StatusPerReplicateTable::const_iterator itr = ab_fpr.begin();
652                  itr != ab_fpr.end();
653                  ++itr)
654             {
655                 StatusPerReplicateTable::iterator fpr_itr = fpr.find(itr->first);
656                 assert (fpr_itr != fpr.end());
657 
658                 AbundanceStatus s = itr->second;
659 
660                 if (s == NUMERIC_FAIL)
661                 {
662                     fpr_itr->second = NUMERIC_FAIL;
663                 }
664                 else if (s == NUMERIC_LOW_DATA && (fpr_itr->second != NUMERIC_HI_DATA && fpr_itr->second != NUMERIC_FAIL && fpr_itr->second != NUMERIC_OK))
665                 {
666                     fpr_itr->second = NUMERIC_LOW_DATA;
667                 }
668                 else if (s == NUMERIC_HI_DATA)
669                 {
670                     fpr_itr->second = NUMERIC_HI_DATA;
671                 }
672                 else if (s == NUMERIC_OK && (fpr_itr->second != NUMERIC_HI_DATA && fpr_itr->second != NUMERIC_FAIL))
673                 {
674                     fpr_itr->second = NUMERIC_OK;
675                 }
676             }
677         }
678 	}
679 
680     //assert (cpr.empty() != false);
681 	return fpr;
682 }
683 
mass_variance() const684 double AbundanceGroup::mass_variance() const
685 {
686     double mass_var = 0;
687 
688 	BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
689 	{
690 		mass_var += ab->mass_variance();
691 	}
692 	return mass_var;
693 }
694 
695 // This tracks the final modeled variance in the assigned counts.
num_fragment_var() const696 double AbundanceGroup::num_fragment_var() const
697 {
698     double frag_var = 0.0;
699     for (size_t i = 0; i < _abundances.size(); ++i)
700     {
701         for (size_t j = 0; j < _abundances.size(); ++j)
702         {
703             frag_var += _count_covariance(i,j);
704         }
705     }
706     return frag_var;
707 }
708 
709 // This tracks the final modeled variance in the assigned counts.
num_fragment_uncertainty_var() const710 double AbundanceGroup::num_fragment_uncertainty_var() const
711 {
712     double frag_var = 0.0;
713     for (size_t i = 0; i < _abundances.size(); ++i)
714     {
715         for (size_t j = 0; j < _abundances.size(); ++j)
716         {
717             frag_var += _iterated_exp_count_covariance(i,j);
718         }
719     }
720     return frag_var;
721 }
722 
FPKM() const723 double AbundanceGroup::FPKM() const
724 {
725 	double fpkm = 0;
726 
727 	BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
728 	{
729 		fpkm += ab->FPKM();
730 	}
731 
732 	return fpkm;
733 }
734 
gamma() const735 double AbundanceGroup::gamma() const
736 {
737 	double gamma = 0;
738 
739 	BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances)
740 	{
741 		gamma += ab->gamma();
742 	}
743 
744 	return gamma;
745 }
746 
clear_non_serialized_data()747 void TranscriptAbundance::clear_non_serialized_data()
748 {
749     _fpkm_samples.clear();
750     std::vector<double>().swap(_fpkm_samples);
751 
752     if (_cond_probs)
753     {
754         _cond_probs->clear();
755         std::vector<double>().swap(*_cond_probs);
756     }
757 
758     if (_transfrag)
759     {
760         _transfrag->clear_hits();
761         _transfrag = boost::shared_ptr<Scaffold>();
762     }
763 }
764 
clear_non_serialized_data()765 void AbundanceGroup::clear_non_serialized_data()
766 {
767 
768     for (size_t i = 0; i < _abundances.size(); ++i)
769     {
770         _abundances[i]->clear_non_serialized_data();
771     }
772 
773     _fpkm_samples.clear();
774     std::vector<double>().swap(_fpkm_samples);
775     _member_fpkm_samples.clear();
776     std::vector<Eigen::VectorXd>().swap(_member_fpkm_samples);
777     _assigned_count_samples.clear();
778     std::vector<Eigen::VectorXd>().swap(_assigned_count_samples);
779 }
780 
filter_group(const vector<bool> & to_keep,AbundanceGroup & filtered_group) const781 void AbundanceGroup::filter_group(const vector<bool>& to_keep,
782 								  AbundanceGroup& filtered_group) const
783 {
784 	//filtered_group = AbundanceGroup();
785 
786 	assert (to_keep.size() == _abundances.size());
787 
788 	size_t num_kept = 0;
789 	BOOST_FOREACH(bool keeper, to_keep)
790 	{
791 		num_kept += keeper;
792 	}
793 
794 	ublas::matrix<double> new_cov = ublas::zero_matrix<double>(num_kept,num_kept);
795     ublas::matrix<double> new_iterated_em_count_cov = ublas::zero_matrix<double>(num_kept,num_kept);
796     ublas::matrix<double> new_count_cov = ublas::zero_matrix<double>(num_kept,num_kept);
797     ublas::matrix<double> new_fpkm_cov = ublas::zero_matrix<double>(num_kept,num_kept);
798 
799 	vector<boost::shared_ptr<Abundance> > new_ab;
800 
801 	//vector<vector<double> > new_fpkm_samples(_fpkm_samples.size(), vector<double>(num_kept, 0));
802 
803     // rebuild covariance matrix and abundance vector after filtration
804 
805 	size_t next_cov_row = 0;
806 	for (size_t i = 0; i < _abundances.size(); ++i)
807 	{
808 		if (to_keep[i])
809 		{
810 			new_ab.push_back(_abundances[i]);
811 			size_t next_cov_col = 0;
812 			for (size_t j = 0; j < _abundances.size(); ++j)
813 			{
814 				if (to_keep[j])
815 				{
816 					new_cov(next_cov_row,next_cov_col) = _gamma_covariance(i, j);
817                     new_iterated_em_count_cov(next_cov_row,next_cov_col) = _iterated_exp_count_covariance(i, j);
818                     new_count_cov(next_cov_row,next_cov_col) = _count_covariance(i, j);
819                     new_fpkm_cov(next_cov_row,next_cov_col) = _fpkm_covariance(i, j);
820 					next_cov_col++;
821 				}
822 			}
823 			next_cov_row++;
824 		}
825 	}
826 
827 
828 //    size_t curr_abundance_idx = 0;
829 //    for (size_t i = 0; i < _abundances.size(); ++i)
830 //    {
831 //        if (to_keep[i])
832 //		{
833 //            for (size_t j = 0; j < _fpkm_samples.size(); ++j)
834 //            {
835 //                new_fpkm_samples[j][curr_abundance_idx] = _fpkm_samples[j][i];
836 //            }
837 //            curr_abundance_idx++;
838 //        }
839 //
840 //    }
841 
842 
843 
844 	filtered_group = AbundanceGroup(new_ab,
845                                     new_cov,
846                                     new_iterated_em_count_cov,
847                                     new_count_cov,
848                                     new_fpkm_cov,
849                                     _read_group_props);
850     //assert (filtered_group.FPKM() == 0 || new_fpkm_samples.size() > 0);
851 
852     filtered_group.description(_description);
853 }
854 
get_transfrags(vector<boost::shared_ptr<Abundance>> & transfrags) const855 void AbundanceGroup::get_transfrags(vector<boost::shared_ptr<Abundance> >& transfrags) const
856 {
857 	transfrags.clear();
858 	BOOST_FOREACH(boost::shared_ptr<Abundance> pA, _abundances)
859 	{
860 		boost::shared_ptr<Scaffold> pS = pA->transfrag();
861 		if (pS)
862 		{
863 			transfrags.push_back(pA);
864 		}
865 	}
866 }
867 
gene_id() const868 set<string> AbundanceGroup::gene_id() const
869 {
870 	set<string> s;
871 
872 	BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
873 	{
874 		set<string> sub = pA->gene_id();
875 		s.insert(sub.begin(), sub.end());
876 	}
877 
878 	return s;
879 }
880 
gene_name() const881 set<string> AbundanceGroup::gene_name() const
882 {
883 	set<string> s;
884 
885 	BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
886 	{
887 		set<string> sub = pA->gene_name();
888 		s.insert(sub.begin(), sub.end());
889 	}
890 
891 	return s;
892 }
893 
894 
tss_id() const895 set<string> AbundanceGroup::tss_id() const
896 {
897 	set<string> s;
898 
899 	BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
900 	{
901 		set<string> sub = pA->tss_id();
902 		s.insert(sub.begin(), sub.end());
903 	}
904 
905 	return s;
906 }
907 
protein_id() const908 set<string> AbundanceGroup::protein_id() const
909 {
910 	set<string> s;
911 
912 	BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
913 	{
914 		set<string> sub = pA->protein_id();
915 		s.insert(sub.begin(), sub.end());
916 	}
917 
918 	return s;
919 }
920 
locus_tag() const921 const string& AbundanceGroup::locus_tag() const
922 {
923 	static string default_locus_tag = "-";
924 	const string* pLast = NULL;
925 	BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
926 	{
927 		if (pLast)
928 		{
929 			if (pA->locus_tag() != *pLast)
930 			{
931 				assert (false);
932 				return default_locus_tag;
933 			}
934 		}
935 		pLast = &(pA->locus_tag());
936 	}
937 	if (pLast)
938 	{
939 		return *pLast;
940 	}
941 	//assert (false);
942 	return default_locus_tag;
943 }
944 
reference_tag() const945 const string& AbundanceGroup::reference_tag() const
946 {
947 	static string default_reference_tag = "-";
948 	const string* pLast = NULL;
949 	BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
950 	{
951 		if (pLast)
952 		{
953 			if (pA->reference_tag() != *pLast)
954 			{
955 				assert (false);
956 				return default_reference_tag;
957 			}
958 		}
959 		pLast = &(pA->reference_tag());
960 	}
961 	if (pLast)
962 	{
963 		return *pLast;
964 	}
965 	//assert (false);
966 	return default_reference_tag;
967 }
968 
effective_length() const969 double AbundanceGroup::effective_length() const
970 {
971 	double eff_len = 0.0;
972 	double group_fpkm = FPKM();
973 	if (group_fpkm == 0)
974 		return 0;
975 	BOOST_FOREACH (boost::shared_ptr<Abundance> ab, _abundances)
976 	{
977 		eff_len += (ab->effective_length() * (ab->FPKM() / group_fpkm));
978 	}
979 	return eff_len;
980 }
981 
982 //void AbundanceGroup::collect_read_group_props()
983 //{
984 //    size_t M = alignments.size();
985 //
986 //	for (size_t i = 0; i < M; ++i)
987 //	{
988 //		if (!alignments[i].left_alignment())
989 //			continue;
990 //        boost::shared_ptr<ReadGroupProperties const> rg_props = alignments[i].read_group_props();
991 //
992 //        _read_group_props.insert(rg_props;
993 //    }
994 //}
995 
collect_per_replicate_mass(const vector<MateHit> & alignments,vector<boost::shared_ptr<Abundance>> & transcripts)996 void AbundanceGroup::collect_per_replicate_mass(const vector<MateHit>& alignments,
997                                                 vector<boost::shared_ptr<Abundance> >& transcripts)
998 {
999     size_t M = alignments.size();
1000 	size_t N = transcripts.size();
1001 
1002     //_count_per_replicate.clear();
1003 
1004     for (map<boost::shared_ptr<ReadGroupProperties const>, double>::iterator itr = _count_per_replicate.begin();
1005          itr != _count_per_replicate.end();
1006          ++itr)
1007     {
1008         itr->second = 0.0;
1009     }
1010 
1011 	if (transcripts.empty())
1012 		return;
1013 
1014     //map<boost::shared_ptr<ReadGroupProperties const>, double> count_per_replicate;
1015 
1016     vector<boost::shared_ptr<Abundance> > mapped_transcripts; // This collects the transcripts that have alignments mapping to them
1017 	compute_cond_probs_and_effective_lengths(alignments, transcripts, mapped_transcripts);
1018 
1019 	for (size_t i = 0; i < M; ++i)
1020 	{
1021 		if (!alignments[i].left_alignment())
1022 			continue;
1023 
1024 		bool mapped = false;
1025 		for (size_t j = 0; j < N; ++j)
1026         {
1027 			if (_abundances[j]->cond_probs()->at(i) > 0)
1028             {
1029 				mapped = true;
1030 				break;
1031 			}
1032 		}
1033 		if (mapped)
1034         {
1035             boost::shared_ptr<ReadGroupProperties const> rg_props = alignments[i].read_group_props();
1036             //assert (parent != NULL);
1037             pair<map<boost::shared_ptr<ReadGroupProperties const>, double>::iterator, bool> inserted;
1038             inserted = _count_per_replicate.insert(make_pair(rg_props, 0.0));
1039             _read_group_props.insert(rg_props);
1040 
1041             // these are the *internally* scaled masses.
1042             double more_mass = alignments[i].collapse_mass();
1043             double curr_mass = inserted.first->second;
1044             assert (isnan(more_mass) == false);
1045             inserted.first->second += more_mass;
1046         }
1047     }
1048 }
1049 
calculate_locus_scaled_mass_and_variance(const vector<boost::shared_ptr<Abundance>> & transcripts)1050 void AbundanceGroup::calculate_locus_scaled_mass_and_variance(const vector<boost::shared_ptr<Abundance> >& transcripts)
1051 {
1052 	size_t N = transcripts.size();
1053 
1054 	if (transcripts.empty())
1055 		return;
1056 
1057     double avg_X_g = 0.0;
1058     double avg_mass_fraction = 0.0;
1059 
1060     // as long as all the read groups share the same dispersion model (currently true)
1061     // then all the variances from each read group will be the same, so this
1062     // averaging step isn't strictly necessary.  Computing it this way is simply
1063     // convenient.
1064     vector<double> avg_mass_variances(N, 0.0);
1065 
1066     double external_scale_factor = -1.0;
1067     for (map<boost::shared_ptr<ReadGroupProperties const>, double>::iterator itr = _count_per_replicate.begin();
1068          itr != _count_per_replicate.end();
1069          ++itr)
1070     {
1071         boost::shared_ptr<ReadGroupProperties const> rg_props = itr->first;
1072 
1073         if (external_scale_factor < 0)
1074         {
1075             external_scale_factor = rg_props->external_scale_factor();
1076         }
1077         else
1078         {
1079             assert (external_scale_factor == rg_props->external_scale_factor());
1080         }
1081 
1082         // Since the _count_per_replicate table stores internally scaled
1083         // fragment counts, we need to scale the fragment counts up so we
1084         // can compare between conditions, rather than just between replicates
1085         // of this condition.
1086         double scaled_mass = itr->second;
1087         double scaled_total_mass = rg_props->normalized_map_mass();
1088         avg_X_g += scaled_mass;
1089         boost::shared_ptr<MassDispersionModel const> disperser = rg_props->mass_dispersion_model();
1090         for (size_t j = 0; j < N; ++j)
1091         {
1092             double scaled_variance;
1093             //scaled_variance = disperser->scale_mass_variance(scaled_mass * _abundances[j]->gamma());
1094             scaled_variance = _abundances[j]->gamma() * disperser->scale_mass_variance(scaled_mass);
1095             avg_mass_variances[j] += scaled_variance;
1096         }
1097         assert (disperser->scale_mass_variance(scaled_mass) != 0 || scaled_mass == 0);
1098 
1099         //assert (scaled_total_mass != 0.0);
1100         avg_mass_fraction += (scaled_mass / scaled_total_mass);
1101     }
1102 
1103     double num_replicates = _count_per_replicate.size();
1104 
1105     if (num_replicates)
1106     {
1107         avg_X_g /= num_replicates;
1108         avg_mass_fraction /= num_replicates;
1109         for (size_t j = 0; j < N; ++j)
1110         {
1111             avg_mass_variances[j] /= num_replicates;
1112         }
1113     }
1114 
1115     for (size_t j = 0; j < _abundances.size(); ++j)
1116 	{
1117 		_abundances[j]->num_fragments(_abundances[j]->gamma() * avg_X_g);
1118 
1119         double j_avg_mass_fraction = _abundances[j]->gamma() * avg_mass_fraction;
1120 
1121         _abundances[j]->mass_variance(avg_mass_variances[j]);
1122 
1123         if (j_avg_mass_fraction > 0)
1124         {
1125             double FPKM = j_avg_mass_fraction * 1000000000/ _abundances[j]->effective_length();
1126             FPKM *= 1.0 / external_scale_factor;
1127 
1128             _abundances[j]->FPKM(FPKM);
1129         }
1130         else
1131         {
1132             _abundances[j]->FPKM(0);
1133             _abundances[j]->mass_variance(0);
1134         }
1135 	}
1136 
1137 }
1138 
1139 int total_cond_prob_calls = 0;
collapse_equivalent_hits(const vector<MateHit> & alignments,vector<boost::shared_ptr<Abundance>> & transcripts,vector<MateHit> & nr_alignments,vector<double> & log_conv_factors,bool require_overlap=true)1140 void collapse_equivalent_hits(const vector<MateHit>& alignments,
1141                               vector<boost::shared_ptr<Abundance> >& transcripts,
1142                               vector<MateHit>& nr_alignments,
1143                               vector<double>& log_conv_factors,
1144                               bool require_overlap = true)
1145 {
1146     int N = transcripts.size();
1147 	int M = alignments.size();
1148 
1149     nr_alignments.clear();
1150 
1151 	vector<vector<char> > compatibilities(N, vector<char>(M,0));
1152 	compute_compatibilities(transcripts, alignments, compatibilities);
1153 
1154     vector<vector<double> > cached_cond_probs (M, vector<double>());
1155 
1156     vector<bool> replaced(M, false);
1157     int num_replaced = 0;
1158 
1159     vector<BiasCorrectionHelper> bchs;
1160     for (size_t j = 0; j < N; ++j)
1161     {
1162         bchs.push_back(BiasCorrectionHelper(transcripts[j]->transfrag()));
1163     }
1164 
1165     double total_mass_pre_collapse = 0.0;
1166 
1167     for(int i = 0 ; i < M; ++i)
1168     {
1169         total_mass_pre_collapse += alignments[i].collapse_mass();
1170 
1171         vector<double> cond_probs_i(N,0);
1172         if (replaced[i] == true)
1173             continue;
1174 
1175         if (cached_cond_probs[i].empty())
1176         {
1177             for (int j = 0; j < N; ++j)
1178             {
1179                 boost::shared_ptr<Scaffold> transfrag = transcripts[j]->transfrag();
1180 
1181                 if (compatibilities[j][i]==1)
1182                 {
1183                     total_cond_prob_calls++;
1184                     cond_probs_i[j] = bchs[j].get_cond_prob(alignments[i]);
1185                 }
1186 
1187             }
1188             cached_cond_probs[i] = cond_probs_i;
1189         }
1190         else
1191         {
1192             cond_probs_i = cached_cond_probs[i];
1193         }
1194 
1195         MateHit* curr_align = NULL;
1196 
1197         nr_alignments.push_back(alignments[i]);
1198         curr_align = &nr_alignments.back();
1199         log_conv_factors.push_back(0);
1200 
1201         if (corr_multi && alignments[i].is_multi()) // don't reduce other hits into multihits
1202             continue;
1203 
1204         bool seen_olap = false;
1205 
1206         for(int k = i + 1 ; k < M; ++k)
1207         {
1208             if (replaced[k] || (corr_multi && alignments[k].is_multi()) || alignments[i].read_group_props() != alignments[k].read_group_props())
1209                 continue;
1210             if (require_overlap && !::overlap_in_genome(curr_align->left(), curr_align->right(),
1211                                      alignments[k].left(), alignments[k].right()))
1212             {
1213                 if (seen_olap)
1214                     break;
1215                 else
1216                     continue;
1217             }
1218             else
1219             {
1220                 seen_olap = true;
1221             }
1222 
1223             vector<double>* cond_probs_k;
1224             double last_cond_prob = -1;
1225 
1226             bool equiv = true;
1227 
1228             if (cached_cond_probs[k].empty())
1229             {
1230                 cached_cond_probs[k] = vector<double>(N, 0.0);
1231                 cond_probs_k = &cached_cond_probs[k];
1232                 for (int j = 0; j < N; ++j)
1233                 {
1234                     boost::shared_ptr<Scaffold> transfrag = transcripts[j]->transfrag();
1235 
1236                     if (compatibilities[j][k]==1)
1237                     {
1238                         total_cond_prob_calls++;
1239                         (*cond_probs_k)[j] = bchs[j].get_cond_prob(alignments[k]);
1240                     }
1241                 }
1242                 //cached_cond_probs[k] = cond_probs_k;
1243             }
1244             else
1245             {
1246                 cond_probs_k = &cached_cond_probs[k];
1247             }
1248 
1249 
1250             for (int j = 0; j < N; ++j)
1251             {
1252                 if ((*cond_probs_k)[j] != 0 && cond_probs_i[j] != 0)
1253                 {
1254                     double cp_j = (*cond_probs_k)[j];
1255                     double cp_i = cond_probs_i[j];
1256                     double ratio =  cp_j / cp_i;
1257                     if (last_cond_prob == -1)
1258                     {
1259                         //assert(ratio < 5);
1260                         last_cond_prob = ratio;
1261                     }
1262                     else
1263                     {
1264                         if (last_cond_prob != ratio)
1265                         //if (abs(last_cond_prob - ratio) > 0.001)
1266                         {
1267                             equiv = false;
1268                             break;
1269                         }
1270                     }
1271                 }
1272                 else if ((*cond_probs_k)[j] == 0 && cond_probs_i[j] == 0)
1273                 {
1274                     // just do nothing in this iter.
1275                     // last_cond_prob = 0.0;
1276                 }
1277                 else
1278                 {
1279                     equiv = false;
1280                     break;
1281                 }
1282             }
1283 
1284             // cond_prob_i vector is a scalar multiple of cond_prob_k, so we
1285             // can collapse k into i via the mass.
1286             if (equiv)
1287             {
1288                 if (last_cond_prob > 0.0)
1289                 {
1290                     //assert(curr_align->read_group_props() == alignments[k].read_group_props());
1291                     assert (last_cond_prob > 0);
1292                     //double mass_muliplier = sqrt(last_cond_prob);
1293                     double mass_multiplier = log(last_cond_prob);
1294                     //assert(last_cond_prob < 5);
1295                     assert (!isinf(mass_multiplier) && !isnan(mass_multiplier));
1296                     log_conv_factors[log_conv_factors.size() - 1] += mass_multiplier;
1297                     replaced[k] = true;
1298                     cached_cond_probs[k].clear();
1299                     vector<double>(cached_cond_probs[k]).swap(cached_cond_probs[k]);
1300                     num_replaced++;
1301                     double more_mass = alignments[k].collapse_mass();
1302                     curr_align->incr_collapse_mass(more_mass);
1303                 }
1304                 else
1305                 {
1306                     replaced[k] = true;
1307                     num_replaced++;
1308                     cached_cond_probs[k].clear();
1309                     vector<double>(cached_cond_probs[k]).swap(cached_cond_probs[k]);
1310                 }
1311             }
1312 
1313         }
1314     }
1315 
1316     N = transcripts.size();
1317 	//M = nr_alignments.size();
1318 
1319 	for (int j = 0; j < N; ++j)
1320     {
1321 		boost::shared_ptr<Scaffold> transfrag = transcripts[j]->transfrag();
1322 		vector<double>& cond_probs = *(new vector<double>(nr_alignments.size(),0));
1323 
1324 		BiasCorrectionHelper& bch = bchs[j];
1325 
1326         size_t last_cond_prob_idx = 0;
1327 		for(int i = 0 ; i < M; ++i)
1328 		{
1329             if (!cached_cond_probs[i].empty())
1330             {
1331                 if (compatibilities[j][i]==1)
1332                 {
1333                     assert (cached_cond_probs[i].size() > j);
1334                     cond_probs[last_cond_prob_idx] = cached_cond_probs[i][j];
1335                 }
1336                 last_cond_prob_idx++;
1337             }
1338         }
1339 
1340         assert (last_cond_prob_idx == nr_alignments.size());
1341 
1342 		transcripts[j]->effective_length(bch.get_effective_length());
1343 		transcripts[j]->cond_probs(&cond_probs);
1344 	}
1345 
1346     double total_mass_post_collapse = 0.0;
1347     for(int i = 0 ; i < nr_alignments.size(); ++i)
1348     {
1349         total_mass_post_collapse += nr_alignments[i].collapse_mass();
1350     }
1351 
1352     //assert(abs(total_mass_pre_collapse - total_mass_post_collapse) < 1);
1353 
1354     if (nr_alignments.size())
1355     {
1356         verbose_msg("\nReduced %lu frags to %lu (%lf percent)\n", alignments.size(), nr_alignments.size(), 100.0 * (1 - nr_alignments.size()/(double)alignments.size()));
1357     }
1358 }
1359 
collapse_equivalent_hits_helper(const vector<MateHit> & alignments,vector<boost::shared_ptr<Abundance>> & transcripts,vector<MateHit> & nr_alignments,vector<double> & log_conv_factors)1360 void collapse_equivalent_hits_helper(const vector<MateHit>& alignments,
1361                                      vector<boost::shared_ptr<Abundance> >& transcripts,
1362                                      vector<MateHit>& nr_alignments,
1363                                      vector<double>& log_conv_factors)
1364 {
1365     int N = transcripts.size();
1366 	int M = alignments.size();
1367 
1368     if (N == 1)
1369     {
1370         nr_alignments = alignments;
1371         log_conv_factors = vector<double>(M, 0.0);
1372         return;
1373     }
1374     // TODO: Remove this short cut after verifying that it doesn't really make sense
1375     // for large bundles.  The collapse is almost certainly more efficient.
1376     // If there's a lot of transcripts, just use the old, overlap constrained
1377     // version of the equivalence collapse.
1378     if (N > 24)
1379     {
1380         collapse_equivalent_hits(alignments,
1381                                  transcripts,
1382                                  nr_alignments,
1383                                  log_conv_factors,
1384                                  true);
1385         return;
1386     }
1387 
1388     vector<vector<const MateHit*> > compat_table(1 << N);
1389     vector<vector<char> > compatibilities(N, vector<char>(M,0));
1390     compute_compatibilities(transcripts, alignments, compatibilities);
1391 
1392     for(int i = 0; i < M; ++i)
1393     {
1394         size_t compat_mask = 0;
1395         for (int j = 0; j < N; ++j)
1396         {
1397             compat_mask |= ((compatibilities[j][i] !=0) << j);
1398         }
1399         assert (compat_mask < compat_table.size());
1400         compat_table[compat_mask].push_back(&(alignments[i]));
1401     }
1402 
1403     for (size_t i = 0; i < compat_table.size(); ++i)
1404     {
1405         vector<MateHit> tmp_hits;
1406         vector<MateHit> tmp_nr_hits;
1407         vector<double> tmp_log_conv_factors;
1408 
1409         for (size_t j = 0; j < compat_table[i].size(); ++j)
1410         {
1411             tmp_hits.push_back(*(compat_table[i][j]));
1412         }
1413         if (tmp_hits.empty())
1414             continue;
1415         collapse_equivalent_hits(tmp_hits,
1416                                  transcripts,
1417                                  tmp_nr_hits,
1418                                  tmp_log_conv_factors,
1419                                  false);
1420         copy(tmp_nr_hits.begin(), tmp_nr_hits.end(), back_inserter(nr_alignments));
1421         copy(tmp_log_conv_factors.begin(), tmp_log_conv_factors.end(), back_inserter(log_conv_factors));
1422     }
1423 }
1424 
1425 
1426 
1427 
1428  // Sample from a multivariate normal to estimate the gamma distribution
generate_count_assignment_samples(int num_draws,const vector<double> & count_mean,const ublas::matrix<double> & count_covariance,vector<ublas::vector<double>> & assigned_count_samples)1429 bool generate_count_assignment_samples(int num_draws,
1430                                        const vector<double>& count_mean,
1431                                        const ublas::matrix<double>& count_covariance,
1432                                        vector<ublas::vector<double> >& assigned_count_samples)
1433 {
1434     double total_frag_counts = accumulate(count_mean.begin(), count_mean.end(), 0.0);
1435 
1436     if (total_frag_counts == 0)
1437     {
1438         assigned_count_samples = vector<ublas::vector<double> > (num_draws, ublas::zero_vector<double>(count_mean.size()));
1439         return true;
1440     }
1441 
1442     boost::mt19937 rng(random_seed);
1443 
1444     ublas::vector<double> mle_frag_counts = ublas::zero_vector<double>(count_mean.size());
1445 
1446     for (size_t j = 0; j < count_mean.size(); ++j)
1447     {
1448         mle_frag_counts(j) = count_mean[j];
1449     }
1450 
1451     //    cerr << "********************" << endl;
1452     //    cerr << "initial MLE counts: " << mle_frag_counts << endl;
1453 
1454     ublas::matrix<double> mle_count_covar = count_covariance;
1455 
1456     ublas::matrix<double> epsilon = ublas::zero_matrix<double>(count_mean.size(),count_mean.size());
1457     for (size_t i = 0; i < count_mean.size(); ++i)
1458     {
1459         epsilon(i,i) = 1e-6;
1460     }
1461 
1462     mle_count_covar += epsilon; // modify matrix to avoid problems during inverse
1463 
1464     double ret = cholesky_factorize(mle_count_covar);
1465     if (ret != 0)
1466     {
1467         fprintf(stderr, "Warning: Iterated expectation count covariance matrix cannot be cholesky factorized!\n");
1468         //fprintf(stderr, "Warning: FPKM covariance is not positive definite (ret = %lg)!\n", ret);
1469         //        for (size_t j = 0; j < _abundances.size(); ++j)
1470         //        {
1471         //            _abundances[j]->status(NUMERIC_FAIL);
1472         //        }
1473         return false;
1474     }
1475 
1476     //    cerr << endl << "Cholesky factored covariance matrix: " << endl;
1477     //    for (unsigned i = 0; i < _count_covariance.size1 (); ++ i)
1478     //    {
1479     //        ublas::matrix_row<ublas::matrix<double> > mr (mle_count_covar, i);
1480     //        cerr << i << " : " << _abundances[i]->num_fragments() << " : ";
1481     //        std::cerr << i << " : " << mr << std::endl;
1482     //    }
1483     //cerr << "======" << endl;
1484 
1485     multinormal_generator<double> generator(mle_frag_counts, mle_count_covar);
1486     //vector<Eigen::VectorXd> multinormal_samples;
1487 
1488     assigned_count_samples = vector<ublas::vector<double> > (num_draws, ublas::zero_vector<double>(count_mean.size()));
1489 
1490     boost::uniform_01<> uniform_dist;
1491     boost::mt19937 null_rng(random_seed);
1492     boost::variate_generator<boost::mt19937&, boost::uniform_01<> > uniform_gen(null_rng, uniform_dist);
1493 
1494 
1495     for (size_t assign_idx = 0; assign_idx < num_draws; ++assign_idx)
1496     {
1497 
1498         boost::numeric::ublas::vector<double> random_count_assign = generator.next_rand();
1499         //cerr << random_count_assign << endl;
1500 
1501         for (size_t r_idx = 0; r_idx < random_count_assign.size(); ++r_idx)
1502         {
1503             if (random_count_assign(r_idx) < 0)
1504                 random_count_assign(r_idx) = 0;
1505         }
1506 
1507         double total_sample_counts = accumulate(random_count_assign.begin(), random_count_assign.end(), 0.0);
1508         if (total_sample_counts > 0)
1509             random_count_assign = total_frag_counts * (random_count_assign / total_sample_counts);
1510         else
1511             random_count_assign = ublas::zero_vector<double>(count_mean.size());
1512         assigned_count_samples[assign_idx] = random_count_assign;
1513     }
1514 
1515     return true;
1516 }
1517 
calculate_gamma_mle_covariance(const std::map<boost::shared_ptr<ReadGroupProperties const>,boost::shared_ptr<AbundanceGroup>> & ab_group_per_replicate,ublas::vector<double> & estimated_gamma_mean,ublas::matrix<double> & estimated_gamma_covariance)1518 void calculate_gamma_mle_covariance(const std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<AbundanceGroup> >& ab_group_per_replicate,
1519                                     ublas::vector<double>& estimated_gamma_mean,
1520                                     ublas::matrix<double>& estimated_gamma_covariance)
1521 {
1522     vector<ublas::vector<double> > all_assigned_count_samples;
1523 
1524     if (ab_group_per_replicate.empty())
1525         return;
1526     int num_transcripts = ab_group_per_replicate.begin()->second->abundances().size();
1527     if (num_transcripts == 1)
1528     {
1529         estimated_gamma_mean = ublas::vector<double>(1);
1530         estimated_gamma_mean(0) = 1.0;
1531 
1532         estimated_gamma_covariance = ublas::matrix<double>(1,1);
1533         estimated_gamma_covariance(0,0) = 0.0;
1534         return;
1535     }
1536 
1537     for(std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<AbundanceGroup> >::const_iterator itr = ab_group_per_replicate.begin();
1538         itr != ab_group_per_replicate.end();
1539         ++itr)
1540     {
1541         ublas::vector<double> count_mean = ublas::zero_vector<double>(itr->second->abundances().size());
1542         for (size_t i = 0; i < itr->second->abundances().size(); ++i)
1543         {
1544             count_mean(i) = itr->second->abundances()[i]->gamma();
1545         }
1546 
1547         all_assigned_count_samples.push_back(count_mean);
1548     }
1549 
1550     //    for (size_t i = 0; i < all_assigned_count_samples.size(); ++i)
1551     //    {
1552     //        double total = accumulate(all_assigned_count_samples[i].begin(), all_assigned_count_samples[i].end(), 0.0);
1553     //        if (total > 0)
1554     //            all_assigned_count_samples[i] /= total;
1555     //    }
1556 
1557     estimated_gamma_mean = ublas::zero_vector<double>(num_transcripts);
1558     estimated_gamma_covariance = ublas::zero_matrix<double>(num_transcripts, num_transcripts);
1559 
1560     for (size_t i = 0; i < all_assigned_count_samples.size(); ++i)
1561     {
1562         for (int j = 0; j < all_assigned_count_samples[i].size(); ++j)
1563         {
1564             assert (!isnan(all_assigned_count_samples[i](j)) && !isinf(all_assigned_count_samples[i](j)));
1565         }
1566 
1567         estimated_gamma_mean += all_assigned_count_samples[i];
1568         //
1569         //expected_relative_abundances += relative_abundances[i];
1570     }
1571 
1572     if (all_assigned_count_samples.size() > 1)
1573     {
1574         estimated_gamma_mean /= (all_assigned_count_samples.size() - 1);
1575     }
1576 
1577     for (size_t i = 0; i < num_transcripts; ++i)
1578     {
1579         for (size_t j = 0; j < num_transcripts; ++j)
1580         {
1581             for (size_t k = 0 ; k < all_assigned_count_samples.size(); ++k)
1582             {
1583                 double c = (all_assigned_count_samples[k](i) - estimated_gamma_mean(i)) * (all_assigned_count_samples[k](j) - estimated_gamma_mean(j));
1584                 estimated_gamma_covariance(i,j) += c;
1585 
1586                 assert (!isinf(estimated_gamma_covariance(i,j)) && !isnan(estimated_gamma_covariance(i,j)));
1587             }
1588         }
1589     }
1590 
1591     if (all_assigned_count_samples.size() > 1)
1592     {
1593         estimated_gamma_covariance /= (all_assigned_count_samples.size() - 1);
1594     }
1595 }
1596 
calculate_fragment_assignment_distribution(const std::map<boost::shared_ptr<ReadGroupProperties const>,boost::shared_ptr<const AbundanceGroup>> & ab_group_per_replicate,ublas::vector<double> & estimated_gamma_mean,ublas::matrix<double> & estimated_gamma_covariance,vector<ublas::vector<double>> & all_assigned_count_samples)1597 void calculate_fragment_assignment_distribution(const std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >& ab_group_per_replicate,
1598                                                 ublas::vector<double>& estimated_gamma_mean,
1599                                                 ublas::matrix<double>& estimated_gamma_covariance,
1600                                                 vector<ublas::vector<double> >& all_assigned_count_samples)
1601 {
1602 
1603     all_assigned_count_samples.clear();
1604 
1605     if (ab_group_per_replicate.empty())
1606         return;
1607     int num_transcripts = ab_group_per_replicate.begin()->second->abundances().size();
1608     if (num_transcripts == 1)
1609     {
1610         estimated_gamma_mean = ublas::vector<double>(1);
1611         estimated_gamma_mean(0) = 1.0;
1612 
1613         estimated_gamma_covariance = ublas::matrix<double>(1,1);
1614         estimated_gamma_covariance(0,0) = 0.0;
1615         return;
1616     }
1617 
1618     for(std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >::const_iterator itr = ab_group_per_replicate.begin();
1619         itr != ab_group_per_replicate.end();
1620         ++itr)
1621     {
1622         vector<double> count_mean;
1623         for (size_t i = 0; i < itr->second->abundances().size(); ++i)
1624         {
1625             count_mean.push_back(itr->second->abundances()[i]->num_fragments());
1626         }
1627 
1628         ublas::matrix<double> count_covariance = itr->second->iterated_count_cov();
1629         ublas::matrix<double> mle_error = ublas::zero_matrix<double>(count_mean.size(), count_mean.size());
1630         boost::shared_ptr<const MleErrorModel> mle_model = itr->first->mle_error_model();
1631         if (mle_model != NULL)
1632         {
1633             for (size_t i = 0; i < count_mean.size(); ++i)
1634             {
1635                 double mle_var = mle_model->scale_mle_variance(count_mean[i]);
1636                 mle_error(i,i) = max(0.0, mle_var);
1637             }
1638             count_covariance += mle_error;
1639 //            cerr << endl << "MLE error correction: " << endl;
1640 //            for (unsigned i = 0; i < mle_error.size1 (); ++ i)
1641 //            {
1642 //                ublas::matrix_row<ublas::matrix<double> > mr (mle_error, i);
1643 //                cerr << i << " : " << count_mean[i] << " : ";
1644 //                std::cerr << i << " : " << mr << std::endl;
1645 //            }
1646 //            cerr << "======" << endl;
1647         }
1648 
1649 
1650         vector<ublas::vector<double> > assigned_count_samples;
1651         generate_count_assignment_samples(num_frag_assignments,
1652                                           count_mean,
1653                                           count_covariance,
1654                                           assigned_count_samples);
1655 
1656         all_assigned_count_samples.insert(all_assigned_count_samples.end(), assigned_count_samples.begin(), assigned_count_samples.end());
1657     }
1658 
1659     for (size_t i = 0; i < all_assigned_count_samples.size(); ++i)
1660     {
1661         double total = accumulate(all_assigned_count_samples[i].begin(), all_assigned_count_samples[i].end(), 0.0);
1662         if (total > 0)
1663             all_assigned_count_samples[i] /= total;
1664         //cerr << all_assigned_count_samples[i] << endl;
1665     }
1666 
1667     estimated_gamma_mean = ublas::zero_vector<double>(num_transcripts);
1668     estimated_gamma_covariance = ublas::zero_matrix<double>(num_transcripts, num_transcripts);
1669 
1670     for (size_t i = 0; i < all_assigned_count_samples.size(); ++i)
1671     {
1672         for (int j = 0; j < all_assigned_count_samples[i].size(); ++j)
1673         {
1674             assert (!isnan(all_assigned_count_samples[i](j)) && !isinf(all_assigned_count_samples[i](j)));
1675         }
1676 
1677         estimated_gamma_mean += all_assigned_count_samples[i];
1678         //
1679         //expected_relative_abundances += relative_abundances[i];
1680     }
1681 
1682     if (all_assigned_count_samples.size() > 1)
1683     {
1684         estimated_gamma_mean /= (all_assigned_count_samples.size() - 1);
1685     }
1686 
1687     for (size_t i = 0; i < num_transcripts; ++i)
1688     {
1689         for (size_t j = 0; j < num_transcripts; ++j)
1690         {
1691             for (size_t k = 0 ; k < all_assigned_count_samples.size(); ++k)
1692             {
1693                 double c = (all_assigned_count_samples[k](i) - estimated_gamma_mean(i)) * (all_assigned_count_samples[k](j) - estimated_gamma_mean(j));
1694                 estimated_gamma_covariance(i,j) += c;
1695 
1696                 assert (!isinf(estimated_gamma_covariance(i,j)) && !isnan(estimated_gamma_covariance(i,j)));
1697             }
1698         }
1699     }
1700 
1701     if (all_assigned_count_samples.size() > 1)
1702     {
1703         estimated_gamma_covariance /= (all_assigned_count_samples.size() - 1);
1704     }
1705 
1706 //    cerr << "Gamma covariance: " << endl;
1707 //    for (unsigned i = 0; i < estimated_gamma_covariance.size1 (); ++ i)
1708 //    {
1709 //        ublas::matrix_row<ublas::matrix<double> > mr (estimated_gamma_covariance, i);
1710 //        cerr << i << " : " << estimated_gamma_mean[i] << " : ";
1711 //        std::cerr << i << " : " << mr << std::endl;
1712 //    }
1713 
1714 }
1715 
1716 
1717 #define PERFORM_EQUIV_COLLAPSE 1
1718 
1719 //void AbundanceGroup::calculate_abundance_for_replicate(const vector<MateHit>& alignments)
1720 //{
1721 //
1722 //}
1723 
1724 
calculate_abundance_group_variance(const vector<boost::shared_ptr<Abundance>> & transcripts,const std::map<boost::shared_ptr<ReadGroupProperties const>,boost::shared_ptr<const AbundanceGroup>> & ab_group_per_replicate)1725 void AbundanceGroup::calculate_abundance_group_variance(const vector<boost::shared_ptr<Abundance> >& transcripts,
1726                                                         const std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >& ab_group_per_replicate)
1727 {
1728 	if (final_est_run) // Only on last estimation run
1729 	{
1730         // Simulate NB draws and fragment assignment under uncertainty to sample
1731         // from the BNBs.
1732 
1733         ublas::matrix<double> count_assign_covariance;
1734 
1735         ublas::vector<double> estimated_gamma_mean;
1736         ublas::matrix<double> estimated_gamma_covariance;
1737 
1738         vector<ublas::vector<double> > gamma_samples;
1739         calculate_fragment_assignment_distribution(ab_group_per_replicate,
1740                                                    estimated_gamma_mean,
1741                                                    estimated_gamma_covariance,
1742                                                    gamma_samples);
1743         ublas::vector<double> estimated_count_mean = estimated_gamma_mean * num_fragments();
1744         ublas::matrix<double>  estimated_count_covariance = estimated_gamma_covariance * num_fragments() * num_fragments();
1745 
1746         //cerr << estimated_count_mean << endl;
1747         //cerr << estimated_count_covariance << endl;
1748         vector<double> frags_per_transcript;
1749         vector<double> frag_variances;
1750 
1751         for (size_t i = 0; i < _abundances.size(); ++i)
1752         {
1753             assert (estimated_count_mean.size() > i);
1754             frags_per_transcript.push_back(estimated_count_mean[i]);
1755             //frags_per_transcript.push_back(_abundances[i]->num_fragments());
1756 
1757             frag_variances.push_back(_abundances[i]->mass_variance());
1758         }
1759 
1760         simulate_count_covariance(frags_per_transcript, frag_variances, estimated_count_covariance, transcripts, _count_covariance, _assigned_count_samples, &gamma_samples);
1761 
1762         generate_fpkm_samples();
1763 
1764         calculate_FPKM_covariance();
1765 
1766         // Derive confidence intervals from the FPKM variance/covariance matrix
1767         calculate_conf_intervals();
1768 
1769         // Calculate the inter-group relative abundances and variances
1770         if (no_js_tests == false && _read_group_props.size() >= min_reps_for_js_test)
1771         {
1772             calculate_kappas();
1773         }
1774 
1775     }
1776 
1777 //    //cerr << _count_covariance << endl;
1778 //    for (size_t i = 0; i < _abundances.size(); ++i)
1779 //    {
1780 //        for (size_t j = 0; j < _abundances.size(); ++j)
1781 //        {
1782 //            if (i != j)
1783 //            {
1784 //                assert(!isinf(_fpkm_covariance(i,j)) && !isnan(_fpkm_covariance(i,j)));
1785 //                if (_abundances[i]->transfrag()->contains(*_abundances[j]->transfrag()) &&
1786 //                    Scaffold::compatible(*_abundances[i]->transfrag(),*_abundances[j]->transfrag()))
1787 //                {
1788 //                    _abundances[j]->status(NUMERIC_LOW_DATA);
1789 //                }
1790 //            }
1791 //        }
1792 //    }
1793 
1794     //assert (FPKM() == 0 || _assigned_count_samples.size() > 0);
1795 
1796     //fprintf(stderr, "Total calls to get_cond_prob = %d\n", total_cond_prob_calls);
1797 
1798 }
1799 
calculate_abundance_for_replicate(const vector<MateHit> & alignments,bool perform_collapse)1800 void AbundanceGroup::calculate_abundance_for_replicate(const vector<MateHit>& alignments, bool perform_collapse)
1801 {
1802     vector<boost::shared_ptr<Abundance> > transcripts;
1803 
1804 	get_transfrags(transcripts);
1805 	vector<boost::shared_ptr<Abundance> > mapped_transcripts; // This collects the transcripts that have alignments mapping to them
1806 
1807 	vector<MateHit> nr_alignments;
1808     vector<double> joint_mle_gammas;
1809 
1810     vector<MateHit> non_equiv_alignments;
1811     vector<double> log_conv_factors;
1812 
1813     if (final_est_run || corr_multi || corr_bias)
1814     {
1815         compute_cond_probs_and_effective_lengths(alignments, transcripts, mapped_transcripts);
1816         collect_per_replicate_mass(alignments, transcripts);
1817         calculate_gammas(alignments, log_conv_factors, transcripts, mapped_transcripts);
1818         for (size_t i = 0; i < _abundances.size(); ++i)
1819         {
1820             joint_mle_gammas.push_back(_abundances[i]->gamma());
1821         }
1822     }
1823 
1824     calculate_iterated_exp_count_covariance(joint_mle_gammas, alignments, transcripts, _iterated_exp_count_covariance);
1825 
1826     for (size_t i = 0; i < _abundances.size(); ++i)
1827     {
1828         _abundances[i]->num_fragment_uncertainty_var(_iterated_exp_count_covariance(i,i));
1829     }
1830 
1831     if (corr_multi && !final_est_run)
1832     {
1833         update_multi_reads(alignments, mapped_transcripts);
1834     }
1835 
1836     // Calculate the initial estimates for the number of fragments originating
1837     // from each transcript, the FPKMs, and set the NB variances
1838     calculate_locus_scaled_mass_and_variance(transcripts);
1839 }
1840 
aggregate_replicate_abundances(const map<boost::shared_ptr<ReadGroupProperties const>,boost::shared_ptr<const AbundanceGroup>> & ab_group_per_replicate)1841 void AbundanceGroup::aggregate_replicate_abundances(const map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >& ab_group_per_replicate)
1842 {
1843     for (size_t i = 0; i < _abundances.size(); ++i)
1844     {
1845         CountPerReplicateTable cpr;
1846         FPKMPerReplicateTable fpr;
1847         StatusPerReplicateTable spr;
1848 
1849         double avg_fpkm = 0.0;
1850         double avg_num_frags = 0.0;
1851         double avg_gamma = 0.0;
1852         double avg_mass_variance = 0.0;
1853         double avg_effective_length = 0.0;
1854 
1855         map<AbundanceStatus, int> status_table;
1856         status_table[NUMERIC_OK] = 0;
1857         status_table[NUMERIC_LOW_DATA] = 0;
1858         status_table[NUMERIC_FAIL] = 0;
1859         status_table[NUMERIC_HI_DATA] = 0;
1860 
1861         for (std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >::const_iterator itr = ab_group_per_replicate.begin();
1862              itr != ab_group_per_replicate.end();
1863              ++itr)
1864         {
1865             status_table[itr->second->abundances()[i]->status()] += 1;
1866         }
1867 
1868         for (std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >::const_iterator itr = ab_group_per_replicate.begin();
1869              itr != ab_group_per_replicate.end();
1870              ++itr)
1871         {
1872             const vector<boost::shared_ptr<Abundance> >& sc_ab = itr->second->abundances();
1873             assert(itr->second->abundances().size() == _abundances.size());
1874             cpr[itr->first] = itr->second->abundances()[i]->num_fragments();
1875             //fprintf(stderr, "FPKM = %lg\n", itr->second->abundances()[i]->FPKM());
1876             fpr[itr->first] = itr->second->abundances()[i]->FPKM();
1877             spr[itr->first] = itr->second->abundances()[i]->status();
1878 
1879             /*
1880             if (itr->second->abundances()[i]->status() == NUMERIC_OK)
1881             {
1882                 avg_fpkm += itr->second->abundances()[i]->FPKM() / (double)status_table[NUMERIC_OK];
1883                 avg_num_frags += itr->second->abundances()[i]->num_fragments() / (double)status_table[NUMERIC_OK];
1884                 avg_gamma += itr->second->abundances()[i]->gamma() / (double)status_table[NUMERIC_OK];
1885                 avg_mass_variance += itr->second->abundances()[i]->mass_variance() / (double)status_table[NUMERIC_OK];
1886                 avg_effective_length += itr->second->abundances()[i]->effective_length() / (double)status_table[NUMERIC_OK];
1887             }
1888             */
1889             avg_fpkm += itr->second->abundances()[i]->FPKM() / (double)ab_group_per_replicate.size();
1890             avg_num_frags += itr->second->abundances()[i]->num_fragments() / (double)ab_group_per_replicate.size();
1891             avg_gamma += itr->second->abundances()[i]->gamma() / (double)ab_group_per_replicate.size();
1892             avg_mass_variance += itr->second->abundances()[i]->mass_variance() / (double)ab_group_per_replicate.size();
1893             avg_effective_length += itr->second->abundances()[i]->effective_length() / (double)ab_group_per_replicate.size();
1894 
1895         }
1896 
1897         _abundances[i]->FPKM(avg_fpkm);
1898         _abundances[i]->gamma(avg_gamma);
1899         _abundances[i]->num_fragments(avg_num_frags);
1900         _abundances[i]->mass_variance(avg_mass_variance);
1901         _abundances[i]->effective_length(avg_effective_length);
1902 
1903 
1904         // if there was at least one good replicate, set the status to OK.  The reduced power will be reflected
1905         // during testing
1906         if (status_table[NUMERIC_OK] >= 1)
1907         {
1908             _abundances[i]->status(NUMERIC_OK);
1909         }
1910         else
1911         {
1912             if (status_table[NUMERIC_LOW_DATA] >= status_table[NUMERIC_FAIL])
1913             {
1914                 _abundances[i]->status(NUMERIC_LOW_DATA);
1915             }
1916             else if (status_table[NUMERIC_HI_DATA] >= status_table[NUMERIC_FAIL])
1917             {
1918                 _abundances[i]->status(NUMERIC_HI_DATA);
1919             }
1920 //            else if (status_table[NUMERIC_HI_DATA] >= status_table[NUMERIC_LOW_DATA]) // not sure this ever happens in practice
1921 //            {
1922 //                _abundances[i]->status(NUMERIC_FAIL);
1923 //            }
1924             else
1925             {
1926                 _abundances[i]->status(NUMERIC_FAIL);
1927             }
1928         }
1929 
1930         _abundances[i]->num_fragments_by_replicate(cpr);
1931         _abundances[i]->FPKM_by_replicate(fpr);
1932         _abundances[i]->status_by_replicate(spr);
1933     }
1934 }
1935 
calculate_abundance(const vector<MateHit> & alignments,bool perform_collapse,bool calculate_variance)1936 void AbundanceGroup::calculate_abundance(const vector<MateHit>& alignments, bool perform_collapse, bool calculate_variance)
1937 {
1938 	vector<boost::shared_ptr<Abundance> > transcripts;
1939 
1940 	get_transfrags(transcripts);
1941 
1942     map<boost::shared_ptr<ReadGroupProperties const >, vector<MateHit> > alignments_per_read_group;
1943 
1944     for(std::set<boost::shared_ptr<ReadGroupProperties const > >::iterator itr = _read_group_props.begin();
1945         itr != _read_group_props.end();
1946         ++itr)
1947     {
1948         alignments_per_read_group[*itr] = vector<MateHit>();
1949 
1950         vector<MateHit>& rep_hits = alignments_per_read_group[*itr];
1951 
1952         for (size_t i = 0; i < alignments.size(); ++i)
1953         {
1954             if (alignments[i].read_group_props() == *itr)
1955             {
1956                 rep_hits.push_back(alignments[i]);
1957             }
1958         }
1959     }
1960 
1961     collect_per_replicate_mass(alignments, transcripts);
1962 
1963     std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<AbundanceGroup> > ab_group_per_replicate;
1964 
1965     calculate_per_replicate_abundances(transcripts,
1966                                        alignments_per_read_group,
1967                                        ab_group_per_replicate);
1968 
1969     std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> > const_ab_group_per_replicate;
1970 
1971     for (std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<AbundanceGroup> >::iterator itr = ab_group_per_replicate.begin();
1972          itr != ab_group_per_replicate.end(); ++itr)
1973     {
1974         const_ab_group_per_replicate[itr->first] = itr->second;
1975     }
1976 
1977     aggregate_replicate_abundances(const_ab_group_per_replicate);
1978 
1979     if (calculate_variance)
1980     {
1981         calculate_abundance_group_variance(transcripts, const_ab_group_per_replicate);
1982     }
1983 }
1984 
update_multi_reads(const vector<MateHit> & alignments,vector<boost::shared_ptr<Abundance>> transcripts)1985 void AbundanceGroup::update_multi_reads(const vector<MateHit>& alignments, vector<boost::shared_ptr<Abundance> > transcripts)
1986 {
1987 	size_t M = alignments.size();
1988 	size_t N = transcripts.size();
1989 
1990 	if (transcripts.empty())
1991 		return;
1992 
1993     for (size_t i = 0; i < M; ++i)
1994 	{
1995 		if (alignments[i].is_multi())
1996 		{
1997 			double expr = 0.0;
1998 			for (size_t j = 0; j < N; ++j)
1999 			{
2000 				expr += _abundances[j]->cond_probs()->at(i) * _abundances[j]->FPKM() * _abundances[j]->effective_length();
2001 			}
2002 			alignments[i].read_group_props()->multi_read_table()->add_expr(alignments[i], expr);
2003 		}
2004 	}
2005 }
2006 
2007 
solve_beta(long double A,long double B,long double C)2008 long double solve_beta(long double A, long double B, long double C)
2009 {
2010     long double a = -C/B;
2011     long double b = (A + 4*A*C/(B*B) - (4*C/B));
2012     long double c = -A + B - 5*A*A*C/(B*B*B) + 10*A*C/(B*B) - 5*C/B;
2013     long double d = 2*A*A*A*C/(B*B*B*B) - 6*A*A*C/(B*B*B) + 6*A*C/(B*B) - 2*C/B;
2014     complex<long double> q((3*a*c - b*b)/(a*a*9.0));
2015     complex<long double> r((9.0*a*c*b - 27.0*a*a*d - 2.0*b*b*b)/(a*a*a*54.0));
2016     complex<long double> s1 = std::pow((r + std::sqrt(q*q*q + r*r)),complex<long double>(1/3.0));
2017     complex<long double> s2 = std::pow((r - std::sqrt(q*q*q + r*r)),complex<long double>(1/3.0));
2018     complex<long double> R1 = s1 + s2 - complex<long double>(b/(a*3.0));
2019     complex<long double> R2 = -(s1+s2)/complex<long double>(2.0) - complex<long double>(b/(a*3.0)) + (s1-s2) * complex<long double>(0, sqrtl(3.0)/2.0);
2020     complex<long double> R3 = -(s1+s2)/complex<long double>(2.0) - complex<long double>(b/(a*3.0)) - (s1-s2) * complex<long double>(0, sqrtl(3.0)/2.0);
2021 
2022     vector<long double> roots;
2023     if (R1.imag() == 0)
2024         roots.push_back(R1.real());
2025     if (R2.imag() == 0)
2026         roots.push_back(R2.real());
2027     if (R3.imag() == 0)
2028         roots.push_back(R3.real());
2029     sort(roots.begin(), roots.end());
2030 
2031     if (roots.empty())
2032         return 0;
2033 
2034     long double root = roots.back();
2035     return root;
2036 }
2037 
2038 
2039 // This function takes the point estimate of the number of fragments from
2040 // a transcript, the iterated expection count matrix, and the locus level
2041 // cross replicate variance, and calculates the transcript-level cross-replicate
2042 // count variance
estimate_count_variance(long double & variance,double gamma_t,double psi_t_count_var,double X_g,double V_X_g_t,double l_t)2043 bool estimate_count_variance(long double& variance,
2044                              double gamma_t,
2045                              double psi_t_count_var,
2046                              double X_g,
2047                              double V_X_g_t,
2048                              double l_t)
2049 {
2050     if (l_t == 0)
2051     {
2052         return true;
2053     }
2054 
2055     long double A = X_g * gamma_t;
2056 
2057     long double B = V_X_g_t;
2058 
2059     long double C = psi_t_count_var;
2060 
2061     variance = 0.0;
2062     bool numeric_ok = true;
2063 
2064     long double dispersion = V_X_g_t - (X_g * gamma_t);
2065 
2066     if (psi_t_count_var < 0)
2067     {
2068         //fprintf (stderr, "Warning: psi_t is negative! (psi_t = %lf)\n", psi_t);
2069         psi_t_count_var = 0;
2070     }
2071     assert (psi_t_count_var >= 0);
2072 
2073     // we multiply A with the constants here to make things work out
2074     // at the end of the routine when we multiply by the square of those
2075     // constants
2076     long double poisson_variance = A + psi_t_count_var;
2077     long double alpha = 0.0;
2078     long double beta = 0.0;
2079     long double bnb_mean = 0.0;
2080     long double r = 0.0;
2081 
2082     if (dispersion < -1 || abs(dispersion) < 1)
2083     {
2084         // default to poisson dispersion
2085         variance = poisson_variance;
2086     }
2087     else // there's some detectable overdispersion here, use mixture of negative binomials
2088     {
2089         if (psi_t_count_var < 1)
2090         {
2091             // default to regular negative binomial case.
2092             variance = V_X_g_t;
2093         }
2094         else
2095         {
2096             r = ceil((A * A) / (B - A));
2097 
2098             if (r < 0)
2099             {
2100                 numeric_ok = false;
2101             }
2102 
2103             // exact cubic
2104             beta = solve_beta(A,B,C);
2105             alpha = 1.0 - (A/(A-B)) * beta;
2106             //fprintf(stdout, "%Lg\t%Lg\t%Lg\n",A,alpha,beta);
2107             //if (beta <= 2 || alpha <= 1)
2108             if (alpha <= 2)
2109             {
2110                 //printf ("Warning: beta for is %Lg\n", beta);
2111                 numeric_ok = false;
2112                 variance = V_X_g_t;
2113             }
2114             else
2115             {
2116                 bnb_mean = r * beta / (alpha - 1.0);
2117                 variance = r * (alpha + r - 1.0) * beta * (alpha + beta - 1);
2118                 variance /= (alpha - 2.0) * (alpha - 1.0) * (alpha - 1.0);
2119             }
2120             if (variance < 0)
2121             {
2122                 numeric_ok = false;
2123                 variance = V_X_g_t;
2124             }
2125 
2126             if (variance == 0 && A != 0)
2127             {
2128                 variance = poisson_variance;
2129             }
2130 
2131             assert (!numeric_ok || variance >= poisson_variance);
2132             assert (!numeric_ok || variance >= V_X_g_t);
2133 
2134             if (variance < poisson_variance)
2135                 variance = poisson_variance;
2136 
2137             if (variance < V_X_g_t)
2138                 variance = V_X_g_t;
2139 
2140             //assert (abs(FPKM - mean) < 1e-3);
2141         }
2142     }
2143 
2144     if (variance < 0)
2145         variance = 0;
2146 
2147     variance = ceil(variance);
2148 
2149     assert (!numeric_ok || (!isinf(variance) && !isnan(variance)));
2150     assert (!numeric_ok || variance != 0 || A == 0);
2151     return numeric_ok;
2152 }
2153 
2154 // This version of the function draws directly from the iterated expectation count covariance matrix
2155 // and treats the betas as normally distributed (which is a good approximation), allowing good capture
2156 // of their covariance without resorting to lots of expensive fragment level sampling.
simulate_count_covariance(const vector<double> & num_fragments,const vector<double> & frag_variances,const ublas::matrix<double> & iterated_exp_count_covariance,const vector<boost::shared_ptr<Abundance>> & transcripts,ublas::matrix<double> & count_covariance,vector<Eigen::VectorXd> & assigned_count_samples,vector<ublas::vector<double>> * gamma_samples=NULL)2157 bool simulate_count_covariance(const vector<double>& num_fragments,
2158                                const vector<double>& frag_variances,
2159                                const ublas::matrix<double>& iterated_exp_count_covariance,
2160                                const vector<boost::shared_ptr<Abundance> >& transcripts,
2161                                ublas::matrix<double>& count_covariance,
2162                                vector<Eigen::VectorXd>& assigned_count_samples,
2163                                vector<ublas::vector<double> >* gamma_samples = NULL)
2164 {
2165     count_covariance = ublas::zero_matrix<double>(transcripts.size(), transcripts.size());
2166 
2167     if (frag_variances.size() <= 1)
2168     {
2169         count_covariance(0,0) = frag_variances[0];
2170         //return;
2171     }
2172 
2173     double total_frag_counts = accumulate(num_fragments.begin(), num_fragments.end(), 0.0);
2174     double total_var = accumulate(frag_variances.begin(), frag_variances.end(), 0.0);
2175 
2176     if (total_frag_counts == 0)
2177     {
2178         count_covariance = ublas::zero_matrix<double>(transcripts.size(), transcripts.size());
2179 
2180         size_t num_frags = num_frag_assignments;
2181         if (gamma_samples && gamma_samples->empty() == false)
2182             num_frags = gamma_samples->size();
2183 
2184         assigned_count_samples = vector<Eigen::VectorXd> (num_frag_count_draws * num_frags, Eigen::VectorXd::Zero(transcripts.size()));
2185         return true;
2186     }
2187 
2188     vector<ublas::vector<double> > assigned_gamma_samples;
2189 
2190     boost::mt19937 rng(random_seed);
2191 
2192     if (gamma_samples == NULL || gamma_samples->empty())
2193     {
2194         vector<Eigen::VectorXd > generated_counts (num_frag_count_draws, Eigen::VectorXd::Zero(transcripts.size()));
2195         ublas::vector<double> mle_frag_counts = ublas::zero_vector<double>(transcripts.size());
2196 
2197         for (size_t j = 0; j < transcripts.size(); ++j)
2198         {
2199             mle_frag_counts(j) = num_fragments[j];
2200         }
2201 
2202         //    cerr << "********************" << endl;
2203         //    cerr << "initial MLE counts: " << mle_frag_counts << endl;
2204 
2205         ublas::matrix<double> mle_count_covar = iterated_exp_count_covariance;
2206 
2207         ublas::matrix<double> epsilon = ublas::zero_matrix<double>(transcripts.size(),transcripts.size());
2208         for (size_t i = 0; i < transcripts.size(); ++i)
2209         {
2210             epsilon(i,i) = 1e-6;
2211         }
2212 
2213         mle_count_covar += epsilon; // modify matrix to avoid problems during inverse
2214 
2215         double ret = cholesky_factorize(mle_count_covar);
2216         if (ret != 0)
2217         {
2218             fprintf(stderr, "Warning: Iterated expectation count covariance matrix cannot be cholesky factorized!\n");
2219             //fprintf(stderr, "Warning: FPKM covariance is not positive definite (ret = %lg)!\n", ret);
2220             //        for (size_t j = 0; j < _abundances.size(); ++j)
2221             //        {
2222             //            _abundances[j]->status(NUMERIC_FAIL);
2223             //        }
2224             return false;
2225         }
2226 
2227         multinormal_generator<double> generator(mle_frag_counts, mle_count_covar);
2228 
2229         for (size_t assign_idx = 0; assign_idx < num_frag_assignments; ++assign_idx)
2230         {
2231             boost::numeric::ublas::vector<double> random_count_assign;
2232             double total_sample_counts = 0;
2233             do {
2234                 random_count_assign = generator.next_rand();
2235                 //cerr << random_count_assign << endl;
2236 
2237                 for (size_t r_idx = 0; r_idx < random_count_assign.size(); ++r_idx)
2238                 {
2239                     if (random_count_assign(r_idx) < 0)
2240                         random_count_assign(r_idx) = 0;
2241                 }
2242 
2243                 total_sample_counts = accumulate(random_count_assign.begin(), random_count_assign.end(), 0.0);
2244                 if (total_sample_counts > 0)
2245                     random_count_assign = total_frag_counts * (random_count_assign / total_sample_counts);
2246                 else
2247                     random_count_assign = boost::numeric::ublas::zero_vector<double>(transcripts.size());
2248             } while(total_sample_counts <= 0);
2249 
2250             assigned_gamma_samples.push_back(random_count_assign);
2251         }
2252     }
2253     else
2254     {
2255         for (size_t assign_idx = 0; assign_idx < gamma_samples->size(); ++assign_idx)
2256         {
2257             boost::numeric::ublas::vector<double> random_count_assign = total_frag_counts * (*gamma_samples)[assign_idx];
2258             assigned_gamma_samples.push_back(random_count_assign);
2259         }
2260     }
2261 
2262     assigned_count_samples = vector<Eigen::VectorXd> (num_frag_count_draws * assigned_gamma_samples.size(), Eigen::VectorXd::Zero(transcripts.size()));
2263 
2264     Eigen::VectorXd expected_generated_counts = Eigen::VectorXd::Zero(transcripts.size());
2265 
2266 
2267     boost::uniform_01<> uniform_dist;
2268     boost::mt19937 null_rng(random_seed);
2269     boost::variate_generator<boost::mt19937&, boost::uniform_01<> > uniform_gen(null_rng, uniform_dist);
2270 
2271 
2272     for (size_t assign_idx = 0; assign_idx < assigned_gamma_samples.size(); ++assign_idx)
2273     {
2274         boost::numeric::ublas::vector<double>& random_count_assign = assigned_gamma_samples[assign_idx];
2275 
2276         //cerr << random_count_assign << endl;
2277         for (size_t gen_idx = 0; gen_idx < num_frag_count_draws; ++gen_idx)
2278         {
2279             Eigen::VectorXd generated_and_assigned_counts = Eigen::VectorXd::Zero(transcripts.size());
2280 
2281             for (size_t j = 0; j < transcripts.size(); ++j)
2282             {
2283                 double r =  random_count_assign(j);
2284 
2285                 //r = r < 1 ? 1 : r;
2286 
2287                 if (r > 0)
2288                 {
2289                     //double fit_var = _abundances[j]->mass_variance();
2290 
2291                     double fit_var = total_var * (random_count_assign(j) / total_frag_counts);
2292                     double frags = random_count_assign(j);
2293 
2294                     if (fit_var - frags > 1e-1)
2295                     {
2296                         r *= r;
2297                         double over_disp_scale = fit_var - frags;
2298                         r /= over_disp_scale;
2299 
2300                         double after_decimal = r - (long)r;
2301                         //fprintf( stderr, "after decimal = %lg\n", after_decimal);
2302                         if (uniform_gen() < after_decimal)
2303                             r = floor(r);
2304                         else
2305                             r = ceil(r);
2306 
2307                         if (r == 0)
2308                         {
2309                             generated_and_assigned_counts(j) = 0;
2310                             continue;
2311                         }
2312 
2313                         //double p = _abundances[j]->num_fragments() / fit_var;
2314                         double p = r / (r + frags);
2315 
2316                         negative_binomial_distribution<int, double> nb_j(r, p);
2317 
2318                         generated_and_assigned_counts(j) = nb_j(rng);
2319 
2320                     }
2321                     else
2322                     {
2323                         double after_decimal = r - (long)r;
2324                         //fprintf( stderr, "after decimal = %lg\n", after_decimal);
2325                         if (uniform_gen() < after_decimal)
2326                             r = floor(r);
2327                         else
2328                             r = ceil(r);
2329 
2330                         if (r == 0)
2331                         {
2332                             generated_and_assigned_counts(j) = 0;
2333                             continue;
2334                         }
2335 
2336                         boost::random::poisson_distribution<int, double> nb_j(r);
2337 
2338                         generated_and_assigned_counts(j) = nb_j(rng);
2339 
2340                     }
2341                 }
2342                 else
2343                 {
2344                     generated_and_assigned_counts(j) = 0;
2345                 }
2346             }
2347             //cerr << "     assigned count sample: " << generated_and_assigned_counts.transpose() << endl;
2348             assigned_count_samples[assign_idx*num_frag_count_draws + gen_idx] = generated_and_assigned_counts;
2349         }
2350     }
2351 
2352     Eigen::VectorXd expected_counts = Eigen::VectorXd::Zero(transcripts.size());
2353     Eigen::VectorXd expected_relative_abundances = Eigen::VectorXd::Zero(transcripts.size());
2354 
2355     for (size_t i = 0; i < assigned_count_samples.size(); ++i)
2356     {
2357         for (int j = 0; j < assigned_count_samples[i].size(); ++j)
2358         {
2359             assert (!isnan(assigned_count_samples[i](j)) && !isinf(assigned_count_samples[i](j)));
2360         }
2361 
2362 
2363         expected_counts += assigned_count_samples[i];
2364         //
2365         //expected_relative_abundances += relative_abundances[i];
2366     }
2367     if (assigned_count_samples.size() > 0)
2368     {
2369         expected_counts /= assigned_count_samples.size();
2370         //expected_generated_counts /= assigned_counts.size();
2371         //expected_relative_abundances /= assigned_counts.size();
2372     }
2373 
2374     //    if (num_frag_assignments > 0)
2375     //    {
2376     //        expected_generated_counts /= num_frag_assignments;
2377     //        //expected_relative_abundances /= assigned_counts.size();
2378     //    }
2379 
2380 
2381     //    cerr << "======" << endl;
2382     //    cerr << "updated expected counts #1: " << endl;
2383     //    std::cerr << expected_counts << std::endl;
2384     //    cerr << "updated expected generated counts #1: " << endl;
2385     //    std::cerr << expected_generated_counts << std::endl;
2386     //    cerr << "======" << endl;
2387 
2388     for (size_t i = 0; i < transcripts.size(); ++i)
2389     {
2390         for (size_t j = 0; j < transcripts.size(); ++j)
2391         {
2392             for (size_t k = 0 ; k < assigned_count_samples.size(); ++k)
2393             {
2394                 double c = (assigned_count_samples[k](i) - expected_counts(i)) * (assigned_count_samples[k](j) - expected_counts(j));
2395                 count_covariance(i,j) += c;
2396 
2397                 assert (!isinf(count_covariance(i,j)) && !isnan(count_covariance(i,j)));
2398                 //double r = (relative_abundances[k](i) - expected_relative_abundances(i)) * (relative_abundances[k](j) - expected_relative_abundances(j));
2399                 //_kappa_covariance(i,j) +=
2400             }
2401         }
2402     }
2403 
2404     if (assigned_count_samples.empty() == false)
2405         count_covariance /= assigned_count_samples.size();
2406 
2407     for (size_t i = 0; i < transcripts.size(); ++i)
2408     {
2409         // Make sure we aren't below the fit for the single isoform case
2410         if (count_covariance(i,i) < ceil(frag_variances[i]))
2411         {
2412             //fprintf(stderr, "Counts for %d (var = %lg) are underdispersed, reverting to fitted variance model (%lg)\n", i, _count_covariance(i,i), ceil(_abundances[i]->mass_variance()));
2413             count_covariance(i,i) = ceil(frag_variances[i]);
2414             assert (!isinf(count_covariance(i,i)) && !isnan(count_covariance(i,i)));
2415         }
2416 
2417         // Check that we aren't below what the Poisson model says we ought to be at
2418         if (count_covariance(i,i) < ceil(num_fragments[i] + iterated_exp_count_covariance(i,i)))
2419         {
2420             //fprintf(stderr, "Counts for %d (var = %lg) are underdispersed, reverting to additive variance model (%lg)\n", i, _count_covariance(i,i),  ceil(_abundances[i]->num_fragments() + _iterated_exp_count_covariance(i,i)));
2421             count_covariance(i,i) = ceil(num_fragments[i] + iterated_exp_count_covariance(i,i));
2422             assert (!isinf(count_covariance(i,i)) && !isnan(count_covariance(i,i)));
2423         }
2424         for (size_t j = 0; j < transcripts.size(); ++j)
2425         {
2426             assert (!isinf(count_covariance(i,j)) && !isnan(count_covariance(i,j)));
2427         }
2428     }
2429 
2430     return true;
2431 }
2432 
2433 
generate_fpkm_samples()2434 void AbundanceGroup::generate_fpkm_samples()
2435 {
2436     double external_scale_factor = -1.0;
2437 
2438     double M = 0;
2439 
2440     for (set<boost::shared_ptr<ReadGroupProperties const> >::iterator itr = _read_group_props.begin();
2441          itr != _read_group_props.end();
2442          ++itr)
2443     {
2444         boost::shared_ptr<ReadGroupProperties const> rg_props = *itr;
2445         M += rg_props->normalized_map_mass();
2446 
2447         if (external_scale_factor < 0)
2448         {
2449             external_scale_factor = rg_props->external_scale_factor();
2450         }
2451         else
2452         {
2453             assert (external_scale_factor == rg_props->external_scale_factor());
2454         }
2455     }
2456 
2457     M /= _read_group_props.size();
2458 
2459     // set up individual vectors of FPKM samples for each abundance object in this group.
2460     vector<vector<double> > fpkm_sample_vectors(_abundances.size());
2461     vector<double> group_sum_fpkm_samples;
2462 
2463     vector<double> fpkm_means(_abundances.size(), 0);
2464 
2465     for(size_t i = 0; i < _assigned_count_samples.size(); ++i)
2466     {
2467         const Eigen::VectorXd sample = _assigned_count_samples[i];
2468         double total_fpkm = 0;
2469 
2470         for (size_t j = 0; j < sample.size(); ++j)
2471         {
2472             double fpkm_sample =  sample[j] / M;
2473 
2474             if (_abundances[j]->effective_length() > 0)
2475             {
2476                 fpkm_sample *= 1000000000;
2477                 fpkm_sample /= _abundances[j]->effective_length();
2478                 fpkm_sample /= external_scale_factor;
2479                 //double standard_fpkm = _abundances[j]->FPKM();
2480                 //fprintf(stderr, "count = %lg, fpkm = %lg, standard fpkm = %lg\n", sample[j], fpkm_sample, standard_fpkm);
2481             }
2482             else
2483             {
2484                 fpkm_sample = 0;
2485             }
2486 
2487             assert (isnan(fpkm_sample) == false);
2488 
2489             fpkm_sample_vectors[j].push_back(fpkm_sample);
2490             fpkm_means[j] += fpkm_sample;
2491             total_fpkm += fpkm_sample;
2492         }
2493 
2494         if (effective_length() > 0)
2495         {
2496             group_sum_fpkm_samples.push_back(total_fpkm);
2497         }
2498         else
2499         {
2500             group_sum_fpkm_samples.push_back(0);
2501         }
2502     }
2503 
2504     for (size_t i = 0; i < _abundances.size(); ++i)
2505     {
2506         fpkm_means[i] /= _assigned_count_samples.size();
2507         _abundances[i]->fpkm_samples(fpkm_sample_vectors[i]);
2508         //fprintf(stderr, "standard fpkm = %lg, sample mean = %lg\n", _abundances[i]->FPKM(), fpkm_means[i]);
2509     }
2510 
2511     vector<Eigen::VectorXd> mem_fpkm_samples;
2512     for (size_t i = 0; i < fpkm_sample_vectors.size(); ++i)
2513     {
2514         Eigen::VectorXd sample(fpkm_sample_vectors[i].size());
2515         mem_fpkm_samples.push_back(sample);
2516         for (size_t j = 0; j < fpkm_sample_vectors[i].size(); ++j)
2517         {
2518             mem_fpkm_samples[i](j) = fpkm_sample_vectors[i][j];
2519         }
2520     }
2521 
2522     member_fpkm_samples(mem_fpkm_samples);
2523 
2524     fpkm_samples(group_sum_fpkm_samples);
2525 
2526     assert (group_sum_fpkm_samples.empty() == false);
2527 }
2528 
calculate_FPKM_covariance()2529 void AbundanceGroup::calculate_FPKM_covariance()
2530 {
2531 	if (effective_length() == 0)
2532 	{
2533 		_fpkm_covariance = ublas::zero_matrix<double>(_abundances.size(), _abundances.size());
2534 		return;
2535 	}
2536 
2537     //estimate_count_covariance();
2538 
2539     long double total_var = 0.0;
2540 
2541     double abundance_weighted_length = 0.0;
2542     double total_abundance = 0.0;
2543 
2544     double external_scale_factor = -1.0;
2545 
2546     double M = 0;
2547 
2548     for (map<boost::shared_ptr<ReadGroupProperties const>, double>::iterator itr = _count_per_replicate.begin();
2549          itr != _count_per_replicate.end();
2550          ++itr)
2551     {
2552         boost::shared_ptr<ReadGroupProperties const> rg_props = itr->first;
2553         M += rg_props->normalized_map_mass();
2554 
2555         if (external_scale_factor < 0)
2556         {
2557             external_scale_factor = rg_props->external_scale_factor();
2558         }
2559         else
2560         {
2561             assert (external_scale_factor == rg_props->external_scale_factor());
2562         }
2563     }
2564 
2565     M /= _count_per_replicate.size();
2566 
2567     for (size_t j = 0; j < _abundances.size(); ++j)
2568     {
2569         abundance_weighted_length += _abundances[j]->effective_length() * _abundances[j]->FPKM();
2570         total_abundance += _abundances[j]->FPKM();
2571 
2572         for (size_t i = 0; i < _abundances.size(); ++i)
2573         {
2574             _fpkm_covariance(i,j) = _count_covariance(i,j);
2575 
2576             // FPKMs need to be on the external scale, so we can compare them
2577             // between conditions.  Counts are internally scaled up until here.
2578             _fpkm_covariance(i,j) *= 1.0 / (external_scale_factor * external_scale_factor);
2579             assert (!isinf(_count_covariance(i,j)) && !isnan(_fpkm_covariance(i,j)));
2580 
2581             long double length_i = _abundances[i]->effective_length();
2582             long double length_j = _abundances[j]->effective_length();
2583             assert (!isinf(length_i) && !isnan(length_i));
2584             assert (!isinf(length_j) && !isnan(length_j));
2585             if (length_i > 0 && length_j > 0 & M > 0)
2586             {
2587                 _fpkm_covariance(i,j) *=
2588                     ((1000000000.0 / (length_j *M)))*((1000000000.0 / (length_i *M)));
2589                 assert (!isinf(_fpkm_covariance(i,j)) && !isnan(_fpkm_covariance(i,j)));
2590             }
2591             else
2592             {
2593                 _fpkm_covariance(i,j) = 0.0;
2594             }
2595 
2596             if (i == j)
2597             {
2598                 double fpkm = _abundances[i]->FPKM();
2599                 double fpkm_var = _fpkm_covariance(i,j);
2600 //                if (fpkm > 0 && fpkm_var == 0 )
2601 //                {
2602 //                    cerr << _count_covariance << endl;
2603 //                }
2604                 assert (fpkm == 0 || fpkm_var > 0 || _abundances[i]->status() != NUMERIC_OK);
2605                 assert (!isinf(fpkm_var) && !isnan(fpkm_var));
2606                 _abundances[i]->FPKM_variance(fpkm_var);
2607                 _abundances[i]->num_fragment_var(_count_covariance(i,j));
2608 
2609             }
2610 
2611             total_var += _fpkm_covariance(i,j);
2612 
2613             assert (!isinf(_fpkm_covariance(i,j)) && !isnan(_fpkm_covariance(i,j)));
2614         }
2615     }
2616 
2617     _FPKM_variance = total_var;
2618 
2619     if (final_est_run && library_type != "transfrags")
2620     {
2621         ublas::matrix<double> test = _fpkm_covariance;
2622         double ret = cholesky_factorize(test);
2623         if (ret != 0 || (_FPKM_variance < 0 && status() == NUMERIC_OK))
2624         {
2625             //fprintf(stderr, "Warning: FPKM covariance is not positive definite (ret = %lg)!\n", ret);
2626             for (size_t j = 0; j < _abundances.size(); ++j)
2627             {
2628                 _abundances[j]->status(NUMERIC_FAIL);
2629             }
2630         }
2631 
2632         assert (FPKM() == 0 || _FPKM_variance > 0 || status() != NUMERIC_OK);
2633     }
2634 
2635 //    cerr << "FPKM covariance: " << endl;
2636 //    for (unsigned i = 0; i < _fpkm_covariance.size1 (); ++ i)
2637 //    {
2638 //        ublas::matrix_row<ublas::matrix<double> > mr (_fpkm_covariance, i);
2639 //        cerr << i << " : " << _abundances[i]->FPKM() << " : ";
2640 //        std::cerr << i << " : " << mr << std::endl;
2641 //    }
2642 
2643     assert (!isinf(_FPKM_variance) && !isnan(_FPKM_variance));
2644 }
2645 
calculate_conf_intervals()2646 void AbundanceGroup::calculate_conf_intervals()
2647 {
2648     // We only really ever call this function for primary abundance groups
2649     // (i.e. the transcript groups and read bundles with which we calculate
2650     // transcript MLE expression levels.  Genes, TSS groups, etc get broken
2651     // off of primary bundles, so we should not call this function on those
2652     // secondary groups.  The group splitting code needs to manage the task
2653     // of splitting up all the variout covariance matrices we're calculating
2654     // here.
2655 	if (status() == NUMERIC_OK)
2656 	{
2657 		// This will compute the transcript level FPKM confidence intervals
2658 		for (size_t j = 0; j < _abundances.size(); ++j)
2659 		{
2660             long double fpkm_var = _abundances[j]->FPKM_variance();
2661             double FPKM_hi = 0.0;
2662             double FPKM_lo = 0.0;
2663             if (_abundances[j]->status() != NUMERIC_FAIL)
2664             {
2665                 FPKM_hi = _abundances[j]->FPKM() + 2 * sqrt(fpkm_var);
2666                 FPKM_lo = max(0.0, (double)(_abundances[j]->FPKM() - 2 * sqrt(fpkm_var)));
2667                 if (!(FPKM_lo <= _abundances[j]->FPKM() && _abundances[j]->FPKM() <= FPKM_hi))
2668                 {
2669                     //fprintf(stderr, "Error: confidence intervals are illegal! var = %Lg, fpkm = %lg, lo = %lg, hi %lg, status = %d\n", fpkm_var, _abundances[j]->FPKM(), FPKM_lo, FPKM_hi, _abundances[j]->status());
2670                 }
2671                 assert (FPKM_lo <= _abundances[j]->FPKM() && _abundances[j]->FPKM() <= FPKM_hi);
2672                 ConfidenceInterval conf(FPKM_lo, FPKM_hi);
2673                 _abundances[j]->FPKM_conf(conf);
2674                 //_abundances[j]->FPKM_variance(fpkm_var);
2675             }
2676             else
2677             {
2678                 // we shouldn't be able to get here
2679                 assert(false);
2680                 // TODO: nothing to do here?
2681             }
2682 		}
2683 
2684         // Now build a confidence interval for the whole abundance group
2685 		double group_fpkm = FPKM();
2686 		if (group_fpkm > 0.0)
2687 		{
2688 			double FPKM_hi = FPKM() + 2 * sqrt(FPKM_variance());
2689 			double FPKM_lo = max(0.0, FPKM() - 2 * sqrt(FPKM_variance()));
2690 			ConfidenceInterval conf(FPKM_lo, FPKM_hi);
2691 			FPKM_conf(conf);
2692 		}
2693 		else
2694 		{
2695 			_FPKM_variance = 0.0;
2696 			ConfidenceInterval conf(0.0, 0.0);
2697 			FPKM_conf(conf);
2698 		}
2699 	}
2700 	else
2701 	{
2702 		double sum_transfrag_FPKM_hi = 0;
2703         double max_fpkm = 0.0;
2704         //double min_fpkm = 1e100;
2705 
2706         double avg_X_g = 0.0;
2707         double avg_mass_fraction = 0.0;
2708 
2709         int N = _abundances.size();
2710 
2711         vector<double> avg_mass_variances(_abundances.size(), 0.0);
2712 
2713         for (map<boost::shared_ptr<ReadGroupProperties const>, double>::iterator itr = _count_per_replicate.begin();
2714              itr != _count_per_replicate.end();
2715              ++itr)
2716         {
2717             boost::shared_ptr<ReadGroupProperties const> rg_props = itr->first;
2718             double scaled_mass = itr->second;
2719             double scaled_total_mass = rg_props->normalized_map_mass();
2720             avg_X_g += scaled_mass;
2721             boost::shared_ptr<MassDispersionModel const> disperser = rg_props->mass_dispersion_model();
2722             for (size_t j = 0; j < N; ++j)
2723             {
2724                 double scaled_variance;
2725                 //scaled_variance = disperser->scale_mass_variance(scaled_mass * _abundances[j]->gamma());
2726                 scaled_variance = _abundances[j]->gamma() * disperser->scale_mass_variance(scaled_mass);
2727                 avg_mass_variances[j] += scaled_variance;
2728             }
2729             assert (disperser->scale_mass_variance(scaled_mass) != 0 || scaled_mass == 0);
2730 
2731             //assert (scaled_total_mass != 0.0);
2732             avg_mass_fraction += (scaled_mass / scaled_total_mass);
2733         }
2734 
2735         double num_replicates = _count_per_replicate.size();
2736 
2737         if (num_replicates)
2738         {
2739             avg_X_g /= num_replicates;
2740             avg_mass_fraction /= num_replicates;
2741             for (size_t j = 0; j < N; ++j)
2742             {
2743                 avg_mass_variances[j] /= num_replicates;
2744             }
2745         }
2746 
2747 		BOOST_FOREACH(boost::shared_ptr<Abundance> pA, _abundances)
2748 		{
2749 			double FPKM_hi;
2750 			double FPKM_lo;
2751 			if (pA->effective_length() > 0 && avg_mass_fraction > 0)
2752 			{
2753                 double norm_frag_density = 1000000000;
2754                 norm_frag_density /= pA->effective_length();
2755 
2756                 norm_frag_density *= avg_mass_fraction;
2757                 double fpkm_high = norm_frag_density;
2758 
2759                 double total_mass = (num_fragments() / avg_mass_fraction);
2760                 double fpkm_constant = 1000000000 / pA->effective_length() / total_mass;
2761                 double var_fpkm = mass_variance() * (fpkm_constant * fpkm_constant);
2762 
2763 				FPKM_hi = fpkm_high + 2 * sqrt(var_fpkm);
2764 				FPKM_lo = 0.0;
2765 				ConfidenceInterval conf(FPKM_lo, FPKM_hi);
2766 				//assert (FPKM_lo <= pA->FPKM() && pA->FPKM() <= FPKM_hi);
2767 				pA->FPKM_conf(conf);
2768                 //pA->FPKM_variance(var_fpkm);
2769 				max_fpkm = max(sum_transfrag_FPKM_hi, FPKM_hi);
2770 			}
2771 			else
2772 			{
2773 				FPKM_hi = 0.0;
2774 				FPKM_lo = 0.0;
2775 				ConfidenceInterval conf(0.0, 0.0);
2776 				pA->FPKM_conf(conf);
2777                 //pA->FPKM_variance(0.0);
2778 			}
2779 
2780 		}
2781 		// In the case of a numeric failure, the groups error bars need to be
2782 		// set such that
2783 		FPKM_conf(ConfidenceInterval(0.0, max_fpkm + 2 * sqrt(FPKM_variance())));
2784 	}
2785 }
2786 
2787 
2788 /*
2789 void AbundanceGroup::calculate_conf_intervals()
2790 {
2791     for (size_t j = 0; j < _abundances.size(); ++j)
2792     {
2793         double FPKM_hi = 0.0;
2794         double FPKM_lo = numeric_limits<double>::max();
2795         const vector<double> ab_j_samples = _abundances[j]->fpkm_samples();
2796         vector<pair<double, double> > fpkm_samples;
2797         double target_fpkm = _abundances[j]->FPKM();
2798         for (size_t i = 0; i < ab_j_samples.size(); ++i)
2799             fpkm_samples.push_back(make_pair(abs(ab_j_samples[i] - _abundances[j]->FPKM()), ab_j_samples[i]));
2800 
2801         sort(fpkm_samples.begin(), fpkm_samples.end());
2802 
2803         for (size_t i = 0; i < 0.95*fpkm_samples.size(); ++i)
2804         {
2805             if (FPKM_lo > fpkm_samples[i].second)
2806                 FPKM_lo = fpkm_samples[i].second;
2807             if (FPKM_hi < fpkm_samples[i].second)
2808                 FPKM_hi = fpkm_samples[i].second;
2809         }
2810 
2811         if (fpkm_samples.size() > 0 && FPKM_lo > target_fpkm)
2812         {
2813             fprintf(stderr, "Warning: transcript confidence interval lower bound is > FPKM\n");
2814         }
2815 
2816         if (fpkm_samples.size() > 0 && FPKM_hi < target_fpkm)
2817         {
2818             fprintf(stderr, "Warning: transcript confidence interval upper bound is < FPKM\n");
2819         }
2820 
2821         ConfidenceInterval conf(FPKM_lo, FPKM_hi);
2822         _abundances[j]->FPKM_conf(conf);
2823     }
2824 
2825     double FPKM_hi = 0.0;
2826     double FPKM_lo = numeric_limits<double>::max();
2827     const vector<double>& ab_j_samples = _fpkm_samples;
2828     vector<pair<double, double> > fpkm_samples;
2829     for (size_t i = 0; i < ab_j_samples.size(); ++i)
2830         fpkm_samples.push_back(make_pair(abs(ab_j_samples[i] - FPKM()), ab_j_samples[i]));
2831 
2832     sort(fpkm_samples.begin(), fpkm_samples.end());
2833 
2834     for (size_t i = 0; i < 0.95*fpkm_samples.size(); ++i)
2835     {
2836         if (FPKM_lo > fpkm_samples[i].second)
2837             FPKM_lo = fpkm_samples[i].second;
2838         if (FPKM_hi < fpkm_samples[i].second)
2839             FPKM_hi = fpkm_samples[i].second;
2840     }
2841 
2842     double target_fpkm = FPKM();
2843 
2844     if (fpkm_samples.size() > 0 && FPKM_lo > target_fpkm)
2845     {
2846         fprintf(stderr, "Warning: group confidence interval lower bound is > FPKM\n");
2847     }
2848 
2849     if (fpkm_samples.size() > 0 && FPKM_hi < target_fpkm)
2850     {
2851         fprintf(stderr, "Warning: group confidence interval upper bound is < FPKM\n");
2852     }
2853 
2854     ConfidenceInterval conf(FPKM_lo, FPKM_hi);
2855     FPKM_conf(conf);
2856 
2857     return;
2858 }
2859 
2860 */
compute_cond_probs_and_effective_lengths(const vector<MateHit> & alignments,vector<boost::shared_ptr<Abundance>> & transcripts,vector<boost::shared_ptr<Abundance>> & mapped_transcripts)2861 void compute_cond_probs_and_effective_lengths(const vector<MateHit>& alignments,
2862 															  vector<boost::shared_ptr<Abundance> >& transcripts,
2863 															  vector<boost::shared_ptr<Abundance> >& mapped_transcripts)
2864 {
2865 	int N = transcripts.size();
2866 	int M = alignments.size();
2867 
2868 	vector<vector<char> > compatibilities(N, vector<char>(M,0));
2869 	compute_compatibilities(transcripts, alignments, compatibilities);
2870 
2871 	for (int j = 0; j < N; ++j)
2872     {
2873 		boost::shared_ptr<Scaffold> transfrag = transcripts[j]->transfrag();
2874 		vector<double>& cond_probs = *(new vector<double>(M,0));
2875 
2876 		BiasCorrectionHelper bch(transfrag);
2877 
2878 		for(int i = 0 ; i < M; ++i)
2879 		{
2880 			if (compatibilities[j][i]==1)
2881             {
2882                 total_cond_prob_calls++;
2883 				cond_probs[i] = bch.get_cond_prob(alignments[i]);
2884             }
2885 		}
2886 
2887 		transcripts[j]->effective_length(bch.get_effective_length());
2888 		transcripts[j]->cond_probs(&cond_probs);
2889 
2890 		if (bch.is_mapped())
2891 			mapped_transcripts.push_back(transcripts[j]);
2892 	}
2893 }
2894 
2895 
trace(const ublas::matrix<double> & m)2896 double trace(const ublas::matrix<double>& m)
2897 {
2898 
2899     double t = 0.0;
2900     for (size_t i = 0.0; i < m.size1(); ++i)
2901     {
2902         t += m(i,i);
2903     }
2904 
2905     return t;
2906 }
2907 
2908 
2909 // FIXME: This function doesn't really need to copy the transcripts out of
2910 // the cluster.  Needs refactoring
calculate_gammas(const vector<MateHit> & nr_alignments,const vector<double> & log_conv_factors,const vector<boost::shared_ptr<Abundance>> & transcripts,const vector<boost::shared_ptr<Abundance>> & mapped_transcripts)2911 bool AbundanceGroup::calculate_gammas(const vector<MateHit>& nr_alignments,
2912                                       const vector<double>& log_conv_factors,
2913 									  const vector<boost::shared_ptr<Abundance> >& transcripts,
2914 									  const vector<boost::shared_ptr<Abundance> >& mapped_transcripts)
2915 {
2916 	if (mapped_transcripts.empty())
2917     {
2918 		//gammas = vector<double>(transfrags.size(), 0.0);
2919 		BOOST_FOREACH (boost::shared_ptr<Abundance> ab, _abundances)
2920 		{
2921 			ab->gamma(0);
2922 		}
2923 		_gamma_covariance = ublas::zero_matrix<double>(transcripts.size(),
2924                                                        transcripts.size());
2925         _count_covariance = ublas::zero_matrix<double>(transcripts.size(),
2926                                                        transcripts.size());
2927         _iterated_exp_count_covariance = ublas::zero_matrix<double>(transcripts.size(),
2928                                                        transcripts.size());
2929         _fpkm_covariance = ublas::zero_matrix<double>(transcripts.size(),
2930                                                       transcripts.size());
2931 
2932 		return true;
2933     }
2934 
2935 	vector<double> gammas;
2936 
2937 	verbose_msg( "Calculating intial MLE\n");
2938 
2939 	AbundanceStatus mle_success = gamma_mle(mapped_transcripts,
2940                                                   nr_alignments,
2941                                                   log_conv_factors,
2942                                                   gammas);
2943 
2944 	verbose_msg( "Tossing likely garbage isoforms\n");
2945 
2946 	for (size_t i = 0; i < gammas.size(); ++i)
2947 	{
2948 		if (isnan(gammas[i]))
2949 		{
2950 			verbose_msg("Warning: isoform abundance is NaN!\n");
2951 		}
2952 	}
2953 
2954     double locus_mass = 0.0;
2955 
2956     for (size_t i = 0; i < nr_alignments.size(); ++i)
2957     {
2958         const MateHit& alignment = nr_alignments[i];
2959         locus_mass += alignment.collapse_mass();
2960     }
2961 
2962 	vector<boost::shared_ptr<Abundance> > filtered_transcripts = mapped_transcripts;
2963 	vector<double> filtered_gammas = gammas;
2964 	filter_junk_isoforms(filtered_transcripts, filtered_gammas, mapped_transcripts, locus_mass);
2965 
2966 	if (filtered_transcripts.empty())
2967 	{
2968 		//gammas = vector<double>(transfrags.size(), 0.0);
2969 		BOOST_FOREACH (boost::shared_ptr<Abundance> ab, _abundances)
2970 		{
2971 			ab->gamma(0);
2972 		}
2973 		_gamma_covariance = ublas::zero_matrix<double>(transcripts.size(),
2974 													  transcripts.size());
2975         _count_covariance = ublas::zero_matrix<double>(transcripts.size(),
2976                                                        transcripts.size());
2977         _iterated_exp_count_covariance = ublas::zero_matrix<double>(transcripts.size(),
2978                                                                     transcripts.size());
2979         _fpkm_covariance = ublas::zero_matrix<double>(transcripts.size(),
2980                                                       transcripts.size());
2981 
2982 		return true;
2983 	}
2984 
2985     if (filtered_transcripts.size() != mapped_transcripts.size())
2986     {
2987         filtered_gammas.clear();
2988 
2989         verbose_msg( "Revising MLE\n");
2990 
2991         mle_success = gamma_mle(filtered_transcripts,
2992                                         nr_alignments,
2993                                         log_conv_factors,
2994                                         filtered_gammas);
2995     }
2996 
2997 	for (size_t i = 0; i < filtered_gammas.size(); ++i)
2998 	{
2999 		if (isnan(filtered_gammas[i]))
3000 		{
3001 			verbose_msg("Warning: isoform abundance is NaN!\n");
3002 		}
3003 	}
3004 
3005 	size_t N = transcripts.size();
3006 
3007     set<boost::shared_ptr<ReadGroupProperties const> > rg_props;
3008 	for (size_t i = 0; i < nr_alignments.size(); ++i)
3009 	{
3010         rg_props.insert(nr_alignments[i].read_group_props());
3011 	}
3012 
3013     AbundanceStatus map_success = NUMERIC_OK;
3014 	if (final_est_run) // Only on last estimation run.
3015 	{
3016         ublas::vector<double> gamma_mle(filtered_gammas.size());
3017         std::copy(filtered_gammas.begin(), filtered_gammas.end(), gamma_mle.begin());
3018     }
3019 
3020 	for (size_t i = 0; i < filtered_gammas.size(); ++i)
3021 	{
3022 		if (isnan(gammas[i]))
3023 		{
3024 			verbose_msg( "Warning: isoform abundance is NaN!\n");
3025 			map_success = NUMERIC_FAIL;
3026 		}
3027 	}
3028 
3029 	// Now we need to fill in zeros for the isoforms we filtered out of the
3030 	// MLE/MAP calculation
3031 	vector<double> updated_gammas = vector<double>(N, 0.0);
3032 
3033 
3034 	ublas::matrix<double> updated_gamma_cov;
3035 	updated_gamma_cov = ublas::zero_matrix<double>(N, N);
3036 
3037     ublas::matrix<double> updated_count_cov;
3038     updated_count_cov = ublas::zero_matrix<double>(N, N);
3039     ublas::matrix<double> updated_iterated_exp_count_cov;
3040     updated_iterated_exp_count_cov = ublas::zero_matrix<double>(N, N);
3041     ublas::matrix<double> updated_fpkm_cov;
3042     updated_fpkm_cov = ublas::zero_matrix<double>(N, N);
3043 
3044 	size_t cfs = 0;
3045 	boost::shared_ptr<Scaffold> curr_filtered_scaff = filtered_transcripts[cfs]->transfrag();
3046 	StructurallyEqualScaffolds se;
3047 	vector<size_t> scaff_present(N, N);
3048 
3049 	for (size_t i = 0; i < N; ++i)
3050 	{
3051 		boost::shared_ptr<Scaffold> scaff_i = transcripts[i]->transfrag();
3052 		if (cfs < filtered_transcripts.size())
3053 		{
3054 			curr_filtered_scaff = filtered_transcripts[cfs]->transfrag();
3055 			if (se(scaff_i, curr_filtered_scaff))
3056 			{
3057 				scaff_present[i] = cfs;
3058 				cfs++;
3059 			}
3060 		}
3061 	}
3062 
3063 	for (size_t i = 0; i < N; ++i)
3064 	{
3065 		if (scaff_present[i] != N)
3066 		{
3067 			// then scaffolds[i] has a non-zero abundance, we need to fill
3068 			// that in along with relevant cells from the covariance matrix
3069 			updated_gammas[i] = filtered_gammas[scaff_present[i]];
3070             //cerr << updated_gammas[i] << ",";
3071 
3072 			for (size_t j = 0; j < N; ++j)
3073 			{
3074 				if (scaff_present[j] != N)
3075 				{
3076 					updated_gamma_cov(i,j) = _gamma_covariance(scaff_present[i],
3077 															   scaff_present[j]);
3078 
3079                     updated_iterated_exp_count_cov(i,j) = _iterated_exp_count_covariance(scaff_present[i],
3080                                                                                          scaff_present[j]);
3081                     // Should still be empty but let's do these for consistency:
3082                     updated_count_cov(i,j) = _count_covariance(scaff_present[i],
3083                                                                scaff_present[j]);
3084                     updated_fpkm_cov(i,j) = _fpkm_covariance(scaff_present[i],
3085                                                              scaff_present[j]);
3086 					assert (!isinf(updated_gamma_cov(i,j)));
3087 					assert (!isnan(updated_gamma_cov(i,j)));
3088 				}
3089 			}
3090 		}
3091 	}
3092 
3093     //cerr << endl;
3094 
3095     AbundanceStatus numeric_status = NUMERIC_OK;
3096     if (mle_success == NUMERIC_LOW_DATA)
3097     {
3098         numeric_status = NUMERIC_LOW_DATA;
3099     }
3100     else if (mle_success == NUMERIC_FAIL)
3101     {
3102         numeric_status = NUMERIC_FAIL;
3103     }
3104     else
3105     {
3106         assert (mle_success == NUMERIC_OK);
3107         if (map_success == NUMERIC_FAIL)
3108         {
3109             numeric_status = NUMERIC_FAIL;
3110         }
3111         else if (map_success == NUMERIC_LOW_DATA)
3112         {
3113             numeric_status = NUMERIC_LOW_DATA;
3114         }
3115         // otherwise, we're cool.
3116     }
3117 
3118 
3119 
3120 	// All scaffolds that go in get abundances, but those that get "filtered"
3121 	// from the calculation get zeros.
3122 	//gammas = updated_gammas;
3123 	for (size_t i = 0; i < _abundances.size(); ++i)
3124 	{
3125 		_abundances[i]->gamma(updated_gammas[i]);
3126 		_abundances[i]->status(numeric_status);
3127 	}
3128 	_gamma_covariance = updated_gamma_cov;
3129     _count_covariance = updated_count_cov;
3130     _iterated_exp_count_covariance = updated_iterated_exp_count_cov;
3131     _fpkm_covariance = updated_fpkm_cov;
3132 
3133 	return (status() == NUMERIC_OK);
3134 }
3135 
calculate_assignment_probs(const Eigen::VectorXd & alignment_multiplicities,const Eigen::MatrixXd & transcript_cond_probs,const Eigen::VectorXd & proposed_gammas,Eigen::MatrixXd & assignment_probs)3136 void calculate_assignment_probs(const Eigen::VectorXd& alignment_multiplicities,
3137                                 const Eigen::MatrixXd& transcript_cond_probs,
3138                                 const Eigen::VectorXd& proposed_gammas,
3139                                 Eigen::MatrixXd& assignment_probs)
3140 {
3141 //    vector<double> u(nr_alignments.size());
3142 //    for (size_t i = 0; i < alignment_multiplicity.size(); ++i)
3143 //    {
3144 //        u[i] = nr_alignments[i].collapse_mass();
3145 //    }
3146 
3147     //ublas::vector<double> total_cond_prob = ublas::prod(proposed_gammas,transcript_cond_probs);
3148     Eigen::VectorXd total_cond_prob = proposed_gammas.transpose() * transcript_cond_probs ;
3149 
3150     // Compute the marginal conditional probability for each fragment against each isoform
3151     //ublas::matrix<double>  marg_cond_prob = ublas::zero_matrix<double>(transcript_cond_probs.size1(), transcript_cond_probs.size2());
3152     Eigen::MatrixXd marg_cond_prob(transcript_cond_probs.rows(), transcript_cond_probs.cols());
3153 
3154     for (size_t i = 0; i < alignment_multiplicities.size(); ++i)
3155     {
3156         marg_cond_prob.array().col(i) = proposed_gammas.array() * transcript_cond_probs.array().col(i);
3157 
3158         if (total_cond_prob(i) > 0)
3159         {
3160             marg_cond_prob.array().col(i) /= total_cond_prob(i);
3161             //column(marg_cond_prob,i) /= total_cond_prob(i);
3162         }
3163     }
3164 
3165     assignment_probs = marg_cond_prob;
3166 }
3167 
calculate_average_assignment_probs(const Eigen::VectorXd & alignment_multiplicities,const Eigen::MatrixXd & transcript_cond_probs,const Eigen::VectorXd & proposed_gammas,Eigen::MatrixXd & assignment_probs)3168 void calculate_average_assignment_probs(const Eigen::VectorXd& alignment_multiplicities,
3169                                         const Eigen::MatrixXd& transcript_cond_probs,
3170                                         const Eigen::VectorXd& proposed_gammas,
3171                                         Eigen::MatrixXd& assignment_probs)
3172 {
3173     //ublas::vector<double> total_cond_prob = ublas::prod(proposed_gammas,transcript_cond_probs);
3174     Eigen::VectorXd total_cond_prob = proposed_gammas.transpose() * transcript_cond_probs ;
3175 
3176     // Compute the marginal conditional probability for each fragment against each isoform
3177     //ublas::matrix<double>  marg_cond_prob = ublas::zero_matrix<double>(transcript_cond_probs.size1(), transcript_cond_probs.size2());
3178     Eigen::MatrixXd marg_cond_prob(transcript_cond_probs.rows(), transcript_cond_probs.cols());
3179 
3180     for (size_t i = 0; i < alignment_multiplicities.size(); ++i)
3181     {
3182         marg_cond_prob.array().col(i) = proposed_gammas.array() * transcript_cond_probs.array().col(i);
3183 
3184         if (total_cond_prob(i) > 0)
3185         {
3186             marg_cond_prob.array().col(i) /= total_cond_prob(i);
3187             //column(marg_cond_prob,i) /= total_cond_prob(i);
3188         }
3189     }
3190 
3191     assignment_probs = Eigen::MatrixXd::Zero(proposed_gammas.size(), proposed_gammas.size());
3192 
3193     vector<double> num_compatible(proposed_gammas.size(), 0);
3194 
3195     for (size_t i = 0; i < alignment_multiplicities.size(); ++i)
3196     {
3197         for (size_t j = 0; j < proposed_gammas.size(); ++j)
3198         {
3199             if (marg_cond_prob(j,i) > 0)
3200             {
3201                 assignment_probs.col(j) += marg_cond_prob.col(i) * alignment_multiplicities[i];
3202                 num_compatible[j] += alignment_multiplicities[i];
3203             }
3204         }
3205     }
3206 
3207 //    cerr << "marg cond prob:" << endl;
3208 //    cerr << marg_cond_prob << endl;
3209 
3210     for (size_t j = 0; j < proposed_gammas.size(); ++j)
3211     {
3212         if (num_compatible[j] > 0)
3213             assignment_probs.col(j) /= num_compatible[j];
3214     }
3215 
3216 //    cerr << "multiplicities:" << endl;
3217 //    cerr << alignment_multiplicities << endl;
3218 //    cerr << "avg matrix:" << endl;
3219 //    cerr << assignment_probs << endl;
3220     //assignment_probs = marg_cond_prob;
3221 }
3222 
3223 
3224 
calculate_iterated_exp_count_covariance(const vector<double> & gammas,const vector<MateHit> & nr_alignments,const vector<boost::shared_ptr<Abundance>> & transcripts,ublas::matrix<double> & count_covariance)3225 void calculate_iterated_exp_count_covariance(const vector<double>& gammas,
3226                                              const vector<MateHit>& nr_alignments,
3227                                              const vector<boost::shared_ptr<Abundance> >& transcripts,
3228                                              ublas::matrix<double>& count_covariance)
3229 {
3230     // Now calculate the _iterated_exp_count_covariance matrix via iterated expectation
3231     vector<vector<double> > cond_probs(transcripts.size(), vector<double>());
3232     for(size_t j = 0; j < transcripts.size(); ++j)
3233     {
3234         cond_probs[j]= *(transcripts[j]->cond_probs());
3235     }
3236 
3237     vector<double> u(nr_alignments.size());
3238     for (size_t i = 0; i < nr_alignments.size(); ++i)
3239     {
3240         u[i] = nr_alignments[i].collapse_mass();
3241     }
3242 
3243     count_covariance = ublas::zero_matrix<double>(transcripts.size(), transcripts.size());
3244 
3245     ublas::vector<double> total_cond_prob = ublas::zero_vector<double>(nr_alignments.size());
3246 
3247     for (size_t i = 0; i < nr_alignments.size(); ++i)
3248     {
3249         // the replicate gamma mles might not be available, if one of the
3250         // replicates returned an error, we'll consider all to be unreliable
3251         for (size_t j = 0; j < cond_probs.size(); ++j)
3252         {
3253             if (cond_probs[j][i] > 0)
3254             {
3255                 total_cond_prob(i) += gammas[j] * cond_probs[j][i];
3256                 assert (!isnan(total_cond_prob(i) && ! isinf(total_cond_prob(i))));
3257             }
3258         }
3259     }
3260 
3261     // Compute the marginal conditional probability for each fragment against each isoform
3262     ublas::matrix<double>  marg_cond_prob = ublas::zero_matrix<double>(transcripts.size(), nr_alignments.size());
3263 
3264     for (size_t i = 0; i < nr_alignments.size(); ++i)
3265     {
3266         // the replicate gamma mles might not be available, if one of the
3267         // replicates returned an error, we'll consider all to be unreliable
3268         for (size_t j = 0; j < cond_probs.size(); ++j)
3269         {
3270             if (total_cond_prob(i))
3271             {
3272                 if (cond_probs[j][i] > 0)
3273                 {
3274                     marg_cond_prob(j,i) = (gammas[j] * cond_probs[j][i])/total_cond_prob(i);
3275                 }
3276             }
3277         }
3278     }
3279 
3280     double total_var = 0.0;
3281 
3282     ublas::vector<double> expected_counts = ublas::zero_vector<double>(cond_probs.size());
3283 
3284     //iterate over fragments
3285     for (size_t i = 0; i < marg_cond_prob.size2(); ++i)
3286     {
3287 
3288         // iterate over transcripts
3289         for (size_t j = 0; j < marg_cond_prob.size1(); ++j)
3290         {
3291             double c_j_i = marg_cond_prob(j,i);
3292 
3293             expected_counts(j) += u[i] * marg_cond_prob(j,i);
3294 
3295             //if (c_j_i == 0 || c_j_i == 1.0)
3296             //    continue;
3297 
3298             for (size_t k = 0; k < marg_cond_prob.size1(); ++k)
3299             {
3300                 double c_k_i = marg_cond_prob(k,i);
3301                 //if (c_k_i == 0 || c_k_i == 1.0)
3302                 //    continue;
3303 
3304                 if (j == k)
3305                 {
3306                     if (c_k_i != 0 && c_k_i != 1.0)
3307                     {
3308                         double var = u[i] * c_k_i * (1.0 - c_k_i);
3309                         count_covariance(k,k) += var;
3310                         assert (var >= 0);
3311                         assert (!isnan(var) && !isinf(var));
3312                         total_var += var;
3313                     }
3314                 }
3315                 else
3316                 {
3317                     if (c_k_i != 0 && c_k_i != 1.0 &&
3318                         c_j_i != 0 && c_j_i != 1.0)
3319                     {
3320                         double covar = -u[i] * c_k_i * c_j_i;
3321                         assert (covar <= 0);
3322                         assert (!isnan(covar) && !isinf(covar));
3323                         count_covariance(k,j) += covar;
3324                     }
3325                 }
3326             }
3327         }
3328 
3329     }
3330     //cerr << "expected counts" << endl;
3331     //cerr << expected_counts << endl;
3332 
3333 //    // take care of little rounding errors
3334 //    for (size_t i = 0; i < _iterated_exp_count_covariance.size1(); ++i)
3335 //    {
3336 //        for (size_t j = 0; j < _iterated_exp_count_covariance.size2(); ++j)
3337 //        {
3338 //            if (i == j)
3339 //            {
3340 //                double c = _iterated_exp_count_covariance(i,j);
3341 //                if (c < 0)
3342 //                    _iterated_exp_count_covariance(i,j) = 0;
3343 //                //assert(c >= 0);
3344 //            }
3345 //            else
3346 //            {
3347 //                double c = _iterated_exp_count_covariance(i,j);
3348 //                if (c > 0)
3349 //                    _iterated_exp_count_covariance(i,j) = 0;
3350 //                //assert(c <= 0);
3351 //            }
3352 //        }
3353 //        _abundances[i]->num_fragment_uncertainty_var(_iterated_exp_count_covariance(i,i));
3354 //    }
3355 }
3356 
calculate_kappas()3357 void AbundanceGroup::calculate_kappas()
3358 {
3359     size_t num_members = _abundances.size();
3360     _kappa_covariance = ublas::matrix<double>(num_members,
3361 											  num_members);
3362 	//cerr << gamma_cov <<endl;
3363 
3364 	assert (_gamma_covariance.size1() == num_members);
3365 	assert (_gamma_covariance.size2() == num_members);
3366 
3367 	//tss_group.sub_quants = vector<QuantGroup>(isos_in_tss);
3368 
3369 	double S_FPKM = 0.0;
3370 	BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
3371 	{
3372 		if (pA->effective_length() > 0)
3373 		{
3374 			S_FPKM += pA->FPKM();
3375 		}
3376 	}
3377 
3378     //fprintf (stderr, "*********\n");
3379 	BOOST_FOREACH (boost::shared_ptr<Abundance> pA, _abundances)
3380 	{
3381 		if (S_FPKM > 0)
3382 		{
3383 			pA->kappa(pA->FPKM() / S_FPKM);
3384         }
3385 		else
3386 		{
3387 			pA->kappa(0);
3388 		}
3389 	}
3390 
3391 	for (size_t k = 0; k < num_members; ++k)
3392 	{
3393 		for (size_t m = 0; m < num_members; ++m)
3394 		{
3395             double L = _abundances[k]->effective_length() *
3396             _abundances[m]->effective_length();
3397             if (L == 0.0)
3398             {
3399                 _kappa_covariance(k,m) = 0.0;
3400             }
3401             else if (m == k)
3402             {
3403                 // Use the modeled count variance here instead
3404                 double kappa_var;
3405                 if (S_FPKM)
3406                 {
3407                     kappa_var = _fpkm_covariance(k,k) / (S_FPKM * S_FPKM);
3408                 }
3409                 else
3410                 {
3411                     kappa_var = 0.0;
3412                 }
3413 
3414                 if (isnan(kappa_var) || isinf(kappa_var)) // to protect against underflow
3415                     kappa_var = 0;
3416                 _kappa_covariance(k,m) = kappa_var;
3417             }
3418             else
3419             {
3420                 double kappa_covar;
3421                 if (S_FPKM)
3422                 {
3423                     kappa_covar = _fpkm_covariance(k,m) / (S_FPKM * S_FPKM);
3424                 }
3425                 else
3426                 {
3427                     kappa_covar = 0.0;
3428                 }
3429                 _kappa_covariance(k,m) = kappa_covar;
3430             }
3431 		}
3432 	}
3433 
3434 
3435 //	size_t num_members = _abundances.size();
3436 //	_kappa_covariance = ublas::zero_matrix<double>(num_members,
3437 //											  num_members);
3438 //    if (FPKM() == 0)
3439 //    {
3440 //        for (size_t k = 0; k < num_members; ++k)
3441 //        {
3442 //            _abundances[k]->kappa(0);
3443 //        }
3444 //        return;
3445 //    }
3446 //
3447 
3448     // FIXME:
3449     size_t num_count_draws = _assigned_count_samples.size();
3450     vector<Eigen::VectorXd > relative_abundances (num_count_draws, Eigen::VectorXd::Zero(num_members));
3451 
3452     // We'll use the effective lengths to transform counts into relative abundances,
3453     // and then use that to calculate the kappa variances and covariances.
3454     Eigen::VectorXd effective_length_recip = Eigen::VectorXd::Zero(_abundances.size());
3455     for (size_t i = 0; i < _abundances.size(); ++i)
3456     {
3457         if (_abundances[i]->effective_length() > 0)
3458             effective_length_recip(i) = 1.0 / _abundances[i]->effective_length();
3459     }
3460 
3461     for (size_t i = 0; i < num_count_draws; ++i)
3462     {
3463 
3464         Eigen::VectorXd relative_abundance = effective_length_recip.array() * _assigned_count_samples[i].array();
3465         double total = relative_abundance.sum();
3466         if (total > 0)
3467             relative_abundance /= total;
3468         //cerr << relative_abundance.transpose() << endl;
3469         relative_abundances[i] = relative_abundance;
3470     }
3471 }
3472 
get_alignments_from_scaffolds(const vector<boost::shared_ptr<Abundance>> & abundances,vector<MateHit> & alignments)3473 void get_alignments_from_scaffolds(const vector<boost::shared_ptr<Abundance> >& abundances,
3474 								   vector<MateHit>& alignments)
3475 {
3476 	set<const MateHit*> hits_in_gene_set;
3477 
3478 	BOOST_FOREACH(boost::shared_ptr<Abundance> pA, abundances)
3479 	{
3480 		boost::shared_ptr<Scaffold> pS = pA->transfrag();
3481 		assert (pS);
3482 		hits_in_gene_set.insert(pS->mate_hits().begin(),
3483 								pS->mate_hits().end());
3484 	}
3485 
3486 	for(set<const MateHit*>::iterator itr = hits_in_gene_set.begin();
3487 		itr != hits_in_gene_set.end();
3488 		++itr)
3489 	{
3490 		alignments.push_back(**itr);
3491 	}
3492 
3493 	sort(alignments.begin(), alignments.end(), mate_hit_lt);
3494 }
3495 
3496 //void round(Eigen::VectorXd&  p) {
3497 //
3498 //	double KILLP = 0; // kill all probabilities below this
3499 //
3500 //	for (size_t i = 0; i < p.size(); ++i) {
3501 //		if (p(i) < KILLP)
3502 //			p(i) = 0;
3503 //	}
3504 //}
3505 
Estep(int N,int M,const Eigen::VectorXd & p,Eigen::MatrixXd & U,const Eigen::MatrixXd & cond_probs,const Eigen::VectorXd & alignment_multiplicities)3506 void Estep (int N,
3507 			int M,
3508 			const Eigen::VectorXd& p,
3509 			Eigen::MatrixXd& U,
3510 			const Eigen::MatrixXd& cond_probs,
3511 			const Eigen::VectorXd& alignment_multiplicities) {
3512 	// given p, fills U with expected frequencies
3513 	int i,j;
3514 
3515 //    Eigen::VectorXd frag_prob_sums = Eigen::VectorXd::Zero(M);
3516 //
3517 //    for (j = 0; j < N; ++j)
3518 //    {
3519 //        for (i = 0; i < M; ++i)
3520 //        {
3521 //            frag_prob_sums(i) += cond_probs(j,i) * p(j);
3522 //        }
3523 //    }
3524 //
3525 //    for (i = 0; i < M; ++i)
3526 //    {
3527 //        frag_prob_sums(i) = frag_prob_sums(i) ? (1.0 / frag_prob_sums(i)) : 0.0;
3528 //    }
3529 //
3530 //    for (j = 0; j < N; ++j)
3531 //    {
3532 //        for (i = 0; i < M; ++i)
3533 //        {
3534 //            double ProbY = frag_prob_sums(i);
3535 //            double exp_i_j = alignment_multiplicities(i) * cond_probs(j,i) * p(j) * ProbY;
3536 //            U(j,i) = exp_i_j;
3537 //        }
3538 //    }
3539 
3540     Eigen::VectorXd frag_prob_sums;// = Eigen::VectorXd::Zero(M);
3541 
3542 //    for (j = 0; j < N; ++j)
3543 //    {
3544 //        for (i = 0; i < M; ++i)
3545 //        {
3546 //            frag_prob_sums(i) += cond_probs(j,i) * p(j);
3547 //            assert (!isnan(cond_probs(j,i)) && !isinf(cond_probs(j,i)));
3548 //        }
3549 //    }
3550 
3551     frag_prob_sums = p.transpose() * cond_probs;
3552 
3553     for (i = 0; i < M; ++i)
3554     {
3555         assert (!isnan(frag_prob_sums(i)) && !isinf(frag_prob_sums(i)));
3556         //double x = frag_prob_sums(i);
3557         frag_prob_sums(i) = frag_prob_sums(i) ? (1.0 / frag_prob_sums(i)) : 0.0;
3558         if (isnan(frag_prob_sums(i)) || isinf(frag_prob_sums(i)))
3559         {
3560             frag_prob_sums(i) = 0; // protect against overflow/underflow
3561         }
3562     }
3563 
3564     Eigen::VectorXd x = frag_prob_sums.array() * alignment_multiplicities.array();
3565 //    Eigen::MatrixXd y = p.transpose() * cond_probs;
3566 //    Eigen::MatrixXd UU = y.array() * x.array();
3567 
3568 //    cerr << UU <<endl;
3569 //    for (j = 0; j < N; ++j)
3570 //    {
3571 //        for (i = 0; i < M; ++i)
3572 //        {
3573 //            //double ProbY = frag_prob_sums(i);
3574 //            double exp_i_j = cond_probs(j,i) * p(j) * x(i);
3575 //            U(j,i) = exp_i_j;
3576 //        }
3577 //    }
3578 
3579     U = Eigen::MatrixXd(N,M);
3580 
3581     for (i = 0; i < M; ++i)
3582     {
3583         //double ProbY = frag_prob_sums(i);
3584         //double exp_i_j = cond_probs(j,i) * p(j) * x(i);
3585         //U.r = exp_i_j;
3586         U.col(i) = cond_probs.col(i).array() * p.array();
3587     }
3588     for (j = 0; j < N; ++j)
3589     {
3590         U.array().row(j) *= x.transpose().array();
3591     }
3592 
3593     //cerr << UU << endl;
3594     //cerr << "==========" << endl;
3595     //cerr << U << endl;
3596     //cerr << "**********" << endl;
3597     //Eigen::ArrayXXd a_cond_probs = cond_probs.array();
3598     //Eigen::ArrayXXd a_p = p.array();
3599     //Eigen::ArrayXXd a_U = a_p * a_cond_probs.colwise();
3600     //U = (.colwise() * p.transpose().array()).matrix();
3601     //U = cond_probs.colwise() * p.array();
3602     //U = alignment_multiplicities.array() * p.array() * cond_probs.array() * frag_prob_sums.array();
3603     //U = (frag_prob_sums.array() * alignment_multiplicities.array()).rowwise() * p.array().colwise() * cond_probs.array().colwise();
3604 }
3605 
3606 
Mstep(int N,int M,Eigen::VectorXd & p,const Eigen::MatrixXd & U)3607 void Mstep (int N,
3608             int M,
3609             Eigen::VectorXd& p,
3610             const Eigen::MatrixXd& U)
3611 {
3612 	Eigen::VectorXd v;// = Eigen::VectorXd::Zero(N);
3613 
3614 	double m = 0;
3615 
3616 	v = U.rowwise().sum();
3617     m = v.colwise().sum()(0);
3618 
3619 	if (m)
3620 	{
3621 		p = v / m;
3622 	}
3623 	else
3624 	{
3625         p = Eigen::VectorXd::Zero(N);
3626 	}
3627 }
3628 
3629 
logLike(int N,int M,Eigen::VectorXd & p,const Eigen::MatrixXd & cond_prob,const Eigen::VectorXd & alignment_multiplicities,const vector<double> & log_conv_factors)3630 double logLike (int N,
3631 				int M,
3632 				Eigen::VectorXd& p,
3633 				const Eigen::MatrixXd& cond_prob,
3634 				const Eigen::VectorXd& alignment_multiplicities,
3635                 const vector<double>& log_conv_factors) {
3636 	//int i,j;
3637 
3638 	double ell = accumulate(log_conv_factors.begin(), log_conv_factors.end(), 0.0);
3639 	//double Prob_Y;
3640 //	for (int i= 0; i < M; i++) {
3641 //		Prob_Y = 0;
3642 //		for (int j= 0; j < N; j++) {
3643 //			Prob_Y += cond_prob(j,i) * p(j);
3644 //		}
3645 //		if (Prob_Y > 0) {
3646 //			ell += (alignment_multiplicities(i) * log(Prob_Y));
3647 //		}
3648 //	}
3649     Eigen::VectorXd Prob_Ys = p.transpose() * cond_prob;
3650     Eigen::VectorXd log_Prob_Ys = Prob_Ys.array().log();
3651     //log_Prob_Ys *= alignment_multiplicities;
3652     for (int i = 0; i < M; i++) {
3653         if (Prob_Ys(i) > 0)
3654         {
3655             ell += alignment_multiplicities(i) * log_Prob_Ys(i);
3656         }
3657     }
3658 	return ell;
3659 }
3660 
EM(int N,int M,Eigen::VectorXd & newP,const Eigen::MatrixXd & cond_prob,const Eigen::VectorXd & alignment_multiplicities,vector<double> const & log_conv_factors,bool & converged)3661 double EM(int N, int M,
3662           Eigen::VectorXd&  newP,
3663           const Eigen::MatrixXd& cond_prob,
3664 		  const Eigen::VectorXd& alignment_multiplicities,
3665           vector<double> const & log_conv_factors,
3666           bool& converged)
3667 {
3668     converged = true;
3669 	//double sum = 0;
3670 	double newEll = 0;
3671     Eigen::VectorXd p = Eigen::VectorXd::Zero(N);
3672     Eigen::MatrixXd  U = Eigen::MatrixXd::Zero(N,M);
3673 	double ell = 0;
3674 	int iter = 0;
3675 	int j;
3676 
3677     if (N == 0 || M == 0)
3678         return NUMERIC_OK;
3679 
3680     for (j = 0; j < N; ++j) {
3681         //p[j] = drand48();
3682         //sum += p[j];
3683         p(j) = 1.0/(double)N;
3684     }
3685 
3686 
3687 
3688     static const double ACCURACY = mle_accuracy; // convergence for EM
3689 
3690 	while (((iter <= 2) || (abs(ell - newEll) > ACCURACY)) && (iter < max_mle_iterations)) {
3691 		if (iter > 0) {
3692 			//round(newP);
3693 			p = newP;
3694 			ell = newEll;
3695 		}
3696 
3697 		Estep(N, M, p, U, cond_prob, alignment_multiplicities); //  fills U
3698 		Mstep(N, M, newP,U); // fills p
3699 
3700 		newEll = logLike(N, M, newP, cond_prob,alignment_multiplicities, log_conv_factors);
3701 
3702 		//fprintf(stderr, "%d\t%lf\n", iter, newEll);
3703 
3704 		//printf("%.3f %.3f %.3f ", newP[0], newP[1], newP[2]);
3705 		//printf("%.3f %.3f %.3f ", newP[3], newP[4], newP[5]);
3706 		//printf("%.3f %.3f %.3f\n", newP[6], newP[7], newP[8]);
3707 		iter++;
3708 	}
3709 	if (iter >= max_mle_iterations)
3710     {
3711 		verbose_msg("Warning: ITERMAX reached in abundance estimation, estimation hasn't fully converged\n");
3712         converged = false;
3713     }
3714     verbose_msg("Convergence reached in %d iterations \n", iter);
3715 	return newEll;
3716 }
3717 
compute_fisher(const vector<boost::shared_ptr<Abundance>> & transcripts,const ublas::vector<double> & abundances,const vector<MateHit> & alignments,const vector<double> & u,boost::numeric::ublas::matrix<double> & fisher)3718 void compute_fisher(const vector<boost::shared_ptr<Abundance> >& transcripts,
3719 					const ublas::vector<double>& abundances,
3720 					const vector<MateHit>& alignments,
3721 					const vector<double>& u,
3722 					boost::numeric::ublas::matrix<double>& fisher)
3723 {
3724 	int M = alignments.size();
3725 	int N = transcripts.size();
3726 
3727 	vector<long double> denoms(M, 0.0);
3728 	vector<vector<long double> > P(M,vector<long double>(N,0));
3729 
3730 	for (int j = 0; j < N; ++j)
3731 	{
3732 		const vector<double>& cond_probs_j = *(transcripts[j]->cond_probs());
3733 		for (int x = 0; x < M; ++x)
3734 		{
3735 			if (cond_probs_j[x]==0)
3736 				continue;
3737 			long double alpha = 0.0;
3738 			alpha = cond_probs_j[x];
3739 			alpha *= abundances(j);
3740 			denoms[x] += alpha;
3741 		}
3742 	}
3743 
3744 	for (int x = 0; x < M; ++x)
3745 		denoms[x] *= denoms[x];
3746 
3747 
3748 	for (int j = 0; j < N; ++j)
3749 	{
3750 		const vector<double>& cond_probs_j = *(transcripts[j]->cond_probs());
3751 		for (int k = 0; k < N; ++k)
3752 		{
3753 
3754 			const vector<double>& cond_probs_k = *(transcripts[k]->cond_probs());
3755 
3756 			for (int x = 0; x < M; ++x)
3757 			{
3758 				if (cond_probs_j[x]==0 && cond_probs_k[x]==0)
3759 					continue;
3760 
3761 				assert(denoms[x] != 0.0);
3762 
3763 				double fisher_x_j_k = cond_probs_j[x] * cond_probs_k[x] / denoms[x];
3764 
3765 				fisher(j,k) += u[x] * fisher_x_j_k;
3766 			}
3767 		}
3768 	}
3769 }
3770 
compute_posterior_expectation(const vector<ublas::vector<double>> & weighted_samples,const vector<pair<size_t,double>> & sample_weights,ublas::vector<double> & expectation,long double & log_total_weight)3771 AbundanceStatus compute_posterior_expectation(const vector<ublas::vector<double> >& weighted_samples,
3772                                               const vector<pair<size_t, double> >& sample_weights,
3773                                               ublas::vector<double>& expectation,
3774                                               long double& log_total_weight)
3775 {
3776     ublas::vector<double> log_expectation(expectation.size());
3777     log_total_weight = 0.0;
3778 
3779     // compute the weighted sum of the samples from the proposal distribution,
3780     // but keep the result in log space to avoid underflow.
3781 	for (size_t i = 0; i < weighted_samples.size(); ++i)
3782 	{
3783 		const ublas::vector<double>& scaled_sample = weighted_samples[i];
3784 		double log_weight = sample_weights[i].second;
3785 		if (log_total_weight == 0.0)
3786 		{
3787 			log_expectation = weighted_samples[i];
3788 			log_total_weight = log_weight;
3789 		}
3790 		else
3791 		{
3792 			for (size_t e = 0; e < log_expectation.size(); ++e)
3793 			{
3794 				log_expectation(e) = log_space_add<long double>(log_expectation[e], scaled_sample[e]);
3795 			}
3796 			log_total_weight = log_space_add<long double>(log_total_weight, log_weight);
3797 		}
3798 	}
3799 
3800     if (log_total_weight == 0 || sample_weights.size() < 100)
3801 	{
3802 		verbose_msg("Warning: restimation failed, importance samples have zero weight.\n\tResorting to MLE and observed Fisher\n");
3803 		return NUMERIC_FAIL;
3804 	}
3805 
3806     // compute the weighted mean, and transform back out of log space
3807     for (size_t e = 0; e < expectation.size(); ++e)
3808 	{
3809 		expectation(e) = (long double)log_expectation(e) - log_total_weight;
3810 		expectation(e) = exp(expectation(e));
3811 	}
3812 
3813 	for (size_t e = 0; e < expectation.size(); ++e)
3814 	{
3815 		if (isinf(expectation(e)) || isnan(expectation(e)))
3816 		{
3817 			verbose_msg("Warning: isoform abundance is NaN, restimation failed.\n\tResorting to MLE and observed Fisher.");
3818 			return NUMERIC_FAIL;
3819 		}
3820 	}
3821 
3822 	for (size_t j = 0; j < expectation.size(); ++j)
3823 	{
3824 		if (expectation(j) < 0)
3825 			expectation(j) = 0;
3826 	}
3827 
3828 	long double m = sum(expectation);
3829 
3830 	if (m == 0 || isinf(m) || isnan(m))
3831 	{
3832 		verbose_msg("Warning: restimation failed, could not renormalize MAP estimate\n\tResorting to MLE and observed Fisher.");
3833 		return NUMERIC_FAIL;
3834 	}
3835 
3836 	for (size_t j = 0; j < expectation.size(); ++j) {
3837 		expectation(j) = expectation(j) / m;
3838 	}
3839     return NUMERIC_OK;
3840 }
3841 
calculate_per_replicate_abundances(vector<boost::shared_ptr<Abundance>> & transcripts,const map<boost::shared_ptr<ReadGroupProperties const>,vector<MateHit>> & alignments_per_read_group,std::map<boost::shared_ptr<ReadGroupProperties const>,boost::shared_ptr<AbundanceGroup>> & ab_group_per_replicate,bool perform_collapse)3842 AbundanceStatus AbundanceGroup::calculate_per_replicate_abundances(vector<boost::shared_ptr<Abundance> >& transcripts,
3843                                                                    const map<boost::shared_ptr<ReadGroupProperties const >, vector<MateHit> >& alignments_per_read_group,
3844                                                                    std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<AbundanceGroup> >& ab_group_per_replicate,
3845                                                                    bool perform_collapse)
3846 {
3847     for(std::set<boost::shared_ptr<ReadGroupProperties const > >::iterator itr = _read_group_props.begin();
3848         itr != _read_group_props.end();
3849         ++itr)
3850     {
3851         vector<boost::shared_ptr<Abundance> > new_transcripts;
3852         BOOST_FOREACH(boost::shared_ptr<Abundance> ab, transcripts)
3853         {
3854             boost::shared_ptr<TranscriptAbundance> d = boost::static_pointer_cast<TranscriptAbundance>(ab);
3855             //new_transcripts.push_back(boost::shared_ptr<Abundance>(new TranscriptAbundance(*boost::static_pointer_cast<TranscriptAbundance>(ab))));
3856             TranscriptAbundance* pT = new TranscriptAbundance;
3857             pT->transfrag(d->transfrag());
3858             boost::shared_ptr<Abundance> ab_new(pT);
3859             ab_new->description(ab_new->description());
3860             ab_new->locus_tag("");
3861             new_transcripts.push_back(ab_new);
3862         }
3863         boost::shared_ptr<AbundanceGroup> ab_group(new AbundanceGroup(new_transcripts));
3864         std::set<boost::shared_ptr<ReadGroupProperties const > > rg_props;
3865         rg_props.insert(*itr);
3866         ab_group->init_rg_props(rg_props);
3867 
3868         map<boost::shared_ptr<ReadGroupProperties const >, vector<MateHit> >::const_iterator al_itr =
3869             alignments_per_read_group.find(*itr);
3870 
3871         assert(al_itr != alignments_per_read_group.end());
3872         const vector<MateHit>& rep_hits = alignments_per_read_group.find(*itr)->second;
3873 
3874         vector<MateHit> nr_alignments;
3875         vector<MateHit> non_equiv_alignments;
3876         vector<double> log_conv_factors;
3877         vector<boost::shared_ptr<Abundance> > mapped_transcripts;
3878 
3879         if (perform_collapse)
3880         {
3881             collapse_hits(rep_hits, nr_alignments);
3882             collapse_equivalent_hits_helper(nr_alignments, transcripts, non_equiv_alignments, log_conv_factors);
3883             assert (non_equiv_alignments.size() == log_conv_factors.size());
3884             log_conv_factors = vector<double>(nr_alignments.size(), 0);
3885             nr_alignments.clear();
3886         }
3887         else
3888         {
3889             nr_alignments = rep_hits;
3890             non_equiv_alignments = nr_alignments;
3891             log_conv_factors = vector<double>(nr_alignments.size(), 0);
3892         }
3893 
3894         ab_group->calculate_abundance_for_replicate(non_equiv_alignments, false);
3895 
3896         //fprintf (stderr, "FPKM = %lg\n", ab_group->FPKM());
3897         ab_group_per_replicate[*itr] = ab_group;
3898     }
3899 
3900     return NUMERIC_OK;
3901 }
3902 
calculate_inverse_fisher(const vector<boost::shared_ptr<Abundance>> & transcripts,const vector<MateHit> & alignments,const ublas::vector<double> & gamma_mean,ublas::matrix<double> & inverse_fisher)3903 AbundanceStatus calculate_inverse_fisher(const vector<boost::shared_ptr<Abundance> >& transcripts,
3904                                          const vector<MateHit>& alignments,
3905                                          const ublas::vector<double>& gamma_mean,
3906                                          ublas::matrix<double>& inverse_fisher)
3907 {
3908 //    size_t N = gamma_covariance.size1();
3909 
3910 //	gamma_map_covariance = ublas::zero_matrix<double>(N);
3911 
3912 	typedef ublas::matrix<double> matrix_type;
3913 	matrix_type fisher = ublas::zero_matrix<double>(gamma_mean.size(),gamma_mean.size());
3914 
3915 	vector<double> u(alignments.size());
3916 	for (size_t i = 0; i < alignments.size(); ++i)
3917 	{
3918 		u[i] = alignments[i].collapse_mass();
3919 	}
3920 
3921 	compute_fisher(transcripts,
3922 				   gamma_mean,
3923 				   alignments,
3924 				   u,
3925 				   fisher);
3926 
3927 	ublas::matrix<double> epsilon = ublas::zero_matrix<double>(gamma_mean.size(),gamma_mean.size());
3928 	for (size_t i = 0; i < gamma_mean.size(); ++i)
3929 	{
3930 		epsilon(i,i) = 1e-6;
3931 	}
3932 
3933 	fisher += epsilon; // modify matrix to avoid problems during inverse
3934 
3935 	ublas::matrix<double> fisher_chol = fisher;
3936 
3937 	double ch = cholesky_factorize(fisher_chol);
3938 	if (ch != 0.0)
3939 	{
3940 		verbose_msg("Warning: Fisher matrix is not positive definite (bad element: %lg)\n", ch);
3941 		return NUMERIC_FAIL;
3942 	}
3943 
3944 	inverse_fisher = ublas::zero_matrix<double>(gamma_mean.size(),gamma_mean.size());
3945 	bool invertible = chol_invert_matrix(fisher_chol, inverse_fisher);
3946 
3947     ublas::matrix<double> test_fisher = inverse_fisher;
3948 	ch = cholesky_factorize(test_fisher);
3949 	if (ch != 0.0 || !invertible)
3950 	{
3951 		verbose_msg("Warning: Fisher matrix is not inverible\n", ch);
3952 		return NUMERIC_FAIL;
3953 	}
3954 
3955     return NUMERIC_OK;
3956 }
3957 
revise_map_mean_and_cov_estimate(double log_total_weight,const ublas::vector<double> & expectation,const vector<pair<size_t,double>> & sample_weights,const vector<ublas::vector<double>> & weighted_samples,ublas::vector<double> & gamma_map_estimate,ublas::matrix<double> & gamma_map_covariance)3958 AbundanceStatus revise_map_mean_and_cov_estimate(double log_total_weight,
3959                                                  const ublas::vector<double>& expectation,
3960                                                  const vector<pair<size_t, double> >& sample_weights,
3961                                                  const vector<ublas::vector<double> >& weighted_samples,
3962                                                  ublas::vector<double>& gamma_map_estimate,
3963                                                  ublas::matrix<double>& gamma_map_covariance)
3964 {
3965     int N = expectation.size();
3966 
3967 	// revise gamma by setting it to the posterior expectation computed via the
3968 	// importance sampling
3969 	gamma_map_estimate = expectation;
3970 
3971 	// calculate the sample - mean vectors, store them in log space
3972 	vector<ublas::vector<double> > sample_expectation_diffs;
3973 
3974     ublas::vector<double> check_expectation = ublas::zero_vector<double>(expectation.size());
3975 
3976 	for (size_t j = 0; j < weighted_samples.size(); ++j)
3977 	{
3978 		ublas::vector<double> sample = weighted_samples[j];
3979 		double log_sample_weight = sample_weights[j].second;
3980 
3981 		for (size_t e = 0; e < expectation.size(); ++e)
3982 		{
3983 			// sample is already log transformed after it was weighted, so we
3984 			// need to divide by the sample weight to recover the original sample
3985 			// value, then undo the log transform, then subtract the mean from it
3986 			sample(e) = (exp(((long double)sample(e) - log_sample_weight)) - expectation(e));
3987 			//sample(e) *= exp((log_sample_weight - log_total_weight));
3988 		}
3989 		//cerr << sample << endl;
3990 		sample_expectation_diffs.push_back(sample);
3991 	}
3992 
3993     // We want to revise the covariance matrix from the samples, since we'll
3994     // need it later for the CIs.
3995     ublas::matrix<double> revised_cov = ublas::zero_matrix<double>(N,N);
3996 
3997     // accumulate the contributions from the other samples (doing one cell of
3998 	// covariance matrix per outer (i x j) loop iteration.
3999 
4000     for (size_t j = 0; j < sample_expectation_diffs.size(); ++j)
4001     {
4002         double log_sample_weight = sample_weights[j].second;
4003         double w = exp((log_sample_weight - log_total_weight));
4004         ublas::vector<double> sample = weighted_samples[j];
4005 
4006         for (size_t e = 0; e < expectation.size(); ++e)
4007 		{
4008 			// sample is already log transformed after it was weighted, so we
4009 			// need to divide by the sample weight to recover the original sample
4010 			// value, then undo the log transform, then subtract the mean from it
4011 			sample(e) = exp(sample(e) - log_sample_weight);
4012 			//sample(e) *= exp((log_sample_weight - log_total_weight));
4013 		}
4014 
4015         revised_cov += w * (outer_prod(sample,sample));
4016 	}
4017 
4018     revised_cov -= outer_prod(expectation,expectation);
4019 
4020 	//cerr << "Revised COV" << endl;
4021 	//cerr << revised_cov << endl;
4022 	gamma_map_covariance = revised_cov;
4023 
4024     //cerr << "Revised MAP estimate: " << expectation << endl;
4025     //cerr << "Revised Covariance matrix:" << endl;
4026     //cerr << gamma_map_covariance << endl;
4027     //cerr << "*************" << endl;
4028 
4029     return NUMERIC_OK;
4030 }
4031 
calc_is_scale_factor(const ublas::matrix<double> & covariance_chol,double & is_scale_factor)4032 AbundanceStatus calc_is_scale_factor(const ublas::matrix<double>& covariance_chol,
4033                                      double& is_scale_factor)
4034 {
4035     double det = determinant(covariance_chol);
4036     is_scale_factor = pow(2.0*boost::math::constants::pi<double>(), covariance_chol.size1()/2.0);
4037 	double s = sqrt(det);
4038 	is_scale_factor *= s;
4039 
4040 	//assert (det);
4041 	if (s == 0.0)
4042 	{
4043 		verbose_msg("Error: sqrt(det(cov)) == 0, %lf after rounding. \n", det);
4044 		//cerr << covariance << endl;
4045 		return NUMERIC_FAIL;
4046 	}
4047 	assert (s);
4048 	assert (is_scale_factor);
4049     return NUMERIC_OK;
4050 }
4051 
4052 template<class M, class PM>
is_identifiable(M & m,PM & pm)4053 bool is_identifiable(M &m, PM &pm)
4054 {
4055     using namespace ublas;
4056     typedef M matrix_type;
4057     typedef typename M::size_type size_type;
4058     typedef typename M::value_type value_type;
4059 
4060     int singular = 0;
4061     size_type size1 = m.size1 ();
4062     size_type size2 = m.size2 ();
4063     size_type size = (std::min) (size1, size2);
4064     for (size_type i = 0; i < size; ++ i) {
4065         matrix_column<M> mci (column (m, i));
4066         matrix_row<M> mri (row (m, i));
4067         size_type i_norm_inf = i + index_norm_inf (project (mci, boost::numeric::ublas::range (i, size1)));
4068         if (m (i_norm_inf, i) != value_type/*zero*/()) {
4069             if (i_norm_inf != i) {
4070                 pm (i) = i_norm_inf;
4071                 row (m, i_norm_inf).swap (mri);
4072             } else {
4073                 //BOOST_UBLAS_CHECK (pm (i) == i_norm_inf, external_logic ());
4074             }
4075             project (mci, boost::numeric::ublas::range (i + 1, size1)) *= value_type (1) / m (i, i);
4076         } else if (singular == 0) {
4077             singular = i + 1;
4078         }
4079         project (m, boost::numeric::ublas::range (i + 1, size1), boost::numeric::ublas::range (i + 1, size2)).minus_assign (outer_prod (project (mci, boost::numeric::ublas::range (i + 1, size1)),
4080                                                                               project (mri, boost::numeric::ublas::range (i + 1, size2))));
4081     }
4082     return singular == 0;
4083 }
4084 
gamma_mle(const vector<boost::shared_ptr<Abundance>> & transcripts,const vector<MateHit> & nr_alignments,const vector<double> & log_conv_factors,vector<double> & gammas,bool check_identifiability)4085 AbundanceStatus gamma_mle(const vector<boost::shared_ptr<Abundance> >& transcripts,
4086                           const vector<MateHit>& nr_alignments,
4087                           const vector<double>& log_conv_factors,
4088                           vector<double>& gammas,
4089                           bool check_identifiability)
4090 {
4091 	//gammas.clear();
4092     gammas = vector<double>(transcripts.size(), 0);
4093 	if (transcripts.empty())
4094 		return NUMERIC_OK;
4095 
4096 
4097 	if (transcripts.size() == 1)
4098 	{
4099 		gammas = vector<double>(1,1.0);
4100 		return NUMERIC_OK;
4101 	}
4102 
4103 	size_t M = nr_alignments.size();
4104 	size_t N = transcripts.size();
4105 
4106     bool converged = true;
4107     bool identifiable = true;
4108 
4109 	if (M > 0)
4110 	{
4111 
4112 		//vector<vector<double> > saliencies (M,vector<double>(N,0));
4113 
4114 
4115 		//compute_saliencies(cond_probs, saliencies, saliency_weight);
4116 
4117         Eigen::VectorXd prob = Eigen::VectorXd::Zero(N);
4118         Eigen::VectorXd _phint = Eigen::VectorXd::Zero(N);
4119 
4120 		double logL;
4121 
4122         Eigen::MatrixXd cond_probs(N, M);
4123 
4124         for (size_t j = 0; j < cond_probs.rows(); ++j)
4125         {
4126             for (size_t i = 0; i < cond_probs.cols(); ++i)
4127             {
4128                 cond_probs(j,i) = (*(transcripts[j]->cond_probs()))[i];
4129             }
4130         }
4131 
4132         if (check_identifiability)
4133         {
4134             ublas::matrix<double> compat = ublas::zero_matrix<double>(M,N);
4135 
4136             for (size_t j = 0; j < N; ++j)
4137             {
4138                 for (size_t i = 0; i < M; ++i)
4139                 {
4140                     if (cond_probs(j,i))
4141                     {
4142                         //compat(i,j) = cond_probs[j][i];
4143                         compat(i,j) = 1;
4144                     }
4145                 }
4146             }
4147 
4148             vector<size_t> transcripts_with_frags;
4149             for (size_t j = 0; j < N; ++j)
4150             {
4151                 bool has_fragment = false;
4152                 for (size_t i = 0; i < M; ++i)
4153                 {
4154                     if (compat(i,j))
4155                     {
4156                         has_fragment = true;
4157                         break;
4158                     }
4159                 }
4160                 if (has_fragment)
4161                     transcripts_with_frags.push_back(j);
4162             }
4163             ublas::matrix<double> reduced_compat = ublas::zero_matrix<double>(M,transcripts_with_frags.size());
4164             for (size_t j = 0; j < transcripts_with_frags.size(); ++j)
4165             {
4166                 column(reduced_compat, j) = column(compat, transcripts_with_frags[j]);
4167             }
4168 
4169 
4170             typedef ublas::permutation_matrix<std::size_t> pmatrix;
4171 
4172             // create a permutation matrix for the LU-factorization
4173             pmatrix pm(reduced_compat.size1());
4174 
4175             // cerr << compat.size2() <<endl;
4176             // perform LU-factorization
4177             identifiable = is_identifiable<ublas::matrix<double>,pmatrix>(reduced_compat,pm);
4178         }
4179 
4180         Eigen::VectorXd alignment_multiplicities(M);
4181 		for (size_t i = 0; i < M; ++i)
4182 		{
4183 			alignment_multiplicities[i] = nr_alignments[i].collapse_mass();
4184 		}
4185 
4186         logL = EM(N, M, prob, cond_probs, alignment_multiplicities, log_conv_factors, converged);
4187 
4188 		gammas = vector<double>(N, 0.0);
4189 
4190 		for (size_t i = 0; i < gammas.size(); ++i)
4191 		{
4192             gammas[i] = prob(i);
4193 			if (isnan(gammas[i]) || isinf(gammas[i]))
4194             {
4195                 return NUMERIC_FAIL;
4196             }
4197 		}
4198 	}
4199 	else
4200 	{
4201 		gammas = vector<double>(N, 0.0);
4202 	}
4203 
4204     double round_err = 0.0;
4205     double num_good = 0;
4206     BOOST_FOREACH (double& g, gammas)
4207     {
4208         if (g < min_isoform_fraction)
4209         {
4210             round_err += g;
4211             g = 0.0;
4212         }
4213         else
4214         {
4215             num_good += 1;
4216         }
4217     }
4218     BOOST_FOREACH (double& g, gammas)
4219     {
4220         if (g != 0)
4221         {
4222             g += (round_err/num_good);
4223         }
4224     }
4225 
4226     if (converged && identifiable)
4227         return NUMERIC_OK;
4228     else
4229     {
4230         if (!identifiable)
4231             //return NUMERIC_LOW_DATA;
4232             return NUMERIC_OK;
4233         else
4234             return NUMERIC_FAIL;
4235     }
4236 
4237     return NUMERIC_OK;
4238 }
4239 
calc_isoform_fpkm_conf_intervals(double FPKM,double variance,ConfidenceInterval & FPKM_conf)4240 void calc_isoform_fpkm_conf_intervals(double FPKM,
4241 									  double variance,
4242 									  ConfidenceInterval& FPKM_conf)
4243 {
4244 	double FPKM_lo = 0.0;
4245 	double FPKM_hi = 0.0;
4246 	FPKM_hi = FPKM + 2 * sqrt(variance);
4247 	FPKM_lo = max(0.0, FPKM - 2 * sqrt(variance));
4248 	FPKM_conf = ConfidenceInterval(FPKM_lo, FPKM_hi);
4249 }
4250 
not_intronic(int p,vector<float> & depth_of_coverage,vector<float> & intronic_cov,float min_intra_intron_fraction,int & intronic_status)4251 bool not_intronic(int p, vector<float>& depth_of_coverage, vector<float>& intronic_cov, float min_intra_intron_fraction,
4252       int& intronic_status) {
4253   bool not_an_intron = (intronic_cov[p]==0 ||
4254         depth_of_coverage[p]/intronic_cov[p] >= min_intra_intron_fraction);
4255  if (not_an_intron) intronic_status--;
4256                else intronic_status++;
4257  return not_an_intron;
4258 }
4259 
4260 
compute_doc(int bundle_origin,const vector<Scaffold> & scaffolds,vector<float> & depth_of_coverage,map<pair<int,int>,float> & intron_depth_of_coverage,bool exclude_intra_intron,vector<float> * intronic_cov,vector<int> * scaff_intronic_status)4261 double compute_doc(int bundle_origin,
4262 				   const vector<Scaffold>& scaffolds,
4263 				   vector<float>& depth_of_coverage,
4264 				   map<pair<int, int>, float>& intron_depth_of_coverage,
4265 				   bool exclude_intra_intron,
4266 				   vector<float>* intronic_cov,
4267 				   vector<int>* scaff_intronic_status)
4268 {
4269   vector<int> i_status;
4270   if (scaff_intronic_status==NULL)
4271     scaff_intronic_status=&i_status;
4272   *scaff_intronic_status = vector<int>(scaffolds.size(), 0);
4273   vector<float> intronic;
4274   if (intronic_cov==NULL)
4275     intronic_cov=&intronic;
4276   *intronic_cov = vector<float>(depth_of_coverage.size(), 0);
4277   //vector<bool> intronic(depth_of_coverage.size(), false);
4278 	depth_of_coverage = vector<float>(depth_of_coverage.size(), 0);
4279 
4280 	set<const MateHit*> hits_in_gene_set;
4281 	for (size_t i = 0; i < scaffolds.size(); ++i)
4282 	{
4283 		hits_in_gene_set.insert(scaffolds[i].mate_hits().begin(),
4284 								scaffolds[i].mate_hits().end());
4285 	}
4286 
4287 	vector<Scaffold> hits;
4288 
4289 	for(set<const MateHit*>::iterator itr = hits_in_gene_set.begin();
4290 		itr != hits_in_gene_set.end();
4291 		++itr)
4292 	{
4293 		hits.push_back(Scaffold(**itr));
4294         hits.back().fpkm((**itr).mass());
4295 	}
4296 
4297 	/*
4298 	//no need for this here, we do it below with depth_of_coverage
4299 	for (size_t i = 0; i < hits.size(); ++i)
4300 	{
4301 		const vector<AugmentedCuffOp>& aug_ops = hits[i].augmented_ops();
4302 		for (size_t j = 0; j < aug_ops.size(); ++j)
4303 		{
4304 			const AugmentedCuffOp& op = aug_ops[j];
4305 			if (op.opcode == CUFF_INTRON)
4306 			{
4307 				for (int K = op.g_left(); K < op.g_right(); ++K)
4308 				{
4309 					intronic[K - bundle_origin] = true;
4310 				}
4311 			}
4312 		}
4313 	}
4314 	*/
4315 	for (size_t i = 0; i < hits.size(); ++i)
4316 	{
4317 		const vector<AugmentedCuffOp>& aug_ops = hits[i].augmented_ops();
4318 		for (size_t j = 0; j < aug_ops.size(); ++j)
4319 		{
4320 			const AugmentedCuffOp& op = aug_ops[j];
4321 			if (op.opcode == CUFF_MATCH)
4322 			{
4323 				for (int K = op.g_left(); K < op.g_right(); ++K)
4324 				{
4325 					depth_of_coverage[K - bundle_origin] += hits[i].fpkm();
4326 				}
4327 			}
4328 			else if (op.opcode == CUFF_INTRON)
4329 			{
4330         for (int K = op.g_left(); K < op.g_right(); ++K)
4331         {
4332           (*intronic_cov)[K - bundle_origin] += hits[i].fpkm();
4333           //intronic[K - bundle_origin] = true;
4334         }
4335 
4336 				pair<map<pair<int,int>,float>::iterator, bool> is = intron_depth_of_coverage.insert(make_pair(make_pair(op.g_left(), op.g_right()), 0));
4337 				is.first->second += hits[i].fpkm();
4338 			}
4339 		}
4340 	}
4341 
4342 	vector<float> knockout(depth_of_coverage);
4343 
4344 	double total_doc = 0;
4345 	int total_len = 0;
4346 	float min_intra_intron_fraction = min(pre_mrna_fraction, min_isoform_fraction);
4347 	//for (size_t i = 0; i < hits.size(); ++i)
4348 	for (size_t i = 0; i < scaffolds.size(); ++i)
4349 	{
4350 		//const vector<AugmentedCuffOp>& aug_ops = hits[i].augmented_ops();
4351 	  const vector<AugmentedCuffOp>& aug_ops = scaffolds[i].augmented_ops();
4352 		for (size_t j = 0; j < aug_ops.size(); ++j)
4353 		{
4354 			const AugmentedCuffOp& op = aug_ops[j];
4355 			if (op.opcode == CUFF_MATCH)
4356 			{
4357 				for (int K = op.g_left(); K < op.g_right(); ++K)
4358 				{
4359 					//if (!exclude_intra_intron || !intronic[K - bundle_origin])
4360 				  if (!exclude_intra_intron ||
4361 				       not_intronic(K-bundle_origin, depth_of_coverage, *intronic_cov, min_intra_intron_fraction,
4362 				           (*scaff_intronic_status)[i]) )
4363 					{
4364 						total_doc += knockout[K - bundle_origin];
4365 						total_len += (knockout[K - bundle_origin] != 0);
4366 						knockout[K - bundle_origin] = 0;
4367 					}
4368 				}
4369 			}
4370 		}
4371 	}
4372 
4373 	return total_doc/(double)total_len;
4374 }
4375 
major_isoform_intron_doc(map<pair<int,int>,float> & intron_doc)4376 double major_isoform_intron_doc(map<pair<int, int>, float>& intron_doc)
4377 {
4378 	double major_isoform_intron_doc = 0;
4379 	int num_major_introns = 0;
4380 	for(map<pair<int, int>, float>::const_iterator itr = intron_doc.begin();
4381 		itr != intron_doc.end();
4382 		++itr)
4383 	{
4384 		bool heaviest = true;
4385 
4386 		for (map<pair<int,int>, float>::const_iterator itr2 = intron_doc.begin();
4387 			 itr2 != intron_doc.end();
4388 			 ++itr2)
4389 		{
4390 			if (itr != itr2 &&
4391 				itr->second < itr2->second &&
4392 				overlap_in_genome(itr->first.first,
4393 								  itr->first.second,
4394 								  itr2->first.first,
4395 								  itr2->first.second))
4396 			{
4397 				heaviest = false;
4398 				break;
4399 			}
4400 		}
4401 
4402 		if (heaviest)
4403 		{
4404 			major_isoform_intron_doc += itr->second;
4405 			num_major_introns++;
4406 		}
4407 	}
4408 	if (num_major_introns)
4409 	{
4410 		return major_isoform_intron_doc / num_major_introns;
4411 	}
4412 	else
4413 	{
4414 		return 0.0;
4415 	}
4416 }
4417 
record_min_doc_for_scaffolds(int bundle_origin,const vector<Scaffold> & hits,const vector<float> & depth_of_coverage,const map<pair<int,int>,float> & intron_depth_of_coverage,vector<double> & scaff_doc)4418 void record_min_doc_for_scaffolds(int bundle_origin,
4419 								  const vector<Scaffold>& hits,
4420 								  const vector<float>& depth_of_coverage,
4421 								  const map<pair<int, int>, float>& intron_depth_of_coverage,
4422 								  vector<double>& scaff_doc)
4423 {
4424 	for (size_t h = 0; h < hits.size(); ++h)
4425 	{
4426 		double doc = 99999999.0;
4427 		if (hits[h].has_intron())
4428 			doc = get_intron_doc(hits[h], intron_depth_of_coverage);
4429 
4430 		doc = min(doc, get_scaffold_min_doc(bundle_origin,
4431 											hits[h],
4432 											depth_of_coverage));
4433 		scaff_doc.push_back(doc);
4434 	}
4435 }
4436 
record_doc_for_scaffolds(int bundle_origin,const vector<Scaffold> & hits,const vector<float> & depth_of_coverage,vector<double> & scaff_doc)4437 void record_doc_for_scaffolds(int bundle_origin,
4438 							  const vector<Scaffold>& hits,
4439 							  const vector<float>& depth_of_coverage,
4440 							  vector<double>& scaff_doc)
4441 {
4442 	for (size_t h = 0; h < hits.size(); ++h)
4443 	{
4444 		double doc;
4445 		doc = get_scaffold_doc(bundle_origin,
4446 							   hits[h],
4447 							   depth_of_coverage);
4448 		scaff_doc.push_back(doc);
4449 	}
4450 }
4451 
record_doc_for_scaffolds(int bundle_origin,const vector<Scaffold> & hits,const vector<float> & depth_of_coverage,const map<pair<int,int>,float> & intron_depth_of_coverage,vector<double> & scaff_doc)4452 void record_doc_for_scaffolds(int bundle_origin,
4453 							  const vector<Scaffold>& hits,
4454 							  const vector<float>& depth_of_coverage,
4455 							  const map<pair<int, int>, float>& intron_depth_of_coverage,
4456 							  vector<double>& scaff_doc)
4457 {
4458 	for (size_t h = 0; h < hits.size(); ++h)
4459 	{
4460 		double doc;
4461 		if (hits[h].has_intron())
4462 			doc = get_intron_doc(hits[h], intron_depth_of_coverage);
4463 		else
4464 			doc = get_scaffold_doc(bundle_origin,
4465 								   hits[h],
4466 								   depth_of_coverage);
4467 		scaff_doc.push_back(doc);
4468 	}
4469 }
4470 
get_intron_doc(const Scaffold & s,const map<pair<int,int>,float> & intron_depth_of_coverage)4471 double get_intron_doc(const Scaffold& s,
4472 					  const map<pair<int, int>, float >& intron_depth_of_coverage)
4473 {
4474 	const vector<AugmentedCuffOp>& aug_ops = s.augmented_ops();
4475 	int num_introns = 0;
4476 	double doc = 0;
4477 	for (size_t j = 0; j < aug_ops.size(); ++j)
4478 	{
4479 		const AugmentedCuffOp& op = aug_ops[j];
4480 		if (op.opcode == CUFF_INTRON)
4481 		{
4482 			num_introns++;
4483 			pair<int,int> op_intron(op.g_left(), op.g_right());
4484 			map<pair<int, int>, float >::const_iterator itr = intron_depth_of_coverage.find(op_intron);
4485 			//			assert (itr != intron_depth_of_coverage.end());
4486 			if (itr == intron_depth_of_coverage.end())
4487 			{
4488 				map<pair<int, int>, float >::const_iterator zi;
4489 				for (zi = intron_depth_of_coverage.begin();
4490 					 zi != intron_depth_of_coverage.end();
4491 					 ++zi)
4492 				{
4493 					verbose_msg( "Warning: intron not within scaffold ([%d-%d], %d)\n", zi->first.first, zi->first.second, zi->second);
4494 				}
4495 			}
4496 
4497 			doc += itr->second;
4498 		}
4499 	}
4500 	return doc / (double)num_introns;
4501 }
4502 
get_scaffold_doc(int bundle_origin,const Scaffold & s,const vector<float> & depth_of_coverage)4503 double get_scaffold_doc(int bundle_origin,
4504 						const Scaffold& s,
4505 						const vector<float>& depth_of_coverage)
4506 {
4507 	const vector<AugmentedCuffOp>& aug_ops = s.augmented_ops();
4508 	int m_len = 0;
4509 	double doc = 0;
4510 	for (size_t j = 0; j < aug_ops.size(); ++j)
4511 	{
4512 		const AugmentedCuffOp& op = aug_ops[j];
4513 		if (op.opcode == CUFF_MATCH)
4514 		{
4515 			for (int K = op.g_left(); K < op.g_right(); ++K)
4516 			{
4517 				m_len++;
4518 				doc += depth_of_coverage[K - bundle_origin];
4519 			}
4520 		}
4521 	}
4522 
4523 	return doc/(double)m_len;
4524 }
4525 
get_scaffold_min_doc(int bundle_origin,const Scaffold & s,const vector<float> & depth_of_coverage)4526 double get_scaffold_min_doc(int bundle_origin,
4527 							const Scaffold& s,
4528 							const vector<float>& depth_of_coverage)
4529 {
4530 	const vector<AugmentedCuffOp>& aug_ops = s.augmented_ops();
4531 	float min_doc = 99999999;
4532 
4533 	for (size_t j = 0; j < aug_ops.size(); ++j)
4534 	{
4535 		const AugmentedCuffOp& op = aug_ops[j];
4536 		if (op.opcode == CUFF_MATCH)
4537 		{
4538 			for (int K = op.g_left(); K < op.g_right(); ++K)
4539 			{
4540 				if (min_doc > depth_of_coverage[K - bundle_origin])
4541 					min_doc = depth_of_coverage[K - bundle_origin];
4542 			}
4543 		}
4544 	}
4545 
4546 	return min_doc;
4547 }
4548 
tss_analysis(const string & locus_tag,SampleAbundances & sample)4549 void tss_analysis(const string& locus_tag, SampleAbundances& sample)
4550 {
4551     // Cluster transcripts by start site (TSS)
4552     vector<AbundanceGroup> transcripts_by_tss;
4553 
4554     ublas::matrix<double> tss_gamma_cov;
4555     ublas::matrix<double> tss_count_cov;
4556     ublas::matrix<double> tss_iterated_exp_count_cov;
4557     ublas::matrix<double> tss_fpkm_cov;
4558     vector<Eigen::VectorXd> tss_assigned_counts;
4559 
4560     vector<bool> mask(sample.transcripts.abundances().size(), true);
4561     for (size_t i = 0; i < sample.transcripts.abundances().size(); ++i)
4562     {
4563         if (*(sample.transcripts.abundances()[i]->tss_id().begin()) == "")
4564         {
4565             mask[i] = false;
4566         }
4567     }
4568 
4569     AbundanceGroup trans_with_tss;
4570     sample.transcripts.filter_group(mask, trans_with_tss);
4571 
4572     cluster_transcripts<ConnectByAnnotatedTssId>(trans_with_tss,
4573                                                  transcripts_by_tss,
4574                                                  &tss_gamma_cov,
4575                                                  &tss_iterated_exp_count_cov,
4576                                                  &tss_count_cov,
4577                                                  &tss_fpkm_cov);
4578 
4579 
4580     BOOST_FOREACH(AbundanceGroup& ab_group, transcripts_by_tss)
4581     {
4582         ab_group.locus_tag(locus_tag);
4583         set<string> tss_ids = ab_group.tss_id();
4584         assert (tss_ids.size() == 1);
4585         string desc = *(tss_ids.begin());
4586         assert (desc != "");
4587         ab_group.description(*(tss_ids.begin()));
4588     }
4589 
4590     sample.primary_transcripts = transcripts_by_tss;
4591 
4592     // Group TSS clusters by gene
4593     vector<boost::shared_ptr<Abundance> > primary_transcript_abundances;
4594     set<boost::shared_ptr<ReadGroupProperties const> > rg_props;
4595     BOOST_FOREACH (AbundanceGroup& ab_group, sample.primary_transcripts)
4596     {
4597         primary_transcript_abundances.push_back(boost::shared_ptr<Abundance>(new AbundanceGroup(ab_group)));
4598         rg_props.insert(ab_group.rg_props().begin(), ab_group.rg_props().end());
4599     }
4600 
4601     AbundanceGroup primary_transcripts(primary_transcript_abundances,
4602                                        tss_gamma_cov,
4603                                        tss_iterated_exp_count_cov,
4604                                        tss_count_cov,
4605                                        tss_fpkm_cov,
4606                                        rg_props);
4607 
4608     vector<AbundanceGroup> primary_transcripts_by_gene;
4609 
4610     cluster_transcripts<ConnectByAnnotatedGeneId>(primary_transcripts,
4611                                                   primary_transcripts_by_gene);
4612 
4613     BOOST_FOREACH(AbundanceGroup& ab_group, primary_transcripts_by_gene)
4614     {
4615         ab_group.locus_tag(locus_tag);
4616         set<string> gene_ids = ab_group.gene_id();
4617         if (gene_ids.size() > 1)
4618         {
4619             BOOST_FOREACH (string st, gene_ids)
4620             {
4621                 fprintf(stderr, "%s\n", st.c_str());
4622             }
4623             ab_group.gene_id();
4624         }
4625         assert (gene_ids.size() == 1);
4626         ab_group.description(*(gene_ids.begin()));
4627     }
4628 
4629     sample.gene_primary_transcripts = primary_transcripts_by_gene;
4630 }
4631 
cds_analyis(const string & locus_tag,SampleAbundances & sample)4632 void cds_analyis(const string& locus_tag, SampleAbundances& sample)
4633 {
4634     // Cluster transcripts by CDS
4635     vector<AbundanceGroup> transcripts_by_cds;
4636     ublas::matrix<double> cds_gamma_cov;
4637     ublas::matrix<double> cds_count_cov;
4638     ublas::matrix<double> cds_iterated_exp_count_cov;
4639     ublas::matrix<double> cds_fpkm_cov;
4640 
4641     vector<bool> mask(sample.transcripts.abundances().size(), true);
4642     for (size_t i = 0; i < sample.transcripts.abundances().size(); ++i)
4643     {
4644         if (*(sample.transcripts.abundances()[i]->protein_id().begin()) == "")
4645         {
4646             mask[i] = false;
4647         }
4648     }
4649 
4650     AbundanceGroup trans_with_p_id;
4651     sample.transcripts.filter_group(mask, trans_with_p_id);
4652 
4653     cluster_transcripts<ConnectByAnnotatedProteinId>(trans_with_p_id,
4654                                                      transcripts_by_cds,
4655                                                      &cds_gamma_cov,
4656                                                      &cds_iterated_exp_count_cov,
4657                                                      &cds_count_cov,
4658                                                      &cds_fpkm_cov);
4659 
4660     BOOST_FOREACH(AbundanceGroup& ab_group, transcripts_by_cds)
4661     {
4662         ab_group.locus_tag(locus_tag);
4663         set<string> protein_ids = ab_group.protein_id();
4664         assert (protein_ids.size() == 1);
4665         string desc = *(protein_ids.begin());
4666         //if (desc != "")
4667         //{
4668         assert (desc != "");
4669         ab_group.description(*(protein_ids.begin()));
4670         //}
4671     }
4672 
4673     sample.cds = transcripts_by_cds;
4674 
4675     // Group the CDS clusters by gene
4676     vector<boost::shared_ptr<Abundance> > cds_abundances;
4677 
4678     set<boost::shared_ptr<ReadGroupProperties const> > rg_props;
4679     BOOST_FOREACH (AbundanceGroup& ab_group, sample.cds)
4680     {
4681         //if (ab_group.description() != "")
4682         {
4683             cds_abundances.push_back(boost::shared_ptr<Abundance>(new AbundanceGroup(ab_group)));
4684             rg_props.insert(ab_group.rg_props().begin(), ab_group.rg_props().end());
4685         }
4686     }
4687     AbundanceGroup cds(cds_abundances,
4688                        cds_gamma_cov,
4689                        cds_iterated_exp_count_cov,
4690                        cds_count_cov,
4691                        cds_fpkm_cov,
4692                        rg_props);
4693 
4694     vector<AbundanceGroup> cds_by_gene;
4695 
4696     cluster_transcripts<ConnectByAnnotatedGeneId>(cds,
4697                                                   cds_by_gene);
4698 
4699     BOOST_FOREACH(AbundanceGroup& ab_group, cds_by_gene)
4700     {
4701         ab_group.locus_tag(locus_tag);
4702         set<string> gene_ids = ab_group.gene_id();
4703         assert (gene_ids.size() == 1);
4704         ab_group.description(*(gene_ids.begin()));
4705     }
4706 
4707     sample.gene_cds = cds_by_gene;
4708 
4709 }
4710 
sample_abundance_worker(const string & locus_tag,const set<boost::shared_ptr<ReadGroupProperties const>> & rg_props,SampleAbundances & sample,boost::shared_ptr<HitBundle> sample_bundle,bool perform_cds_analysis,bool perform_tss_analysis,bool calculate_variance)4711 void sample_abundance_worker(const string& locus_tag,
4712                              const set<boost::shared_ptr<ReadGroupProperties const> >& rg_props,
4713                              SampleAbundances& sample,
4714                              boost::shared_ptr<HitBundle> sample_bundle,
4715                              bool perform_cds_analysis,
4716                              bool perform_tss_analysis,
4717                              bool calculate_variance)
4718 {
4719     vector<boost::shared_ptr<Abundance> > abundances;
4720 
4721     BOOST_FOREACH(boost::shared_ptr<Scaffold> s, sample_bundle->ref_scaffolds())
4722     {
4723         TranscriptAbundance* pT = new TranscriptAbundance;
4724         pT->transfrag(s);
4725         boost::shared_ptr<Abundance> ab(pT);
4726         ab->description(s->annotated_trans_id());
4727         ab->locus_tag(locus_tag);
4728         abundances.push_back(ab);
4729     }
4730 
4731     sample.transcripts = AbundanceGroup(abundances);
4732 
4733     sample.transcripts.init_rg_props(rg_props);
4734 
4735     vector<MateHit> hits_in_cluster;
4736 
4737     if (sample_bundle->hits().size() < (size_t)max_frags_per_bundle)
4738     {
4739         get_alignments_from_scaffolds(sample.transcripts.abundances(),
4740                                       hits_in_cluster);
4741 
4742         // Compute the individual transcript FPKMs via each sample's
4743         // AbundanceGroup for this locus.
4744 
4745         sample.transcripts.calculate_abundance(hits_in_cluster, true, calculate_variance);
4746     }
4747     else
4748     {
4749         BOOST_FOREACH(boost::shared_ptr<Abundance>  ab, abundances)
4750         {
4751             ab->status(NUMERIC_HI_DATA);
4752 
4753             CountPerReplicateTable cpr;
4754             FPKMPerReplicateTable fpr;
4755             StatusPerReplicateTable spr;
4756             for (set<boost::shared_ptr<ReadGroupProperties const> >::const_iterator itr = rg_props.begin();
4757                  itr != rg_props.end();
4758                  ++itr)
4759             {
4760                 cpr[*itr] = 0;
4761                 fpr[*itr] = 0;
4762                 spr[*itr] = NUMERIC_HI_DATA;
4763             }
4764             ab->num_fragments_by_replicate(cpr);
4765             ab->FPKM_by_replicate(fpr);
4766             ab->status_by_replicate(spr);
4767         }
4768     }
4769 
4770     // Cluster transcripts by gene_id
4771     vector<AbundanceGroup> transcripts_by_gene_id;
4772     cluster_transcripts<ConnectByAnnotatedGeneId>(sample.transcripts,
4773                                                   transcripts_by_gene_id);
4774 
4775 	BOOST_FOREACH(AbundanceGroup& ab_group, transcripts_by_gene_id)
4776     {
4777         ab_group.locus_tag(locus_tag);
4778         set<string> gene_ids = ab_group.gene_id();
4779         assert (gene_ids.size() == 1);
4780         ab_group.description(*(gene_ids.begin()));
4781     }
4782 
4783     sample.genes = transcripts_by_gene_id;
4784 
4785     if (perform_cds_analysis)
4786     {
4787         cds_analyis(locus_tag, sample);
4788     }
4789 
4790     if (perform_tss_analysis)    {
4791         tss_analysis(locus_tag, sample);
4792     }
4793 }
4794 
4795 // This function applies library size factors to pre-computed expression entries
apply_normalization_to_abundances(const map<boost::shared_ptr<const ReadGroupProperties>,boost::shared_ptr<const AbundanceGroup>> & unnormalized_ab_group_per_replicate,map<boost::shared_ptr<const ReadGroupProperties>,boost::shared_ptr<AbundanceGroup>> & normalized_ab_group_per_replicate)4796 void AbundanceGroup::apply_normalization_to_abundances(const map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<const AbundanceGroup> >& unnormalized_ab_group_per_replicate,
4797                                                        map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<AbundanceGroup> >& normalized_ab_group_per_replicate)
4798 {
4799     for (map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<const AbundanceGroup> >::const_iterator itr = unnormalized_ab_group_per_replicate.begin();
4800          itr != unnormalized_ab_group_per_replicate.end(); ++itr)
4801     {
4802         boost::shared_ptr<AbundanceGroup> norm_ab = boost::shared_ptr<AbundanceGroup>(new AbundanceGroup(*itr->second));
4803         boost::shared_ptr<const ReadGroupProperties> rg_props = itr->first;
4804         boost::shared_ptr<const MassDispersionModel> disp_model = rg_props->mass_dispersion_model();
4805 
4806         boost::shared_ptr<const ReadGroupProperties> old_rg_props = *(itr->second->rg_props().begin());
4807 
4808         double fpkm_correction_factor = 1.0;
4809         fpkm_correction_factor = old_rg_props->normalized_map_mass() / rg_props->normalized_map_mass();
4810 
4811         double internal_scale_factor = rg_props->internal_scale_factor();
4812 
4813         double total_mass = 0.0;
4814 
4815         for (size_t i = 0; i < norm_ab->_abundances.size(); ++i)
4816         {
4817             norm_ab->_abundances[i]->num_fragments(itr->second->_abundances[i]->num_fragments() / internal_scale_factor);
4818 
4819             total_mass += norm_ab->_abundances[i]->num_fragments();
4820 
4821             norm_ab->_abundances[i]->FPKM(fpkm_correction_factor * itr->second->_abundances[i]->FPKM() / internal_scale_factor);
4822             norm_ab->_iterated_exp_count_covariance = norm_ab->iterated_count_cov() / (internal_scale_factor*internal_scale_factor);
4823             norm_ab->_fpkm_covariance = norm_ab->_fpkm_covariance * (fpkm_correction_factor * fpkm_correction_factor)/ (internal_scale_factor*internal_scale_factor);
4824             norm_ab->_count_covariance = norm_ab->_count_covariance/ (internal_scale_factor*internal_scale_factor);
4825         }
4826 
4827         double locus_mass_variance = disp_model->scale_mass_variance(total_mass);
4828 
4829         for (size_t i = 0; i < norm_ab->_abundances.size(); ++i)
4830         {
4831             norm_ab->_abundances[i]->mass_variance(locus_mass_variance * norm_ab->_abundances[i]->gamma());
4832         }
4833 
4834         normalized_ab_group_per_replicate[itr->first] = norm_ab;
4835     }
4836 }
4837 
merge_precomputed_expression_worker(const string & locus_tag,const vector<boost::shared_ptr<PrecomputedExpressionBundleFactory>> & expression_factories,SampleAbundances & sample,boost::shared_ptr<HitBundle> sample_bundle,bool perform_cds_analysis,bool perform_tss_analysis,bool calculate_variance)4838 void merge_precomputed_expression_worker(const string& locus_tag,
4839                                          const vector<boost::shared_ptr<PrecomputedExpressionBundleFactory> >& expression_factories,
4840                                          SampleAbundances& sample,
4841                                          boost::shared_ptr<HitBundle> sample_bundle,
4842                                          bool perform_cds_analysis,
4843                                          bool perform_tss_analysis,
4844                                          bool calculate_variance)
4845 {
4846     map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<const AbundanceGroup> > unnormalized_ab_group_per_replicate;
4847     map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<AbundanceGroup> > normalized_ab_group_per_replicate;
4848     map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<const AbundanceGroup> > const_ab_group_per_replicate;
4849 
4850     set<boost::shared_ptr<const ReadGroupProperties> > rg_props;
4851     for (size_t i = 0; i < expression_factories.size(); ++i)
4852     {
4853         boost::shared_ptr<PrecomputedExpressionBundleFactory> pBundleFac = expression_factories[i];
4854         boost::shared_ptr<const PrecomputedExpressionHitFactory> pHitFac = boost::dynamic_pointer_cast<const PrecomputedExpressionHitFactory> (pBundleFac->hit_factory());
4855         assert (pHitFac);
4856 
4857         boost::shared_ptr<const ReadGroupProperties> rg_prop = pBundleFac->read_group_properties();
4858         rg_props.insert(rg_prop);
4859 
4860         boost::shared_ptr<const AbundanceGroup> ab = pBundleFac->get_abundance_for_locus(sample_bundle->id());
4861         pBundleFac->clear_abundance_for_locus(sample_bundle->id());
4862         if (!ab)
4863         {
4864             fprintf(stderr, "Error: no bundle with id %d in precomputed expression file\n", sample_bundle->id());
4865         }
4866         else if(ab->abundances().size() != sample_bundle->ref_scaffolds().size())
4867         {
4868             fprintf(stderr, "Error: bad bundle merge %s != %s\n", ab->description().c_str(), locus_tag.c_str());
4869         }
4870         unnormalized_ab_group_per_replicate[rg_prop] = ab;
4871     }
4872 
4873     AbundanceGroup::apply_normalization_to_abundances(unnormalized_ab_group_per_replicate, normalized_ab_group_per_replicate);
4874 
4875     for (map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<AbundanceGroup> >::const_iterator itr = normalized_ab_group_per_replicate.begin();
4876          itr != normalized_ab_group_per_replicate.end(); ++itr)
4877     {
4878         const_ab_group_per_replicate[itr->first] = itr->second;
4879     }
4880 
4881     vector<boost::shared_ptr<Abundance> > abundances;
4882 
4883     BOOST_FOREACH(boost::shared_ptr<Scaffold> s, sample_bundle->ref_scaffolds())
4884     {
4885         TranscriptAbundance* pT = new TranscriptAbundance;
4886         pT->transfrag(s);
4887         boost::shared_ptr<Abundance> ab(pT);
4888         ab->description(s->annotated_trans_id());
4889         ab->locus_tag(locus_tag);
4890         abundances.push_back(ab);
4891     }
4892 
4893     sample.transcripts = AbundanceGroup(abundances);
4894 
4895     sample.transcripts.init_rg_props(rg_props);
4896 
4897     vector<MateHit> hits_in_cluster;
4898 
4899     if (sample_bundle->hits().size() < (size_t)max_frags_per_bundle)
4900     {
4901         sample.transcripts.collect_per_replicate_mass(const_ab_group_per_replicate);
4902         sample.transcripts.aggregate_replicate_abundances(const_ab_group_per_replicate);
4903         if (calculate_variance)
4904         {
4905             sample.transcripts.calculate_abundance_group_variance(abundances, const_ab_group_per_replicate);
4906         }
4907     }
4908     else // FIXME: THIS needs to do the right thing with sample.transcripts...
4909     {
4910         BOOST_FOREACH(boost::shared_ptr<Abundance>  ab, abundances)
4911         {
4912             ab->status(NUMERIC_HI_DATA);
4913 
4914             CountPerReplicateTable cpr;
4915             FPKMPerReplicateTable fpr;
4916             StatusPerReplicateTable spr;
4917             for (set<boost::shared_ptr<ReadGroupProperties const> >::const_iterator itr = rg_props.begin();
4918                  itr != rg_props.end();
4919                  ++itr)
4920             {
4921                 cpr[*itr] = 0;
4922                 fpr[*itr] = 0;
4923                 spr[*itr] = NUMERIC_HI_DATA;
4924             }
4925             ab->num_fragments_by_replicate(cpr);
4926             ab->FPKM_by_replicate(fpr);
4927             ab->status_by_replicate(spr);
4928         }
4929     }
4930 
4931     // Cluster transcripts by gene_id
4932     vector<AbundanceGroup> transcripts_by_gene_id;
4933     cluster_transcripts<ConnectByAnnotatedGeneId>(sample.transcripts,
4934                                                   transcripts_by_gene_id);
4935 
4936 	BOOST_FOREACH(AbundanceGroup& ab_group, transcripts_by_gene_id)
4937     {
4938         ab_group.locus_tag(locus_tag);
4939         set<string> gene_ids = ab_group.gene_id();
4940         assert (gene_ids.size() == 1);
4941         ab_group.description(*(gene_ids.begin()));
4942     }
4943 
4944     sample.genes = transcripts_by_gene_id;
4945 
4946     if (perform_cds_analysis)
4947     {
4948         cds_analyis(locus_tag, sample);
4949     }
4950 
4951     if (perform_tss_analysis)
4952     {
4953         tss_analysis(locus_tag, sample);
4954     }
4955 }
4956 
4957 
4958 //BOOST_CLASS_EXPORT(Abundance)
4959 BOOST_CLASS_EXPORT(TranscriptAbundance)
4960 BOOST_SERIALIZATION_SHARED_PTR(TranscriptAbundance);
4961 BOOST_CLASS_EXPORT(AbundanceGroup);
4962 BOOST_SERIALIZATION_SHARED_PTR(AbundanceGroup);
4963 BOOST_SERIALIZATION_ASSUME_ABSTRACT(Abundance);
4964 
4965