1 //
2 //  replicates.cpp
3 //  cufflinks
4 //
5 //  Created by Cole Trapnell on 3/11/11.
6 //  Copyright 2011 Cole Trapnell. All rights reserved.
7 //
8 
9 #include <boost/thread.hpp>
10 
11 extern "C" {
12 #include "locfit/local.h"
13 }
14 
15 #include <boost/random/gamma_distribution.hpp>
16 #include <boost/random/poisson_distribution.hpp>
17 
18 #include "replicates.h"
19 //#include "rounding.h"
20 
21 #if ENABLE_THREADS
22 boost::mutex _locfit_lock;
23 #endif
24 
MassDispersionModel(const std::string & name,const std::vector<double> & scaled_compatible_mass_means,const std::vector<double> & scaled_compatible_variances,const std::vector<double> & scaled_mass_variances)25 MassDispersionModel::MassDispersionModel(const std::string& name,
26                                          const std::vector<double>& scaled_compatible_mass_means,
27                                          const std::vector<double>& scaled_compatible_variances,
28                                          const std::vector<double>& scaled_mass_variances)
29 {
30     _name = name;
31 
32     if (scaled_compatible_mass_means.size() != scaled_mass_variances.size())
33     {
34         fprintf (stderr, "Error: dispersion model table is malformed\n");
35     }
36 
37     double last_val = 0;
38     for (size_t i = 0; i < scaled_compatible_mass_means.size(); i++)
39     {
40 
41         if (last_val > scaled_compatible_mass_means[i])
42         {
43             fprintf (stderr, "Error: DispersionModel input is malformed\n");
44         }
45 
46         if ( i == 0 || last_val < scaled_compatible_mass_means[i])
47         {
48             _scaled_compatible_mass_means.push_back(scaled_compatible_mass_means[i]);
49             _scaled_compatible_variances.push_back(scaled_compatible_variances[i]);
50             _scaled_mass_variances.push_back(scaled_mass_variances[i]);
51         }
52         else
53         {
54             // skip this element if it's equal to what we've already seen
55         }
56 
57         last_val = scaled_compatible_mass_means[i];
58     }
59 }
60 
scale_mass_variance(double scaled_mass) const61 double MassDispersionModel::scale_mass_variance(double scaled_mass) const
62 {
63     if (scaled_mass <= 0)
64         return 0.0;
65 
66     if (_scaled_compatible_mass_means.size() < 2 || _scaled_mass_variances.size() < 2)
67     {
68         return scaled_mass; // revert to poisson.
69     }
70     if (scaled_mass > _scaled_compatible_mass_means.back())
71     {
72         // extrapolate to the right
73         // off the right end
74         double x1_mean = _scaled_compatible_mass_means[_scaled_compatible_mass_means.size()-2];
75         double x2_mean = _scaled_compatible_mass_means[_scaled_compatible_mass_means.size()-1];
76 
77         double y1_var = _scaled_mass_variances[_scaled_compatible_mass_means.size()-2];
78         double y2_var = _scaled_mass_variances[_scaled_compatible_mass_means.size()-1];
79         double slope = 0.0;
80         if (x2_mean != x1_mean)
81         {
82             slope = (y2_var - y1_var) / (x2_mean-x1_mean);
83         }
84         else if (y1_var == y2_var)
85         {
86             assert (false); // should have a unique'd table
87         }
88         double mean_interp = _scaled_mass_variances[_scaled_compatible_mass_means.size()-1] -
89         slope*(scaled_mass - _scaled_compatible_mass_means.size()-1);
90         if (mean_interp < scaled_mass)
91             mean_interp = scaled_mass;
92         assert (!isnan(mean_interp) && !isinf(mean_interp));
93         return mean_interp;
94     }
95     else if (scaled_mass < _scaled_compatible_mass_means.front())
96     {
97         // extrapolate to the left
98         // off the left end?
99         double x1_mean = _scaled_compatible_mass_means[0];
100         double x2_mean = _scaled_compatible_mass_means[1];
101 
102         double y1_var = _scaled_mass_variances[0];
103         double y2_var = _scaled_mass_variances[1];
104         double slope = 0.0;
105         if (x2_mean != x1_mean)
106         {
107             slope = (y2_var - y1_var) / (x2_mean-x1_mean);
108         }
109         else if (y1_var == y2_var)
110         {
111             assert (false); // should have a unique'd table
112         }
113         double mean_interp = _scaled_mass_variances[0] - slope*(_scaled_compatible_mass_means[0] - scaled_mass);
114         if (mean_interp < scaled_mass)
115             mean_interp = scaled_mass;
116 
117         assert (!isnan(mean_interp) && !isinf(mean_interp));
118         return mean_interp;
119     }
120 
121     vector<double>::const_iterator lb;
122     lb = lower_bound(_scaled_compatible_mass_means.begin(),
123                      _scaled_compatible_mass_means.end(),
124                      scaled_mass);
125     if (lb < _scaled_compatible_mass_means.end())
126     {
127         int d = lb - _scaled_compatible_mass_means.begin();
128         if (*lb == scaled_mass || lb == _scaled_compatible_mass_means.begin())
129         {
130             double var = _scaled_mass_variances[d];
131             if (var < scaled_mass) // revert to poisson if underdispersed
132                 var = scaled_mass;
133             assert (!isnan(var) && !isinf(var));
134             return var;
135         }
136 
137 
138         //in between two points on the scale.
139         d--;
140 
141         if (d < 0)
142         {
143             fprintf(stderr, "ARG d < 0, d = %d \n", d);
144         }
145 
146         if (d >= _scaled_compatible_mass_means.size())
147         {
148             fprintf(stderr, "ARG d >= _scaled_compatible_mass_means.size(), d = %d\n", d);
149         }
150         if (d >= _scaled_mass_variances.size())
151         {
152             fprintf(stderr, "ARG d >= _scaled_mass_variances.size(), d = %d\n", d);
153         }
154 
155         double x1_mean = _scaled_compatible_mass_means[d];
156         double x2_mean = _scaled_compatible_mass_means[d + 1];
157 
158         double y1_var = _scaled_mass_variances[d];
159         double y2_var = _scaled_mass_variances[d + 1];
160         double slope = 0.0;
161         if (x2_mean != x1_mean)
162         {
163             slope = (y2_var - y1_var) / (x2_mean-x1_mean);
164         }
165         else if (y1_var == y2_var)
166         {
167             assert (false); // should have a unique'd table
168         }
169         double mean_interp = _scaled_mass_variances[d] + slope*(scaled_mass - _scaled_compatible_mass_means[d]);
170         if (mean_interp < scaled_mass) // revert to poisson if underdispersed
171             mean_interp = scaled_mass;
172 
173         assert (!isnan(mean_interp) && !isinf(mean_interp));
174         return mean_interp;
175     }
176     else
177     {
178         assert (!isnan(scaled_mass) && !isinf(scaled_mass));
179         return scaled_mass; // revert to poisson assumption
180     }
181 }
182 
183 
MleErrorModel(const std::string & name,const std::vector<double> & scaled_compatible_mass_means,const std::vector<double> & scaled_mle_variances)184 MleErrorModel::MleErrorModel(const std::string& name,
185                              const std::vector<double>& scaled_compatible_mass_means,
186                              const std::vector<double>& scaled_mle_variances)
187 {
188     _name = name;
189 
190     if (scaled_compatible_mass_means.size() != scaled_mle_variances.size())
191     {
192         fprintf (stderr, "Error: dispersion model table is malformed\n");
193     }
194 
195     double last_val = 0;
196     for (size_t i = 0; i < scaled_compatible_mass_means.size(); i++)
197     {
198 
199         if (last_val > scaled_compatible_mass_means[i])
200         {
201             fprintf (stderr, "Error: MLEError input is malformed\n");
202         }
203 
204         if ( i == 0 || last_val < scaled_compatible_mass_means[i])
205         {
206             _scaled_compatible_mass_means.push_back(scaled_compatible_mass_means[i]);
207             _scaled_mle_variances.push_back(scaled_mle_variances[i]);
208         }
209         else
210         {
211             // skip this element if it's equal to what we've already seen
212         }
213 
214         last_val = scaled_compatible_mass_means[i];
215     }
216 }
217 
scale_mle_variance(double scaled_mass) const218 double MleErrorModel::scale_mle_variance(double scaled_mass) const
219 {
220     if (scaled_mass <= 0)
221         return 0.0;
222 
223     if (_scaled_compatible_mass_means.size() < 2 || _scaled_mle_variances.size() < 2)
224     {
225         return 0; // revert to poisson.
226     }
227     if (scaled_mass > _scaled_compatible_mass_means.back())
228     {
229         return 0;
230     }
231     else if (scaled_mass < _scaled_compatible_mass_means.front())
232     {
233         return 0; // we won't add anything if we're out of the range of the table
234     }
235 
236     vector<double>::const_iterator lb;
237     lb = lower_bound(_scaled_compatible_mass_means.begin(),
238                      _scaled_compatible_mass_means.end(),
239                      scaled_mass);
240     if (lb < _scaled_compatible_mass_means.end())
241     {
242         int d = lb - _scaled_compatible_mass_means.begin();
243         if (*lb == scaled_mass || lb == _scaled_compatible_mass_means.begin())
244         {
245             double var = _scaled_mle_variances[d];
246             assert (!isnan(var) && !isinf(var));
247             if (var < 0)
248                 var = 0;
249             return var;
250         }
251 
252         //in between two points on the scale.
253         d--;
254 
255         if (d < 0)
256         {
257             fprintf(stderr, "ARG d < 0, d = %d \n", d);
258         }
259 
260         if (d >= _scaled_compatible_mass_means.size())
261         {
262             fprintf(stderr, "ARG d >= _scaled_compatible_mass_means.size(), d = %d\n", d);
263         }
264         if (d >= _scaled_mle_variances.size())
265         {
266             fprintf(stderr, "ARG d >= _scaled_mass_variances.size(), d = %d\n", d);
267         }
268 
269         double x1_mean = _scaled_compatible_mass_means[d];
270         double x2_mean = _scaled_compatible_mass_means[d + 1];
271 
272         double y1_var = _scaled_mle_variances[d];
273         double y2_var = _scaled_mle_variances[d + 1];
274         double slope = 0.0;
275         if (x2_mean != x1_mean)
276         {
277             slope = (y2_var - y1_var) / (x2_mean-x1_mean);
278         }
279         else if (y1_var == y2_var)
280         {
281             assert (false); // should have a unique'd table
282         }
283         double mean_interp = _scaled_mle_variances[d] + slope*(scaled_mass - _scaled_compatible_mass_means[d]);
284         if (mean_interp < 0)
285             mean_interp = 0;
286 
287         assert (!isnan(mean_interp) && !isinf(mean_interp));
288         return mean_interp;
289     }
290     else
291     {
292         assert (!isnan(scaled_mass) && !isinf(scaled_mass));
293         return 0; // revert to poisson assumption
294     }
295 }
296 
transform_counts_to_common_scale(const vector<double> & scale_factors,vector<LocusCountList> & sample_compatible_count_table)297 void transform_counts_to_common_scale(const vector<double>& scale_factors,
298                                       vector<LocusCountList>& sample_compatible_count_table)
299 {
300     // Transform raw counts to the common scale
301     for (size_t i = 0; i < sample_compatible_count_table.size(); ++i)
302     {
303         LocusCountList& p = sample_compatible_count_table[i];
304         for (size_t j = 0; j < p.counts.size(); ++j)
305         {
306             assert (scale_factors.size() > j);
307             p.counts[j] *= (1.0 / scale_factors[j]);
308         }
309     }
310 }
311 
// Median-of-ratios library-size normalization (the scheme popularized by
// DESeq): each library's scale factor is exp(median over loci of
// log(count) - log(geometric mean of that locus across libraries)).
void calc_geometric_scaling_factors(const vector<LocusCountList>& sample_compatible_count_table,
                                    vector<double>& scale_factors)
{
    // Per-locus mean of log counts, i.e. the log of the locus's geometric
    // mean across libraries.
    vector<double> log_geom_means(sample_compatible_count_table.size(), 0.0);

    for (size_t i = 0; i < sample_compatible_count_table.size(); ++i)
    {
        const LocusCountList& p = sample_compatible_count_table[i];

        for (size_t j = 0; j < p.counts.size(); ++j)
        {
            //assert (log_geom_means.size() > j);
            //if (floor(p.counts[j]) > 0)
            //{
                // Counts are floored to integers; a zero count makes this
                // log(0) == -inf, which poisons the locus's geometric mean.
                // Such loci are then excluded by the isinf() filter below —
                // the commented-out guard suggests this is deliberate.
                log_geom_means[i] += (1.0/p.counts.size()) * log(floor(p.counts[j]));
            //}

        }
        //log_geom_means[i] = pow(log_geom_means[i], 1.0/(double)p.counts.size());
    }

    // One factor per library (column j).
    for (size_t j = 0; j < scale_factors.size(); ++j)
    {
        // Log-ratios of this library's counts to the per-locus geometric
        // means, restricted to loci with finite geometric means and a
        // nonzero count in this library.
        vector<double> tmp_counts;
        for (size_t i = 0; i < sample_compatible_count_table.size(); ++i)
        {
            if (log_geom_means[i] && !isinf(log_geom_means[i]) && !isnan(log_geom_means[i]) && floor(sample_compatible_count_table[i].counts[j]))
            {
                double gm = (double)log(floor(sample_compatible_count_table[i].counts[j])) - log_geom_means[i];
                assert (!isinf(gm));
                tmp_counts.push_back(gm);
            }
        }
        sort(tmp_counts.begin(), tmp_counts.end());
        if (!tmp_counts.empty())
            scale_factors[j] = exp(tmp_counts[tmp_counts.size()/2]); // median log-ratio
        else
            scale_factors[j] = 1.0; // no usable loci: leave this library unscaled
    }
}
353 
calc_classic_fpkm_scaling_factors(const vector<LocusCountList> & sample_compatible_count_table,vector<double> & scale_factors)354 void calc_classic_fpkm_scaling_factors(const vector<LocusCountList>& sample_compatible_count_table,
355                                        vector<double>& scale_factors)
356 {
357     vector<double> total_counts(sample_compatible_count_table.size(), 0.0);
358 
359     for (size_t i = 0; i < sample_compatible_count_table.size(); ++i)
360     {
361         const LocusCountList& p = sample_compatible_count_table[i];
362 
363         for (size_t j = 0; j < p.counts.size(); ++j)
364         {
365             total_counts[j] += floor(p.counts[j]);
366         }
367     }
368 
369     double all_library_counts = accumulate(total_counts.begin(), total_counts.end(), 0.0);
370     if (all_library_counts == 0.0)
371     {
372         for (size_t i = 0; i < scale_factors.size(); ++i)
373         {
374             scale_factors[i] = 1.0;
375         }
376         return;
377     }
378     for (size_t i = 0; i < scale_factors.size(); ++i)
379     {
380         scale_factors[i] = total_counts[i] / all_library_counts;
381     }
382 
383     double avg_scaling_factor = accumulate(scale_factors.begin(), scale_factors.end(), 0.0);
384     avg_scaling_factor /= scale_factors.size();
385 
386     for (size_t i = 0; i < scale_factors.size(); ++i)
387     {
388         scale_factors[i] = scale_factors[i] / avg_scaling_factor;
389     }
390 }
391 
calc_quartile_scaling_factors(const vector<LocusCountList> & sample_compatible_count_table,vector<double> & scale_factors)392 void calc_quartile_scaling_factors(const vector<LocusCountList>& sample_compatible_count_table,
393                                    vector<double>& scale_factors)
394 {
395 
396     if (sample_compatible_count_table.empty())
397         return;
398 
399     vector<double> upper_quartiles(sample_compatible_count_table.front().counts.size(), 0.0);
400     vector<double> total_common_masses(sample_compatible_count_table.front().counts.size(), 0.0);
401 
402     for (size_t i = 0; i < sample_compatible_count_table.front().counts.size(); ++i)
403     {
404         vector<double> common_scaled_counts;
405         double total_common = 0.0;
406 
407         for (size_t j = 0; j < sample_compatible_count_table.size(); ++j)
408         {
409 
410             //boost::shared_ptr<ReadGroupProperties> rg = bundle_factories[fac_idx];
411             //double scaled_mass = scale_factors[fac_idx] * rg->total_map_mass();
412 
413             total_common += sample_compatible_count_table[j].counts[i];
414             common_scaled_counts.push_back(sample_compatible_count_table[j].counts[i]);
415         }
416 
417         sort(common_scaled_counts.begin(), common_scaled_counts.end());
418         if (common_scaled_counts.empty())
419             continue;
420 
421         int upper_quart_index = common_scaled_counts.size() * 0.75;
422         double upper_quart_count = common_scaled_counts[upper_quart_index];
423         upper_quartiles[i] = upper_quart_count;
424         total_common_masses[i] = total_common;
425     }
426 
427     long double total_mass = accumulate(total_common_masses.begin(), total_common_masses.end(), 0.0);
428     long double total_norm_mass = accumulate(upper_quartiles.begin(), upper_quartiles.end(), 0.0);
429 
430     for (size_t i = 0; i < sample_compatible_count_table.front().counts.size(); ++i)
431     {
432         if (total_mass > 0)
433         {
434             double scaling_factor = upper_quartiles[i];
435             scaling_factor /= (total_norm_mass / upper_quartiles.size());
436             scale_factors[i] = scaling_factor;
437         }
438         else
439         {
440             scale_factors[i] = 1.0;
441         }
442     }
443 }
444 
calc_tmm_scaling_factors(const vector<LocusCountList> & sample_compatible_count_table,vector<double> & scale_factors)445 void calc_tmm_scaling_factors(const vector<LocusCountList>& sample_compatible_count_table,
446                               vector<double>& scale_factors)
447 {
448     scale_factors = vector<double>(sample_compatible_count_table.size(), 1.0);
449 }
450 
// Minimum number of usable (positive mean and variance) loci required to fit
// a dispersion model; below this, callers fall back to the Poisson model.
static const int min_loci_for_fitting = 30;
452 
// Piecewise-linear lookup table mapping an estimated squared coefficient of
// variation (SCV) to a bias-corrected "true" SCV.  Populate with
// add_scv_pair(), call finalize() once, then query with interpolate_scv().
struct SCVInterpolator
{
    // Record one (estimated, true) SCV pair for later interpolation.
    void add_scv_pair(double est_scv, double true_scv)
    {
        true_scvs.push_back(true_scv);
        est_scvs.push_back(est_scv);
    }

    // Sort the table by estimated SCV so interpolate_scv()'s lower_bound
    // search is valid.
    void finalize()
    {
        std::vector<std::pair<double, double> > pv;
        for (size_t i = 0; i < true_scvs.size(); ++i)
        {
            pv.push_back(std::make_pair(est_scvs[i], true_scvs[i]));
        }
        std::sort(pv.begin(), pv.end());
        // BUGFIX: the original sorted the local copy and discarded it, so
        // the member tables were never actually sorted.  Write the sorted
        // pairs back.
        for (size_t i = 0; i < pv.size(); ++i)
        {
            est_scvs[i] = pv[i].first;
            true_scvs[i] = pv[i].second;
        }
    }

    // This was built from the dispersion model interpolator - we should refactor these
    // into a single routine.
    // Returns the corrected SCV for est_scv: table lookup/interpolation in
    // range, linear extrapolation off the right end, and pass-through when
    // the table is too small or est_scv falls off the left end.
    double interpolate_scv(double est_scv)
    {
        if (est_scvs.size() < 2 || true_scvs.size() < 2)
        {
            return est_scv; // too few points: pass through unchanged
        }
        if (est_scv > est_scvs.back())
        {
            // Off the right end: extrapolate along the slope of the last
            // segment.
            size_t n = est_scvs.size();
            double x1_mean = est_scvs[n-2];
            double x2_mean = est_scvs[n-1];

            double y1_var = true_scvs[n-2];
            double y2_var = true_scvs[n-1];
            double slope = 0.0;
            if (x2_mean != x1_mean)
            {
                slope = (y2_var - y1_var) / (x2_mean-x1_mean);
            }
            else if (y1_var == y2_var)
            {
                assert (false); // should have a unique'd table
            }
            // BUGFIX: the original computed
            //   y2 - slope*(est_scv - est_scvs.size()-1)
            // subtracting the table *size* instead of the last key, and with
            // the wrong sign; correct extrapolation is y2 + slope*(x - x2).
            double mean_interp = true_scvs[n-1] + slope*(est_scv - est_scvs[n-1]);
            assert (!std::isnan(mean_interp) && !std::isinf(mean_interp));
            return mean_interp;
        }
        else if (est_scv < est_scvs.front())
        {
            // If we're extrapolating to the left, our fit is too coarse, but
            // that probably means we don't need SCV bias correction at all.
            return est_scv;
        }

        std::vector<double>::const_iterator lb =
            std::lower_bound(est_scvs.begin(), est_scvs.end(), est_scv);
        if (lb < est_scvs.end())
        {
            size_t d = lb - est_scvs.begin();
            if (*lb == est_scv || lb == est_scvs.begin())
            {
                // Exact hit (or first entry): return the tabled value.
                double var = true_scvs[d];
                assert (!std::isnan(var) && !std::isinf(var));
                return var;
            }

            // Strictly between two table points: interpolate linearly.
            d--; // lb != begin() here, so d >= 1 before the decrement

            // Paranoia: the interpolation below reads index d+1.
            if (d + 1 >= est_scvs.size())
            {
                fprintf(stderr, "ARG d >= est_scvs.size(), d = %d\n", (int)d);
            }
            if (d + 1 >= true_scvs.size())
            {
                fprintf(stderr, "ARG d >= true_scvs.size(), d = %d\n", (int)d);
            }

            double x1_mean = est_scvs[d];
            double x2_mean = est_scvs[d + 1];

            double y1_var = true_scvs[d];
            double y2_var = true_scvs[d + 1];
            double slope = 0.0;
            if (x2_mean != x1_mean)
            {
                slope = (y2_var - y1_var) / (x2_mean-x1_mean);
            }
            else if (y1_var == y2_var)
            {
                fprintf(stderr, "Warning: SCV table does not have unique keys\n!");
                assert (false); // should have a unique'd table
            }
            double mean_interp = true_scvs[d] + slope*(est_scv - est_scvs[d]);

            assert (!std::isnan(mean_interp) && !std::isinf(mean_interp));
            return mean_interp;
        }
        else
        {
            // Unreachable in practice (est_scv <= back()); defensive fallback.
            assert (!std::isnan(est_scv) && !std::isinf(est_scv));
            return est_scv;
        }
    }

private:
    std::vector<double> true_scvs;
    std::vector<double> est_scvs;
};
586 
// Build a table mapping *estimated* squared coefficients of variation (SCV)
// back to *true* SCVs by simulation, smoothed with locfit.
//
// For each dispersion alpha in a fixed grid, simulate ngenes loci of nreps
// replicates each from a negative binomial with mean mean_count (drawn as a
// gamma-mixed Poisson), record the mean sample SCV across loci, regress
// alpha (the true SCV) on the observed SCV with locfit, and load the fitted
// curve into true_to_est_scv_table.
//
// NOTE(review): locfit keeps global state; presumably callers must hold
// _locfit_lock when ENABLE_THREADS is set — confirm at call sites.
void build_scv_correction_fit(int nreps, int ngenes, int mean_count, SCVInterpolator& true_to_est_scv_table)
{
    setuplf();  // (re)initialize locfit's global workspace

    // (alpha, mean observed SCV) pairs accumulated over the grid below.
    vector<pair<double, double> > alpha_vs_scv;

    boost::mt19937 rng(random_seed);
    vector<boost::random::negative_binomial_distribution<int, double> > nb_gens; // unused

    // Dense alpha grid near zero, coarse above 2.
    vector<double> alpha_range;
    for(double a = 0.0002; a < 2.0; a += 0.0002)
    {
        alpha_range.push_back(a);
    }

    for (double a = 2; a < 100.0; a += 1)
    {
        alpha_range.push_back(a);
    }

    BOOST_FOREACH (double alpha, alpha_range)
    {
        // NB(mean_count, alpha) via its gamma-Poisson mixture: k = 1/alpha,
        // p = k/(k+mu), gamma shape r and scale (1-p)/p.
        double k = 1.0/alpha;
        double p = k / (k + mean_count);
        double r = (mean_count * p) / (1-p);

        //boost::random::negative_binomial_distribution<int, double> nb(r, p);

        boost::random::gamma_distribution<double> gamma(r, (1-p)/p);


        vector<double> scvs_for_alpha;
        vector<double> draws;
        for (size_t i = 0; i < ngenes; ++i)
        {
            // One simulated locus with nreps replicate counts.
            LocusCountList locus_count("", nreps, 1, vector<string>(), vector<string>());
            for (size_t rep_idx = 0; rep_idx < nreps; ++rep_idx)
            {
                double gamma_draw = gamma(rng);
                if (gamma_draw == 0)
                {
                    locus_count.counts[rep_idx] = 0;
                    draws.push_back(0);
                }
                else
                {
                    // Poisson draw with the gamma-distributed rate.
                    boost::random::poisson_distribution<long, double> poisson(gamma_draw);
                    locus_count.counts[rep_idx] = poisson(rng);
                    draws.push_back(locus_count.counts[rep_idx]);
                    //fprintf(stderr, "%lg\t", locus_count.counts[rep_idx]);
                }
            }

            // Sample SCV (var/mean^2) for this locus; all-zero loci skipped.
            double mean = accumulate(locus_count.counts.begin(), locus_count.counts.end(), 0.0);
            if (mean == 0)
                continue;
            mean /= locus_count.counts.size();
            double var = 0.0;
            BOOST_FOREACH(double c,  locus_count.counts)
            {
                var += (c-mean)*(c-mean);
            }
            var /= locus_count.counts.size();
            // NOTE(review): size_t integer division — n/(n-1) truncates to 1
            // for n > 2, so this Bessel correction is a no-op except at n == 2.
            // Confirm whether the truncation is intended.
            var *= locus_count.counts.size() / (locus_count.counts.size() - 1);

            double scv = var / (mean*mean);
            scvs_for_alpha.push_back(scv);
            //fprintf(stderr, " : mean = %lg, var = %lg, scv = %lg\n", mean, var, scv);

            //fprintf(stderr, "\n");
        }

        // Pooled mean/var over every draw (diagnostic only; unused below).
        double mean = accumulate(draws.begin(), draws.end(), 0.0);
        mean /= draws.size();
        double var = 0.0;
        BOOST_FOREACH(int c,  draws)
        {
            var += (c - mean)*(c-mean);
        }
        var /= draws.size();
        // NOTE(review): same size_t integer-division issue as above.
        var *= draws.size() / (draws.size() - 1);



        //fprintf(stderr, "##########\n");
        //fprintf(stderr, "mean = %lf, var = %lg\n", mean, var);
        if (scvs_for_alpha.size() > 0)
        {
            double mean_scv = accumulate(scvs_for_alpha.begin(),scvs_for_alpha.end(), 0.0);
            mean_scv /= scvs_for_alpha.size();
            //fprintf(stderr, "alpha = %lg scv = %lg\n", alpha, mean_scv);
            alpha_vs_scv.push_back(make_pair(alpha, mean_scv));
        }
    }

    //fprintf(stderr, "$$$$$$$$$\n");

    //sort (alpha_range.begin(), alpha_range.end());

    // Hand the simulated pairs to locfit as named variables.
    // trueSCV holds the alphas (true SCVs)...
    char namebuf[256];
    sprintf(namebuf, "trueSCV");
    vari* cm = createvar(namebuf,STREGULAR,alpha_vs_scv.size(),VDOUBLE);
    for (size_t i = 0; i < alpha_vs_scv.size(); ++i)
    {
        cm->dpr[i] = alpha_vs_scv[i].first;
    }

    // ...and estSCV the mean observed SCVs.
    sprintf(namebuf, "estSCV");
    vari* cv = createvar(namebuf,STREGULAR,alpha_vs_scv.size(),VDOUBLE);
    for (size_t i = 0; i < alpha_vs_scv.size(); ++i)
    {
        cv->dpr[i] = alpha_vs_scv[i].second;
    }

    // Regress true SCV on estimated SCV.
    char locfit_cmd[2048];
    sprintf(locfit_cmd, "locfit trueSCV~estSCV");

    locfit_dispatch(locfit_cmd);

    // Evaluate the fit at each observed (estimated) SCV.
    sprintf(namebuf, "domainSCV");
    vari* cd = createvar(namebuf,STREGULAR,alpha_vs_scv.size(),VDOUBLE);
    for (size_t i = 0; i < alpha_vs_scv.size(); ++i)
    {
        cd->dpr[i] = alpha_vs_scv[i].second;
    }

    sprintf(locfit_cmd, "fittedSCV=predict domainSCV");
    locfit_dispatch(locfit_cmd);

    int n = 0;
    sprintf(namebuf, "fittedSCV");
    vari* cp = findvar(namebuf, 1, &n);
    assert(cp != NULL);

    // NOTE(review): if any alpha produced no SCVs (skipped above),
    // alpha_vs_scv is shorter than alpha_range and alpha_range[i] no longer
    // lines up with cp->dpr[i]; presumably alpha_vs_scv[i].first was
    // intended — confirm.
    for (size_t i = 0; i < cp->n; ++i)
    {
        //fprintf(stderr, "%lg\t%lg\n",alpha_range[i], cp->dpr[i]);
        true_to_est_scv_table.add_scv_pair(alpha_range[i], cp->dpr[i]);
    }
    true_to_est_scv_table.finalize();
}
728 
calculate_count_means_and_vars(const vector<LocusCountList> & sample_compatible_count_table,vector<pair<double,double>> & means_and_vars)729 void calculate_count_means_and_vars(const vector<LocusCountList>& sample_compatible_count_table,
730                                     vector<pair<double, double> >& means_and_vars)
731 {
732 
733     for (size_t i = 0; i < sample_compatible_count_table.size(); ++i)
734     {
735         const LocusCountList& p = sample_compatible_count_table[i];
736         double mean = accumulate(p.counts.begin(), p.counts.end(), 0.0);
737         if (mean > 0.0 && p.counts.size() > 0)
738             mean /= p.counts.size();
739 
740         double var = 0.0;
741         double num_non_zero = 0;
742         BOOST_FOREACH (double d, p.counts)
743         {
744             if (d > 0)
745                 num_non_zero++;
746             var += (d - mean) * (d - mean);
747         }
748         if (var > 0.0 && p.counts.size())
749         {
750             var /= p.counts.size();
751             var *= p.counts.size() / (p.counts.size() - 1);
752         }
753         means_and_vars.push_back(make_pair(mean, var));
754     }
755 }
756 
757 boost::shared_ptr<MassDispersionModel>
fit_dispersion_model_helper(const string & condition_name,const vector<double> & scale_factors,const vector<LocusCountList> & sample_compatible_count_table)758 fit_dispersion_model_helper(const string& condition_name,
759                             const vector<double>& scale_factors,
760                             const vector<LocusCountList>& sample_compatible_count_table)
761 {
762     vector<pair<double, double> > compatible_means_and_vars;
763 
764     SCVInterpolator true_to_est_scv_table;
765 
766     int num_samples = sample_compatible_count_table.front().counts.size();
767     if (no_scv_correction == false)
768     {
769         build_scv_correction_fit(num_samples, 10000, 100000, true_to_est_scv_table);
770     }
771 
772     setuplf();
773 
774     calculate_count_means_and_vars(sample_compatible_count_table, compatible_means_and_vars);
775 
776     sort(compatible_means_and_vars.begin(), compatible_means_and_vars.end());
777 
778     vector<double> compatible_count_means;
779     vector<double> raw_variances;
780 
781     for(size_t i = 0; i < compatible_means_and_vars.size(); ++i)
782     {
783         if (compatible_means_and_vars[i].first > 0 && compatible_means_and_vars[i].second > 0.0)
784         {
785             compatible_count_means.push_back(compatible_means_and_vars[i].first);
786             raw_variances.push_back(compatible_means_and_vars[i].second);
787         }
788     }
789 
790     if (compatible_count_means.size() < min_loci_for_fitting)
791     {
792         boost::shared_ptr<MassDispersionModel> disperser;
793         disperser = boost::shared_ptr<MassDispersionModel>(new PoissonDispersionModel(condition_name));
794 
795         return disperser;
796     }
797 
798 
799     vector<double> fitted_values;
800 
801     // WARNING: locfit doesn't like undescores - need camel case for
802     // variable names
803 
804     char namebuf[256];
805     sprintf(namebuf, "countMeans");
806     vari* cm = createvar(namebuf,STREGULAR,compatible_count_means.size(),VDOUBLE);
807     for (size_t i = 0; i < compatible_count_means.size(); ++i)
808     {
809         cm->dpr[i] = log(compatible_count_means[i]);
810     }
811 
812     //sprintf(namebuf, "countSCV");
813     sprintf(namebuf, "countVariances");
814     vari* cv = createvar(namebuf,STREGULAR,raw_variances.size(),VDOUBLE);
815     for (size_t i = 0; i < raw_variances.size(); ++i)
816     {
817         cv->dpr[i] = raw_variances[i];
818         //cv->dpr[i] = raw_scvs[i];
819     }
820 
821     char locfit_cmd[2048];
822     //sprintf(locfit_cmd, "locfit countVariances~countMeans family=gamma");
823     sprintf(locfit_cmd, "locfit countVariances~countMeans family=gamma");
824 
825     locfit_dispatch(locfit_cmd);
826 
827     sprintf(locfit_cmd, "fittedVars=predict countMeans");
828     locfit_dispatch(locfit_cmd);
829 
830     //sprintf(locfit_cmd, "prfit x fhat h nlx");
831     //locfit_dispatch(locfit_cmd);
832 
833     double xim = 0;
834     BOOST_FOREACH(double s, scale_factors)
835     {
836         if (s)
837             xim += 1.0 / s;
838     }
839     xim /= scale_factors.size();
840 
841     int n = 0;
842     sprintf(namebuf, "fittedVars");
843     vari* cp = findvar(namebuf, 1, &n);
844     assert(cp != NULL);
845     for (size_t i = 0; i < cp->n; ++i)
846     {
847 //        if (cp->dpr[i] >= 0)
848 //        {
849             double mean = exp(cm->dpr[i]);
850             double fitted_scv = (cp->dpr[i] - mean) / (mean * mean);
851             double corrected_scv = true_to_est_scv_table.interpolate_scv(fitted_scv);
852             double corrected_variance = mean + (corrected_scv * (mean * mean));
853             double uncorrected_variance = mean + (fitted_scv * (mean * mean));
854             //fitted_values.push_back(mean + (cp->dpr[i] - xim * mean));
855             if (no_scv_correction == false && corrected_variance > uncorrected_variance)
856                 fitted_values.push_back(corrected_variance);
857             else if (uncorrected_variance > 0)
858                 fitted_values.push_back(uncorrected_variance);
859             else
860                 fitted_values.push_back(compatible_count_means[i]);
861 
862 
863 //        }
864 //        else
865 //        {
866 //            fitted_values.push_back(compatible_count_means[i]);
867 //        }
868     }
869 
870     boost::shared_ptr<MassDispersionModel> disperser;
871     disperser = boost::shared_ptr<MassDispersionModel>(new MassDispersionModel(condition_name, compatible_count_means, raw_variances, fitted_values));
872     if (dispersion_method == POISSON)
873         disperser = boost::shared_ptr<MassDispersionModel>(new PoissonDispersionModel(condition_name));
874 
875 //    for (map<string, pair<double, double> >::iterator itr = labeled_mv_table.begin();
876 //         itr != labeled_mv_table.end();
877 //         ++itr)
878 //    {
879 //        string label = itr->first;
880 //        disperser->set_compatible_mean_and_var(itr->first, itr->second);
881 //    }
882 
883     return disperser;
884 }
885 
886 boost::shared_ptr<MassDispersionModel>
fit_dispersion_model(const string & condition_name,const vector<double> & scale_factors,const vector<LocusCountList> & sample_compatible_count_table)887 fit_dispersion_model(const string& condition_name,
888                      const vector<double>& scale_factors,
889                      const vector<LocusCountList>& sample_compatible_count_table)
890 {
891 //
892 //#if ENABLE_THREADS
893 //	boost::mutex::scoped_lock lock(_locfit_lock);
894 //#endif
895     for (size_t i = 0; i < sample_compatible_count_table.size(); ++i)
896     {
897         if (sample_compatible_count_table[i].counts.size() <= 1)
898         {
899             // only one replicate - no point in fitting variance
900             return boost::shared_ptr<MassDispersionModel>(new PoissonDispersionModel(condition_name));
901         }
902     }
903 #if ENABLE_THREADS
904     _locfit_lock.lock();
905 #endif
906 
907     ProgressBar p_bar("Modeling fragment count overdispersion.",0);
908 
909     int max_transcripts = 0;
910     BOOST_FOREACH(const LocusCountList& L, sample_compatible_count_table)
911     {
912         if (L.num_transcripts > max_transcripts)
913         {
914             max_transcripts = L.num_transcripts;
915         }
916     }
917 
918     boost::shared_ptr<MassDispersionModel>  model = fit_dispersion_model_helper(condition_name, scale_factors, sample_compatible_count_table);
919 
920 #if ENABLE_THREADS
921     _locfit_lock.unlock();
922 #endif
923     return model;
924 }
925 
build_norm_table(const vector<LocusCountList> & full_count_table,boost::shared_ptr<const map<string,LibNormStandards>> normalizing_standards,vector<LocusCountList> & norm_table)926 void build_norm_table(const vector<LocusCountList>& full_count_table,
927                       boost::shared_ptr<const map<string, LibNormStandards> > normalizing_standards,
928                       vector<LocusCountList>& norm_table)
929 {
930     // If we're using housekeeping genes or spike-in controls, select the rows we'll be using from the full count table.
931     if (normalizing_standards)
932     {
933         for (size_t i = 0; i < full_count_table.size(); ++i)
934         {
935             const vector<string>& gene_ids = full_count_table[i].gene_ids;
936             const vector<string>& gene_short_names = full_count_table[i].gene_short_names;
937 
938             // If the row has an ID that's in the table, take it.
939             map<string, LibNormStandards>::const_iterator g_id_itr = normalizing_standards->end();
940             map<string, LibNormStandards>::const_iterator g_name_itr = normalizing_standards->end();
941 
942             for (size_t j = 0; j < gene_ids.size(); ++j)
943             {
944                 g_id_itr = normalizing_standards->find(gene_ids[j]);
945                 if (g_id_itr != normalizing_standards->end())
946                 {
947                     break;
948                 }
949             }
950 
951             if (g_id_itr != normalizing_standards->end())
952             {
953                 norm_table.push_back(full_count_table[i]);
954                 continue;
955             }
956 
957             for (size_t j = 0; j < gene_short_names.size(); ++j)
958             {
959                 g_name_itr = normalizing_standards->find(gene_short_names[j]);
960                 if (g_name_itr != normalizing_standards->end())
961                 {
962                     break;
963                 }
964             }
965 
966             if (g_name_itr != normalizing_standards->end())
967             {
968                 norm_table.push_back(full_count_table[i]);
969                 continue;
970             }
971 
972         }
973     }
974     else // otherwise, just take all rows.
975     {
976         norm_table = full_count_table;
977     }
978 }
979 
normalize_counts(vector<boost::shared_ptr<ReadGroupProperties>> & all_read_groups)980 void normalize_counts(vector<boost::shared_ptr<ReadGroupProperties> > & all_read_groups)
981 {
982     vector<LocusCountList> sample_compatible_count_table;
983     vector<LocusCountList> sample_total_count_table;
984 
985     for (size_t i = 0; i < all_read_groups.size(); ++i)
986     {
987         boost::shared_ptr<ReadGroupProperties> rg_props = all_read_groups[i];
988         const vector<LocusCount>& raw_compatible_counts = rg_props->raw_compatible_counts();
989         const vector<LocusCount>& raw_total_counts = rg_props->raw_total_counts();
990 
991         for (size_t j = 0; j < raw_compatible_counts.size(); ++j)
992         {
993             if (sample_compatible_count_table.size() == j)
994             {
995                 const string& locus_id = raw_compatible_counts[j].locus_desc;
996                 int num_transcripts = raw_compatible_counts[j].num_transcripts;
997 
998                 const vector<string>& gene_ids = raw_compatible_counts[j].gene_ids;
999                 const vector<string>& gene_short_names = raw_compatible_counts[j].gene_short_names;
1000 
1001                 sample_compatible_count_table.push_back(LocusCountList(locus_id,all_read_groups.size(), num_transcripts, gene_ids, gene_short_names));
1002                 sample_total_count_table.push_back(LocusCountList(locus_id,all_read_groups.size(), num_transcripts, gene_ids, gene_short_names));
1003             }
1004             double scaled = raw_compatible_counts[j].count;
1005             //sample_compatible_count_table[j].counts[i] = scaled * unscaling_factor;
1006             sample_compatible_count_table[j].counts[i] = floor(scaled);
1007             sample_total_count_table[j].counts[i] = floor(raw_total_counts[j].count);
1008 
1009             assert(sample_compatible_count_table[j].counts[i] >= 0 && !isinf(sample_compatible_count_table[j].counts[i]));
1010         }
1011     }
1012 
1013     vector<double> scale_factors(all_read_groups.size(), 0.0);
1014 
1015     vector<LocusCountList> norm_table;
1016 
1017     if (use_compat_mass)
1018     {
1019         build_norm_table(sample_compatible_count_table, lib_norm_standards, norm_table);
1020     }
1021     else // use_total_mass
1022     {
1023         assert(use_total_mass);
1024         build_norm_table(sample_total_count_table, lib_norm_standards, norm_table);
1025     }
1026 
1027     if (lib_norm_method == GEOMETRIC)
1028     {
1029         calc_geometric_scaling_factors(norm_table, scale_factors);
1030     }
1031     else if (lib_norm_method == CLASSIC_FPKM)
1032     {
1033         calc_classic_fpkm_scaling_factors(norm_table, scale_factors);
1034     }
1035     else if (lib_norm_method == QUARTILE)
1036     {
1037         calc_quartile_scaling_factors(norm_table, scale_factors);
1038     }
1039     else if (lib_norm_method == TMM)
1040     {
1041         calc_tmm_scaling_factors(norm_table, scale_factors);
1042     }
1043     else
1044     {
1045         assert (false);
1046     }
1047 
1048     for (size_t i = 0; i < all_read_groups.size(); ++i)
1049     {
1050         boost::shared_ptr<ReadGroupProperties> rg_props = all_read_groups[i];
1051         rg_props->internal_scale_factor(scale_factors[i]);
1052     }
1053 
1054     assert(sample_compatible_count_table.size() == sample_total_count_table.size());
1055 
1056     // Transform raw counts to the common scale
1057     for (size_t i = 0; i < sample_compatible_count_table.size(); ++i)
1058     {
1059         LocusCountList& p = sample_compatible_count_table[i];
1060         for (size_t j = 0; j < p.counts.size(); ++j)
1061         {
1062             assert (scale_factors.size() > j);
1063             p.counts[j] *= (1.0 / scale_factors[j]);
1064         }
1065 
1066         LocusCountList& t = sample_total_count_table[i];
1067         for (size_t j = 0; j < t.counts.size(); ++j)
1068         {
1069             assert (scale_factors.size() > j);
1070             t.counts[j] *= (1.0 / scale_factors[j]);
1071         }
1072     }
1073 
1074     for (size_t i = 0; i < all_read_groups.size(); ++i)
1075     {
1076         boost::shared_ptr<ReadGroupProperties> rg_props = all_read_groups[i];
1077         vector<LocusCount> scaled_compatible_counts;
1078         for (size_t j = 0; j < sample_compatible_count_table.size(); ++j)
1079         {
1080             string& locus_id = sample_compatible_count_table[j].locus_desc;
1081             double count = sample_compatible_count_table[j].counts[i];
1082             int num_transcripts = sample_compatible_count_table[j].num_transcripts;
1083 
1084             const vector<string>& gids = sample_compatible_count_table[j].gene_ids;
1085             const vector<string>& gnms = sample_compatible_count_table[j].gene_short_names;
1086 
1087             LocusCount locus_count(locus_id, count, num_transcripts, gids, gnms);
1088             scaled_compatible_counts.push_back(locus_count);
1089         }
1090         rg_props->common_scale_compatible_counts(scaled_compatible_counts);
1091     }
1092 
1093     for (size_t i = 0; i < all_read_groups.size(); ++i)
1094     {
1095         boost::shared_ptr<ReadGroupProperties> rg_props = all_read_groups[i];
1096         vector<LocusCount> scaled_total_counts;
1097         for (size_t j = 0; j < sample_total_count_table.size(); ++j)
1098         {
1099             string& locus_id = sample_total_count_table[j].locus_desc;
1100             double count = sample_total_count_table[j].counts[i];
1101             int num_transcripts = sample_total_count_table[j].num_transcripts;
1102 
1103             const vector<string>& gids = sample_total_count_table[j].gene_ids;
1104             const vector<string>& gnms = sample_total_count_table[j].gene_short_names;
1105 
1106             LocusCount locus_count(locus_id, count, num_transcripts, gids, gnms);
1107             scaled_total_counts.push_back(locus_count);
1108         }
1109         rg_props->common_scale_total_counts(scaled_total_counts);
1110     }
1111 
1112     double avg_total_common_scaled_count = 0.0;
1113 
1114     for (size_t fac_idx = 0; fac_idx < all_read_groups.size(); ++fac_idx)
1115     {
1116         double total_common = 0.0;
1117         if (use_compat_mass)
1118         {
1119             for (size_t j = 0; j < sample_compatible_count_table.size(); ++j)
1120             {
1121                 total_common += sample_compatible_count_table[j].counts[fac_idx];
1122             }
1123         }
1124         else
1125         {
1126             for (size_t j = 0; j < sample_compatible_count_table.size(); ++j)
1127             {
1128                 total_common += sample_total_count_table[j].counts[fac_idx];
1129             }
1130 
1131         }
1132 
1133         avg_total_common_scaled_count += (1.0/all_read_groups.size()) * total_common;
1134     }
1135 
1136     BOOST_FOREACH(boost::shared_ptr<ReadGroupProperties> rg, all_read_groups)
1137     {
1138         rg->normalized_map_mass(avg_total_common_scaled_count);
1139     }
1140 }
1141 
1142