//
//  replicates.cpp
//  cufflinks
//
//  Created by Cole Trapnell on 3/11/11.
//  Copyright 2011 Cole Trapnell. All rights reserved.
//

#include <boost/thread.hpp>

extern "C" {
#include "locfit/local.h"
}

#include <boost/random/gamma_distribution.hpp>
#include <boost/random/poisson_distribution.hpp>

#include "replicates.h"
//#include "rounding.h"

#if ENABLE_THREADS
boost::mutex _locfit_lock;
#endif

MassDispersionModel::MassDispersionModel(const std::string& name,
                                         const std::vector<double>& scaled_compatible_mass_means,
                                         const std::vector<double>& scaled_compatible_variances,
                                         const std::vector<double>& scaled_mass_variances)
{
    _name = name;
    
    if (scaled_compatible_mass_means.size() != scaled_mass_variances.size())
    {
        fprintf (stderr, "Error: dispersion model table is malformed\n");
    }
    
    double last_val = 0;
    for (size_t i = 0; i < scaled_compatible_mass_means.size(); i++)
    {
        if (last_val > scaled_compatible_mass_means[i])
        {
            fprintf (stderr, "Error: DispersionModel input is malformed\n");
        }
        
        if (i == 0 || last_val < scaled_compatible_mass_means[i])
        {
            _scaled_compatible_mass_means.push_back(scaled_compatible_mass_means[i]);
            _scaled_compatible_variances.push_back(scaled_compatible_variances[i]);
            _scaled_mass_variances.push_back(scaled_mass_variances[i]);
        }
        else
        {
            // skip this element if it's equal to what we've already seen
        }
        
        last_val = scaled_compatible_mass_means[i];
    }
}

double MassDispersionModel::scale_mass_variance(double scaled_mass) const
{
    if (scaled_mass <= 0)
        return 0.0;
    
    if (_scaled_compatible_mass_means.size() < 2 || _scaled_mass_variances.size() < 2)
    {
        return scaled_mass; // revert to poisson.
    }
    if (scaled_mass > _scaled_compatible_mass_means.back())
    {
        // extrapolate off the right end of the table
        double x1_mean = _scaled_compatible_mass_means[_scaled_compatible_mass_means.size()-2];
        double x2_mean = _scaled_compatible_mass_means[_scaled_compatible_mass_means.size()-1];
        
        double y1_var = _scaled_mass_variances[_scaled_compatible_mass_means.size()-2];
        double y2_var = _scaled_mass_variances[_scaled_compatible_mass_means.size()-1];
        double slope = 0.0;
        if (x2_mean != x1_mean)
        {
            slope = (y2_var - y1_var) / (x2_mean - x1_mean);
        }
        else if (y1_var == y2_var)
        {
            assert (false); // should have a unique'd table
        }
        // linearly extrapolate from the last two points in the table
        double mean_interp = y2_var + slope * (scaled_mass - x2_mean);
        if (mean_interp < scaled_mass)
            mean_interp = scaled_mass;
        assert (!isnan(mean_interp) && !isinf(mean_interp));
        return mean_interp;
    }
    else if (scaled_mass < _scaled_compatible_mass_means.front())
    {
        // extrapolate off the left end of the table
        double x1_mean = _scaled_compatible_mass_means[0];
        double x2_mean = _scaled_compatible_mass_means[1];
        
        double y1_var = _scaled_mass_variances[0];
        double y2_var = _scaled_mass_variances[1];
        double slope = 0.0;
        if (x2_mean != x1_mean)
        {
            slope = (y2_var - y1_var) / (x2_mean - x1_mean);
        }
        else if (y1_var == y2_var)
        {
            assert (false); // should have a unique'd table
        }
        double mean_interp = _scaled_mass_variances[0] - slope * (_scaled_compatible_mass_means[0] - scaled_mass);
        if (mean_interp < scaled_mass)
            mean_interp = scaled_mass;
        
        assert (!isnan(mean_interp) && !isinf(mean_interp));
        return mean_interp;
    }
    
    vector<double>::const_iterator lb;
    lb = lower_bound(_scaled_compatible_mass_means.begin(),
                     _scaled_compatible_mass_means.end(),
                     scaled_mass);
    if (lb < _scaled_compatible_mass_means.end())
    {
        int d = lb - _scaled_compatible_mass_means.begin();
        if (*lb == scaled_mass || lb == _scaled_compatible_mass_means.begin())
        {
            double var = _scaled_mass_variances[d];
            if (var < scaled_mass) // revert to poisson if underdispersed
                var = scaled_mass;
            assert (!isnan(var) && !isinf(var));
            return var;
        }
        
        // in between two points on the scale.
        d--;
        
        if (d < 0)
        {
            fprintf(stderr, "ARG d < 0, d = %d\n", d);
        }
        if (d >= _scaled_compatible_mass_means.size())
        {
            fprintf(stderr, "ARG d >= _scaled_compatible_mass_means.size(), d = %d\n", d);
        }
        if (d >= _scaled_mass_variances.size())
        {
            fprintf(stderr, "ARG d >= _scaled_mass_variances.size(), d = %d\n", d);
        }
        
        double x1_mean = _scaled_compatible_mass_means[d];
        double x2_mean = _scaled_compatible_mass_means[d + 1];
        
        double y1_var = _scaled_mass_variances[d];
        double y2_var = _scaled_mass_variances[d + 1];
        double slope = 0.0;
        if (x2_mean != x1_mean)
        {
            slope = (y2_var - y1_var) / (x2_mean - x1_mean);
        }
        else if (y1_var == y2_var)
        {
            assert (false); // should have a unique'd table
        }
        double mean_interp = _scaled_mass_variances[d] + slope * (scaled_mass - _scaled_compatible_mass_means[d]);
        if (mean_interp < scaled_mass) // revert to poisson if underdispersed
            mean_interp = scaled_mass;
        
        assert (!isnan(mean_interp) && !isinf(mean_interp));
        return mean_interp;
    }
    else
    {
        assert (!isnan(scaled_mass) && !isinf(scaled_mass));
        return scaled_mass; // revert to poisson assumption
    }
}
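
// Worked sketch of the interpolation above (illustrative numbers, not from
// any real fit): given adjacent table points (x1, y1) = (100, 450) and
// (x2, y2) = (200, 1400), a query mass m = 150 falls between them, so
//
//     var(m) = y1 + ((y2 - y1) / (x2 - x1)) * (m - x1)
//            = 450 + 9.5 * 50 = 925
//
// The result is then clamped below by m itself, since a fitted
// negative-binomial variance should never fall under the Poisson variance
// (var = mean).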

MleErrorModel::MleErrorModel(const std::string& name,
                             const std::vector<double>& scaled_compatible_mass_means,
                             const std::vector<double>& scaled_mle_variances)
{
    _name = name;
    
    if (scaled_compatible_mass_means.size() != scaled_mle_variances.size())
    {
        fprintf (stderr, "Error: dispersion model table is malformed\n");
    }
    
    double last_val = 0;
    for (size_t i = 0; i < scaled_compatible_mass_means.size(); i++)
    {
        if (last_val > scaled_compatible_mass_means[i])
        {
            fprintf (stderr, "Error: MLEError input is malformed\n");
        }
        
        if (i == 0 || last_val < scaled_compatible_mass_means[i])
        {
            _scaled_compatible_mass_means.push_back(scaled_compatible_mass_means[i]);
            _scaled_mle_variances.push_back(scaled_mle_variances[i]);
        }
        else
        {
            // skip this element if it's equal to what we've already seen
        }
        
        last_val = scaled_compatible_mass_means[i];
    }
}

double MleErrorModel::scale_mle_variance(double scaled_mass) const
{
    if (scaled_mass <= 0)
        return 0.0;
    
    if (_scaled_compatible_mass_means.size() < 2 || _scaled_mle_variances.size() < 2)
    {
        return 0; // revert to poisson.
    }
    if (scaled_mass > _scaled_compatible_mass_means.back())
    {
        return 0;
    }
    else if (scaled_mass < _scaled_compatible_mass_means.front())
    {
        return 0; // we won't add anything if we're out of the range of the table
    }
    
    vector<double>::const_iterator lb;
    lb = lower_bound(_scaled_compatible_mass_means.begin(),
                     _scaled_compatible_mass_means.end(),
                     scaled_mass);
    if (lb < _scaled_compatible_mass_means.end())
    {
        int d = lb - _scaled_compatible_mass_means.begin();
        if (*lb == scaled_mass || lb == _scaled_compatible_mass_means.begin())
        {
            double var = _scaled_mle_variances[d];
            assert (!isnan(var) && !isinf(var));
            if (var < 0)
                var = 0;
            return var;
        }
        
        // in between two points on the scale.
        d--;
        
        if (d < 0)
        {
            fprintf(stderr, "ARG d < 0, d = %d\n", d);
        }
        if (d >= _scaled_compatible_mass_means.size())
        {
            fprintf(stderr, "ARG d >= _scaled_compatible_mass_means.size(), d = %d\n", d);
        }
        if (d >= _scaled_mle_variances.size())
        {
            fprintf(stderr, "ARG d >= _scaled_mle_variances.size(), d = %d\n", d);
        }
        
        double x1_mean = _scaled_compatible_mass_means[d];
        double x2_mean = _scaled_compatible_mass_means[d + 1];
        
        double y1_var = _scaled_mle_variances[d];
        double y2_var = _scaled_mle_variances[d + 1];
        double slope = 0.0;
        if (x2_mean != x1_mean)
        {
            slope = (y2_var - y1_var) / (x2_mean - x1_mean);
        }
        else if (y1_var == y2_var)
        {
            assert (false); // should have a unique'd table
        }
        double mean_interp = _scaled_mle_variances[d] + slope * (scaled_mass - _scaled_compatible_mass_means[d]);
        if (mean_interp < 0)
            mean_interp = 0;
        
        assert (!isnan(mean_interp) && !isinf(mean_interp));
        return mean_interp;
    }
    else
    {
        assert (!isnan(scaled_mass) && !isinf(scaled_mass));
        return 0; // revert to poisson assumption
    }
}

void transform_counts_to_common_scale(const vector<double>& scale_factors,
                                      vector<LocusCountList>& sample_compatible_count_table)
{
    // Transform raw counts to the common scale
    for (size_t i = 0; i < sample_compatible_count_table.size(); ++i)
    {
        LocusCountList& p = sample_compatible_count_table[i];
        for (size_t j = 0; j < p.counts.size(); ++j)
        {
            assert (scale_factors.size() > j);
            p.counts[j] *= (1.0 / scale_factors[j]);
        }
    }
}
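
// For example (values illustrative): with scale_factors = {0.5, 2.0}, a raw
// count of 10 in library 0 becomes 10 / 0.5 = 20 and the same count in
// library 1 becomes 10 / 2.0 = 5, putting both libraries on a common scale.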

void calc_geometric_scaling_factors(const vector<LocusCountList>& sample_compatible_count_table,
                                    vector<double>& scale_factors)
{
    vector<double> log_geom_means(sample_compatible_count_table.size(), 0.0);
    
    for (size_t i = 0; i < sample_compatible_count_table.size(); ++i)
    {
        const LocusCountList& p = sample_compatible_count_table[i];
        
        for (size_t j = 0; j < p.counts.size(); ++j)
        {
            //assert (log_geom_means.size() > j);
            //if (floor(p.counts[j]) > 0)
            //{
            log_geom_means[i] += (1.0/p.counts.size()) * log(floor(p.counts[j]));
            //}
        }
        //log_geom_means[i] = pow(log_geom_means[i], 1.0/(double)p.counts.size());
    }
    
    for (size_t j = 0; j < scale_factors.size(); ++j)
    {
        vector<double> tmp_counts;
        for (size_t i = 0; i < sample_compatible_count_table.size(); ++i)
        {
            if (log_geom_means[i] && !isinf(log_geom_means[i]) && !isnan(log_geom_means[i]) && floor(sample_compatible_count_table[i].counts[j]))
            {
                double gm = (double)log(floor(sample_compatible_count_table[i].counts[j])) - log_geom_means[i];
                assert (!isinf(gm));
                tmp_counts.push_back(gm);
            }
        }
        sort(tmp_counts.begin(), tmp_counts.end());
        if (!tmp_counts.empty())
            scale_factors[j] = exp(tmp_counts[tmp_counts.size()/2]);
        else
            scale_factors[j] = 1.0;
    }
}
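
// The routine above is DESeq-style median-of-ratios scaling, computed in
// log space: for each library j,
//
//     sf_j = exp( median_i [ log(c_ij) - (1/n) * sum_k log(c_ik) ] )
//
// i.e., the median over loci i of the locus's count in library j divided by
// that locus's geometric mean across all n libraries. Loci with a zero
// count anywhere (log geometric mean = -inf), and zero counts in library j
// itself, are skipped.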

void calc_classic_fpkm_scaling_factors(const vector<LocusCountList>& sample_compatible_count_table,
                                       vector<double>& scale_factors)
{
    // one running total per library, indexed the same way as scale_factors
    vector<double> total_counts(scale_factors.size(), 0.0);
    
    for (size_t i = 0; i < sample_compatible_count_table.size(); ++i)
    {
        const LocusCountList& p = sample_compatible_count_table[i];
        
        for (size_t j = 0; j < p.counts.size(); ++j)
        {
            total_counts[j] += floor(p.counts[j]);
        }
    }
    
    double all_library_counts = accumulate(total_counts.begin(), total_counts.end(), 0.0);
    if (all_library_counts == 0.0)
    {
        for (size_t i = 0; i < scale_factors.size(); ++i)
        {
            scale_factors[i] = 1.0;
        }
        return;
    }
    for (size_t i = 0; i < scale_factors.size(); ++i)
    {
        scale_factors[i] = total_counts[i] / all_library_counts;
    }
    
    double avg_scaling_factor = accumulate(scale_factors.begin(), scale_factors.end(), 0.0);
    avg_scaling_factor /= scale_factors.size();
    
    for (size_t i = 0; i < scale_factors.size(); ++i)
    {
        scale_factors[i] = scale_factors[i] / avg_scaling_factor;
    }
}
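
// In other words, each library's factor starts as its share of the total
// count mass, sf_j = T_j / sum_k T_k, and is then rescaled so the factors
// average to 1.0 across libraries; dividing a library's counts by sf_j then
// equalizes sequencing depth under the classic-FPKM scheme.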

void calc_quartile_scaling_factors(const vector<LocusCountList>& sample_compatible_count_table,
                                   vector<double>& scale_factors)
{
    if (sample_compatible_count_table.empty())
        return;
    
    vector<double> upper_quartiles(sample_compatible_count_table.front().counts.size(), 0.0);
    vector<double> total_common_masses(sample_compatible_count_table.front().counts.size(), 0.0);
    
    for (size_t i = 0; i < sample_compatible_count_table.front().counts.size(); ++i)
    {
        vector<double> common_scaled_counts;
        double total_common = 0.0;
        
        for (size_t j = 0; j < sample_compatible_count_table.size(); ++j)
        {
            //boost::shared_ptr<ReadGroupProperties> rg = bundle_factories[fac_idx];
            //double scaled_mass = scale_factors[fac_idx] * rg->total_map_mass();
            
            total_common += sample_compatible_count_table[j].counts[i];
            common_scaled_counts.push_back(sample_compatible_count_table[j].counts[i]);
        }
        
        sort(common_scaled_counts.begin(), common_scaled_counts.end());
        if (common_scaled_counts.empty())
            continue;
        
        int upper_quart_index = common_scaled_counts.size() * 0.75;
        double upper_quart_count = common_scaled_counts[upper_quart_index];
        upper_quartiles[i] = upper_quart_count;
        total_common_masses[i] = total_common;
    }
    
    long double total_mass = accumulate(total_common_masses.begin(), total_common_masses.end(), 0.0);
    long double total_norm_mass = accumulate(upper_quartiles.begin(), upper_quartiles.end(), 0.0);
    
    for (size_t i = 0; i < sample_compatible_count_table.front().counts.size(); ++i)
    {
        if (total_mass > 0)
        {
            double scaling_factor = upper_quartiles[i];
            scaling_factor /= (total_norm_mass / upper_quartiles.size());
            scale_factors[i] = scaling_factor;
        }
        else
        {
            scale_factors[i] = 1.0;
        }
    }
}
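
// Upper-quartile normalization in brief: each library's factor is its
// 75th-percentile locus count divided by the mean upper quartile over all
// n libraries,
//
//     sf_j = UQ_j / ( (1/n) * sum_k UQ_k )
//
// which is less sensitive to a handful of very highly expressed loci than
// total-count scaling.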

void calc_tmm_scaling_factors(const vector<LocusCountList>& sample_compatible_count_table,
                              vector<double>& scale_factors)
{
    // TMM normalization is not implemented; leave every library's factor at 1.0
    // (the caller sizes scale_factors to the number of libraries).
    scale_factors.assign(scale_factors.size(), 1.0);
}

static const int min_loci_for_fitting = 30;

struct SCVInterpolator
{
    void add_scv_pair(double est_scv, double true_scv)
    {
        true_scvs.push_back(true_scv);
        est_scvs.push_back(est_scv);
    }
    
    void finalize()
    {
        // sort the (est_scv, true_scv) pairs by estimated SCV and write them
        // back, so lower_bound() in interpolate_scv() sees an ordered table
        vector<pair<double, double> > pv;
        for (size_t i = 0; i < true_scvs.size(); ++i)
        {
            pv.push_back(make_pair(est_scvs[i], true_scvs[i]));
        }
        sort(pv.begin(), pv.end());
        for (size_t i = 0; i < pv.size(); ++i)
        {
            est_scvs[i] = pv[i].first;
            true_scvs[i] = pv[i].second;
        }
    }
    
    // This was built from the dispersion model interpolator - we should refactor these
    // into a single routine.
    double interpolate_scv(double est_scv)
    {
        // if (est_scv <= 0)
        //     return 0.0;
        
        if (est_scvs.size() < 2 || true_scvs.size() < 2)
        {
            return est_scv; // revert to poisson.
        }
        if (est_scv > est_scvs.back())
        {
            //fprintf(stderr, "Warning: extrapolating to the right\n");
            // extrapolate off the right end
            double x1_mean = est_scvs[est_scvs.size()-2];
            double x2_mean = est_scvs[est_scvs.size()-1];
            
            double y1_var = true_scvs[est_scvs.size()-2];
            double y2_var = true_scvs[est_scvs.size()-1];
            double slope = 0.0;
            if (x2_mean != x1_mean)
            {
                slope = (y2_var - y1_var) / (x2_mean - x1_mean);
            }
            else if (y1_var == y2_var)
            {
                assert (false); // should have a unique'd table
            }
            // linearly extrapolate from the last two points in the table
            double mean_interp = y2_var + slope * (est_scv - x2_mean);
            // if (mean_interp < est_scv)
            //     mean_interp = est_scv;
            assert (!isnan(mean_interp) && !isinf(mean_interp));
            return mean_interp;
        }
        else if (est_scv < est_scvs.front())
        {
            //fprintf(stderr, "Warning: extrapolating to the left\n");
            
            // If we're extrapolating to the left, our fit is too coarse, but
            // that probably means we don't need SCV bias correction at all.
            return est_scv;
        }
        
        vector<double>::const_iterator lb;
        lb = lower_bound(est_scvs.begin(),
                         est_scvs.end(),
                         est_scv);
        if (lb < est_scvs.end())
        {
            int d = lb - est_scvs.begin();
            if (*lb == est_scv || lb == est_scvs.begin())
            {
                double var = true_scvs[d];
                // if (var < est_scv) // revert to poisson if underdispersed
                //     var = est_scv;
                assert (!isnan(var) && !isinf(var));
                return var;
            }
            
            // in between two points on the scale.
            d--;
            
            if (d < 0)
            {
                fprintf(stderr, "ARG d < 0, d = %d\n", d);
            }
            if (d >= est_scvs.size())
            {
                fprintf(stderr, "ARG d >= est_scvs.size(), d = %d\n", d);
            }
            if (d >= true_scvs.size())
            {
                fprintf(stderr, "ARG d >= true_scvs.size(), d = %d\n", d);
            }
            
            double x1_mean = est_scvs[d];
            double x2_mean = est_scvs[d + 1];
            
            double y1_var = true_scvs[d];
            double y2_var = true_scvs[d + 1];
            double slope = 0.0;
            if (x2_mean != x1_mean)
            {
                slope = (y2_var - y1_var) / (x2_mean - x1_mean);
            }
            else if (y1_var == y2_var)
            {
                fprintf(stderr, "Warning: SCV table does not have unique keys!\n");
                assert (false); // should have a unique'd table
            }
            double mean_interp = true_scvs[d] + slope * (est_scv - est_scvs[d]);
            // if (mean_interp < est_scv) // revert to poisson if underdispersed
            //     mean_interp = est_scv;
            
            assert (!isnan(mean_interp) && !isinf(mean_interp));
            return mean_interp;
        }
        else
        {
            assert (!isnan(est_scv) && !isinf(est_scv));
            return est_scv; // revert to poisson assumption
        }
        
        return est_scv;
    }
    
private:
    vector<double> true_scvs;
    vector<double> est_scvs;
};
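
// Minimal usage sketch for SCVInterpolator (values made up):
//
//     SCVInterpolator interp;
//     interp.add_scv_pair(0.10, 0.12); // (estimated SCV, true SCV)
//     interp.add_scv_pair(0.50, 0.55);
//     interp.finalize();               // sort the table by estimated SCV
//     double corrected = interp.interpolate_scv(0.30);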

void build_scv_correction_fit(int nreps, int ngenes, int mean_count, SCVInterpolator& true_to_est_scv_table)
{
    setuplf();
    
    vector<pair<double, double> > alpha_vs_scv;
    
    boost::mt19937 rng(random_seed);
    vector<boost::random::negative_binomial_distribution<int, double> > nb_gens;
    
    vector<double> alpha_range;
    for (double a = 0.0002; a < 2.0; a += 0.0002)
    {
        alpha_range.push_back(a);
    }
    
    for (double a = 2; a < 100.0; a += 1)
    {
        alpha_range.push_back(a);
    }
    
    BOOST_FOREACH (double alpha, alpha_range)
    {
        double k = 1.0/alpha;
        double p = k / (k + mean_count);
        double r = (mean_count * p) / (1-p);
        
        //boost::random::negative_binomial_distribution<int, double> nb(r, p);
        
        boost::random::gamma_distribution<double> gamma(r, (1-p)/p);
        
        vector<double> scvs_for_alpha;
        vector<double> draws;
        for (size_t i = 0; i < ngenes; ++i)
        {
            LocusCountList locus_count("", nreps, 1, vector<string>(), vector<string>());
            for (size_t rep_idx = 0; rep_idx < nreps; ++rep_idx)
            {
                double gamma_draw = gamma(rng);
                if (gamma_draw == 0)
                {
                    locus_count.counts[rep_idx] = 0;
                    draws.push_back(0);
                }
                else
                {
                    boost::random::poisson_distribution<long, double> poisson(gamma_draw);
                    locus_count.counts[rep_idx] = poisson(rng);
                    draws.push_back(locus_count.counts[rep_idx]);
                    //fprintf(stderr, "%lg\t", locus_count.counts[rep_idx]);
                }
            }
            
            double mean = accumulate(locus_count.counts.begin(), locus_count.counts.end(), 0.0);
            if (mean == 0)
                continue;
            mean /= locus_count.counts.size();
            double var = 0.0;
            BOOST_FOREACH(double c, locus_count.counts)
            {
                var += (c - mean) * (c - mean);
            }
            var /= locus_count.counts.size();
            var *= (double)locus_count.counts.size() / (locus_count.counts.size() - 1); // Bessel's correction; force floating-point division
            
            double scv = var / (mean * mean);
            scvs_for_alpha.push_back(scv);
            //fprintf(stderr, " : mean = %lg, var = %lg, scv = %lg\n", mean, var, scv);
            
            //fprintf(stderr, "\n");
        }
        
        double mean = accumulate(draws.begin(), draws.end(), 0.0);
        mean /= draws.size();
        double var = 0.0;
        BOOST_FOREACH(double c, draws) // draws holds doubles; iterating as int would truncate
        {
            var += (c - mean) * (c - mean);
        }
        var /= draws.size();
        var *= (double)draws.size() / (draws.size() - 1); // Bessel's correction
        
        //fprintf(stderr, "##########\n");
        //fprintf(stderr, "mean = %lf, var = %lg\n", mean, var);
        if (scvs_for_alpha.size() > 0)
        {
            double mean_scv = accumulate(scvs_for_alpha.begin(), scvs_for_alpha.end(), 0.0);
            mean_scv /= scvs_for_alpha.size();
            //fprintf(stderr, "alpha = %lg scv = %lg\n", alpha, mean_scv);
            alpha_vs_scv.push_back(make_pair(alpha, mean_scv));
        }
    }
    
    //fprintf(stderr, "$$$$$$$$$\n");
    
    //sort (alpha_range.begin(), alpha_range.end());
    
    char namebuf[256];
    sprintf(namebuf, "trueSCV");
    vari* cm = createvar(namebuf, STREGULAR, alpha_vs_scv.size(), VDOUBLE);
    for (size_t i = 0; i < alpha_vs_scv.size(); ++i)
    {
        cm->dpr[i] = alpha_vs_scv[i].first;
    }
    
    sprintf(namebuf, "estSCV");
    vari* cv = createvar(namebuf, STREGULAR, alpha_vs_scv.size(), VDOUBLE);
    for (size_t i = 0; i < alpha_vs_scv.size(); ++i)
    {
        cv->dpr[i] = alpha_vs_scv[i].second;
    }
    
    char locfit_cmd[2048];
    sprintf(locfit_cmd, "locfit trueSCV~estSCV");
    
    locfit_dispatch(locfit_cmd);
    
    sprintf(namebuf, "domainSCV");
    vari* cd = createvar(namebuf, STREGULAR, alpha_vs_scv.size(), VDOUBLE);
    for (size_t i = 0; i < alpha_vs_scv.size(); ++i)
    {
        cd->dpr[i] = alpha_vs_scv[i].second;
    }
    
    sprintf(locfit_cmd, "fittedSCV=predict domainSCV");
    locfit_dispatch(locfit_cmd);
    
    int n = 0;
    sprintf(namebuf, "fittedSCV");
    vari* cp = findvar(namebuf, 1, &n);
    assert(cp != NULL);
    
    for (size_t i = 0; i < cp->n; ++i)
    {
        //fprintf(stderr, "%lg\t%lg\n", alpha_range[i], cp->dpr[i]);
        true_to_est_scv_table.add_scv_pair(alpha_range[i], cp->dpr[i]);
    }
    true_to_est_scv_table.finalize();
}
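
// The simulation above draws negative binomial counts via the gamma-Poisson
// mixture: an NB with mean mu and dispersion alpha (var = mu + alpha*mu^2)
// has size k = 1/alpha and p = k / (k + mu), and the code's
// r = mu * p / (1 - p) simplifies to k. Drawing
//
//     lambda ~ Gamma(shape = r, scale = (1 - p) / p),  X ~ Poisson(lambda)
//
// then yields X ~ NB(r, p), with mean mu and variance mu + mu^2 / k.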

void calculate_count_means_and_vars(const vector<LocusCountList>& sample_compatible_count_table,
                                    vector<pair<double, double> >& means_and_vars)
{
    for (size_t i = 0; i < sample_compatible_count_table.size(); ++i)
    {
        const LocusCountList& p = sample_compatible_count_table[i];
        double mean = accumulate(p.counts.begin(), p.counts.end(), 0.0);
        if (mean > 0.0 && p.counts.size() > 0)
            mean /= p.counts.size();
        
        double var = 0.0;
        double num_non_zero = 0;
        BOOST_FOREACH (double d, p.counts)
        {
            if (d > 0)
                num_non_zero++;
            var += (d - mean) * (d - mean);
        }
        if (var > 0.0 && p.counts.size() > 1)
        {
            var /= p.counts.size();
            var *= (double)p.counts.size() / (p.counts.size() - 1); // Bessel's correction; force floating-point division
        }
        means_and_vars.push_back(make_pair(mean, var));
    }
}
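
// The two-step rescaling above yields the unbiased sample variance: the sum
// of squared deviations is divided by n and then multiplied by n / (n - 1),
// which is equivalent to
//
//     var = (1 / (n - 1)) * sum_i (x_i - mean)^2
//
// (Bessel's correction, which matters for the small replicate counts
// typical here).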

boost::shared_ptr<MassDispersionModel>
fit_dispersion_model_helper(const string& condition_name,
                            const vector<double>& scale_factors,
                            const vector<LocusCountList>& sample_compatible_count_table)
{
    vector<pair<double, double> > compatible_means_and_vars;
    
    SCVInterpolator true_to_est_scv_table;
    
    int num_samples = sample_compatible_count_table.front().counts.size();
    if (no_scv_correction == false)
    {
        build_scv_correction_fit(num_samples, 10000, 100000, true_to_est_scv_table);
    }
    
    setuplf();
    
    calculate_count_means_and_vars(sample_compatible_count_table, compatible_means_and_vars);
    
    sort(compatible_means_and_vars.begin(), compatible_means_and_vars.end());
    
    vector<double> compatible_count_means;
    vector<double> raw_variances;
    
    for (size_t i = 0; i < compatible_means_and_vars.size(); ++i)
    {
        if (compatible_means_and_vars[i].first > 0 && compatible_means_and_vars[i].second > 0.0)
        {
            compatible_count_means.push_back(compatible_means_and_vars[i].first);
            raw_variances.push_back(compatible_means_and_vars[i].second);
        }
    }
    
    if (compatible_count_means.size() < min_loci_for_fitting)
    {
        boost::shared_ptr<MassDispersionModel> disperser;
        disperser = boost::shared_ptr<MassDispersionModel>(new PoissonDispersionModel(condition_name));
        
        return disperser;
    }
    
    vector<double> fitted_values;
    
    // WARNING: locfit doesn't like underscores - need camel case for
    // variable names
    
    char namebuf[256];
    sprintf(namebuf, "countMeans");
    vari* cm = createvar(namebuf, STREGULAR, compatible_count_means.size(), VDOUBLE);
    for (size_t i = 0; i < compatible_count_means.size(); ++i)
    {
        cm->dpr[i] = log(compatible_count_means[i]);
    }
    
    //sprintf(namebuf, "countSCV");
    sprintf(namebuf, "countVariances");
    vari* cv = createvar(namebuf, STREGULAR, raw_variances.size(), VDOUBLE);
    for (size_t i = 0; i < raw_variances.size(); ++i)
    {
        cv->dpr[i] = raw_variances[i];
        //cv->dpr[i] = raw_scvs[i];
    }
    
    char locfit_cmd[2048];
    sprintf(locfit_cmd, "locfit countVariances~countMeans family=gamma");
    
    locfit_dispatch(locfit_cmd);
    
    sprintf(locfit_cmd, "fittedVars=predict countMeans");
    locfit_dispatch(locfit_cmd);
    
    //sprintf(locfit_cmd, "prfit x fhat h nlx");
    //locfit_dispatch(locfit_cmd);
    
    double xim = 0;
    BOOST_FOREACH(double s, scale_factors)
    {
        if (s)
            xim += 1.0 / s;
    }
    xim /= scale_factors.size();
    
    int n = 0;
    sprintf(namebuf, "fittedVars");
    vari* cp = findvar(namebuf, 1, &n);
    assert(cp != NULL);
    for (size_t i = 0; i < cp->n; ++i)
    {
        // if (cp->dpr[i] >= 0)
        // {
        double mean = exp(cm->dpr[i]);
        double fitted_scv = (cp->dpr[i] - mean) / (mean * mean);
        double corrected_scv = true_to_est_scv_table.interpolate_scv(fitted_scv);
        double corrected_variance = mean + (corrected_scv * (mean * mean));
        double uncorrected_variance = mean + (fitted_scv * (mean * mean));
        //fitted_values.push_back(mean + (cp->dpr[i] - xim * mean));
        if (no_scv_correction == false && corrected_variance > uncorrected_variance)
            fitted_values.push_back(corrected_variance);
        else if (uncorrected_variance > 0)
            fitted_values.push_back(uncorrected_variance);
        else
            fitted_values.push_back(compatible_count_means[i]);
        // }
        // else
        // {
        //     fitted_values.push_back(compatible_count_means[i]);
        // }
    }
    
    boost::shared_ptr<MassDispersionModel> disperser;
    disperser = boost::shared_ptr<MassDispersionModel>(new MassDispersionModel(condition_name, compatible_count_means, raw_variances, fitted_values));
    if (dispersion_method == POISSON)
        disperser = boost::shared_ptr<MassDispersionModel>(new PoissonDispersionModel(condition_name));
    
    // for (map<string, pair<double, double> >::iterator itr = labeled_mv_table.begin();
    //      itr != labeled_mv_table.end();
    //      ++itr)
    // {
    //     string label = itr->first;
    //     disperser->set_compatible_mean_and_var(itr->first, itr->second);
    // }
    
    return disperser;
}
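
// The fit above models the mean-variance relationship of overdispersed
// counts: locfit regresses per-locus variance on log mean (gamma family),
// and each fitted variance is converted to a squared coefficient of
// variation, scv = (var - mu) / mu^2, so that the reassembled variance
//
//     var(mu) = mu + scv * mu^2
//
// has the negative binomial form. When SCV correction is enabled, the raw
// fitted SCV is first replaced with the de-biased value from the
// interpolation table.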

boost::shared_ptr<MassDispersionModel>
fit_dispersion_model(const string& condition_name,
                     const vector<double>& scale_factors,
                     const vector<LocusCountList>& sample_compatible_count_table)
{
    //
    //#if ENABLE_THREADS
    //    boost::mutex::scoped_lock lock(_locfit_lock);
    //#endif
    for (size_t i = 0; i < sample_compatible_count_table.size(); ++i)
    {
        if (sample_compatible_count_table[i].counts.size() <= 1)
        {
            // only one replicate - no point in fitting variance
            return boost::shared_ptr<MassDispersionModel>(new PoissonDispersionModel(condition_name));
        }
    }
#if ENABLE_THREADS
    _locfit_lock.lock();
#endif
    
    ProgressBar p_bar("Modeling fragment count overdispersion.", 0);
    
    int max_transcripts = 0;
    BOOST_FOREACH(const LocusCountList& L, sample_compatible_count_table)
    {
        if (L.num_transcripts > max_transcripts)
        {
            max_transcripts = L.num_transcripts;
        }
    }
    
    boost::shared_ptr<MassDispersionModel> model = fit_dispersion_model_helper(condition_name, scale_factors, sample_compatible_count_table);
    
#if ENABLE_THREADS
    _locfit_lock.unlock();
#endif
    return model;
}

void build_norm_table(const vector<LocusCountList>& full_count_table,
                      boost::shared_ptr<const map<string, LibNormStandards> > normalizing_standards,
                      vector<LocusCountList>& norm_table)
{
    // If we're using housekeeping genes or spike-in controls, select the rows
    // we'll be using from the full count table.
    if (normalizing_standards)
    {
        for (size_t i = 0; i < full_count_table.size(); ++i)
        {
            const vector<string>& gene_ids = full_count_table[i].gene_ids;
            const vector<string>& gene_short_names = full_count_table[i].gene_short_names;
            
            // If the row has an ID that's in the table, take it.
            map<string, LibNormStandards>::const_iterator g_id_itr = normalizing_standards->end();
            map<string, LibNormStandards>::const_iterator g_name_itr = normalizing_standards->end();
            
            for (size_t j = 0; j < gene_ids.size(); ++j)
            {
                g_id_itr = normalizing_standards->find(gene_ids[j]);
                if (g_id_itr != normalizing_standards->end())
                {
                    break;
                }
            }
            
            if (g_id_itr != normalizing_standards->end())
            {
                norm_table.push_back(full_count_table[i]);
                continue;
            }
            
            for (size_t j = 0; j < gene_short_names.size(); ++j)
            {
                g_name_itr = normalizing_standards->find(gene_short_names[j]);
                if (g_name_itr != normalizing_standards->end())
                {
                    break;
                }
            }
            
            if (g_name_itr != normalizing_standards->end())
            {
                norm_table.push_back(full_count_table[i]);
                continue;
            }
        }
    }
    else // otherwise, just take all rows.
    {
        norm_table = full_count_table;
    }
}
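
// A hypothetical standards table for the housekeeping-gene path above
// (LibNormStandards contents elided; keys may be gene IDs or short names):
//
//     map<string, LibNormStandards> standards;
//     standards["GAPDH"] = LibNormStandards();
//
// Rows of full_count_table whose gene_ids or gene_short_names match a key
// in this map are kept for normalization; all other rows are dropped.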

void normalize_counts(vector<boost::shared_ptr<ReadGroupProperties> >& all_read_groups)
{
    vector<LocusCountList> sample_compatible_count_table;
    vector<LocusCountList> sample_total_count_table;
    
    for (size_t i = 0; i < all_read_groups.size(); ++i)
    {
        boost::shared_ptr<ReadGroupProperties> rg_props = all_read_groups[i];
        const vector<LocusCount>& raw_compatible_counts = rg_props->raw_compatible_counts();
        const vector<LocusCount>& raw_total_counts = rg_props->raw_total_counts();
        
        for (size_t j = 0; j < raw_compatible_counts.size(); ++j)
        {
            if (sample_compatible_count_table.size() == j)
            {
                const string& locus_id = raw_compatible_counts[j].locus_desc;
                int num_transcripts = raw_compatible_counts[j].num_transcripts;
                
                const vector<string>& gene_ids = raw_compatible_counts[j].gene_ids;
                const vector<string>& gene_short_names = raw_compatible_counts[j].gene_short_names;
                
                sample_compatible_count_table.push_back(LocusCountList(locus_id, all_read_groups.size(), num_transcripts, gene_ids, gene_short_names));
                sample_total_count_table.push_back(LocusCountList(locus_id, all_read_groups.size(), num_transcripts, gene_ids, gene_short_names));
            }
            double scaled = raw_compatible_counts[j].count;
            //sample_compatible_count_table[j].counts[i] = scaled * unscaling_factor;
            sample_compatible_count_table[j].counts[i] = floor(scaled);
            sample_total_count_table[j].counts[i] = floor(raw_total_counts[j].count);
            
            assert(sample_compatible_count_table[j].counts[i] >= 0 && !isinf(sample_compatible_count_table[j].counts[i]));
        }
    }
    
    vector<double> scale_factors(all_read_groups.size(), 0.0);
    
    vector<LocusCountList> norm_table;
    
    if (use_compat_mass)
    {
        build_norm_table(sample_compatible_count_table, lib_norm_standards, norm_table);
    }
    else // use_total_mass
    {
        assert(use_total_mass);
        build_norm_table(sample_total_count_table, lib_norm_standards, norm_table);
    }
    
    if (lib_norm_method == GEOMETRIC)
    {
        calc_geometric_scaling_factors(norm_table, scale_factors);
    }
    else if (lib_norm_method == CLASSIC_FPKM)
    {
        calc_classic_fpkm_scaling_factors(norm_table, scale_factors);
    }
    else if (lib_norm_method == QUARTILE)
    {
        calc_quartile_scaling_factors(norm_table, scale_factors);
    }
    else if (lib_norm_method == TMM)
    {
        calc_tmm_scaling_factors(norm_table, scale_factors);
    }
    else
    {
        assert (false);
    }
    
    for (size_t i = 0; i < all_read_groups.size(); ++i)
    {
        boost::shared_ptr<ReadGroupProperties> rg_props = all_read_groups[i];
        rg_props->internal_scale_factor(scale_factors[i]);
    }
    
    assert(sample_compatible_count_table.size() == sample_total_count_table.size());
    
    // Transform raw counts to the common scale
    for (size_t i = 0; i < sample_compatible_count_table.size(); ++i)
    {
        LocusCountList& p = sample_compatible_count_table[i];
        for (size_t j = 0; j < p.counts.size(); ++j)
        {
            assert (scale_factors.size() > j);
            p.counts[j] *= (1.0 / scale_factors[j]);
        }
        
        LocusCountList& t = sample_total_count_table[i];
        for (size_t j = 0; j < t.counts.size(); ++j)
        {
            assert (scale_factors.size() > j);
            t.counts[j] *= (1.0 / scale_factors[j]);
        }
    }
    
    for (size_t i = 0; i < all_read_groups.size(); ++i)
    {
        boost::shared_ptr<ReadGroupProperties> rg_props = all_read_groups[i];
        vector<LocusCount> scaled_compatible_counts;
        for (size_t j = 0; j < sample_compatible_count_table.size(); ++j)
        {
            const string& locus_id = sample_compatible_count_table[j].locus_desc;
            double count = sample_compatible_count_table[j].counts[i];
            int num_transcripts = sample_compatible_count_table[j].num_transcripts;
            
            const vector<string>& gids = sample_compatible_count_table[j].gene_ids;
            const vector<string>& gnms = sample_compatible_count_table[j].gene_short_names;
            
            LocusCount locus_count(locus_id, count, num_transcripts, gids, gnms);
            scaled_compatible_counts.push_back(locus_count);
        }
        rg_props->common_scale_compatible_counts(scaled_compatible_counts);
    }
    
    for (size_t i = 0; i < all_read_groups.size(); ++i)
    {
        boost::shared_ptr<ReadGroupProperties> rg_props = all_read_groups[i];
        vector<LocusCount> scaled_total_counts;
        for (size_t j = 0; j < sample_total_count_table.size(); ++j)
        {
            const string& locus_id = sample_total_count_table[j].locus_desc;
            double count = sample_total_count_table[j].counts[i];
            int num_transcripts = sample_total_count_table[j].num_transcripts;
            
            const vector<string>& gids = sample_total_count_table[j].gene_ids;
            const vector<string>& gnms = sample_total_count_table[j].gene_short_names;
            
            LocusCount locus_count(locus_id, count, num_transcripts, gids, gnms);
            scaled_total_counts.push_back(locus_count);
        }
        rg_props->common_scale_total_counts(scaled_total_counts);
    }
    
    double avg_total_common_scaled_count = 0.0;
    
    for (size_t fac_idx = 0; fac_idx < all_read_groups.size(); ++fac_idx)
    {
        double total_common = 0.0;
        if (use_compat_mass)
        {
            for (size_t j = 0; j < sample_compatible_count_table.size(); ++j)
            {
                total_common += sample_compatible_count_table[j].counts[fac_idx];
            }
        }
        else
        {
            for (size_t j = 0; j < sample_total_count_table.size(); ++j)
            {
                total_common += sample_total_count_table[j].counts[fac_idx];
            }
        }
        
        avg_total_common_scaled_count += (1.0 / all_read_groups.size()) * total_common;
    }
    
    BOOST_FOREACH(boost::shared_ptr<ReadGroupProperties> rg, all_read_groups)
    {
        rg->normalized_map_mass(avg_total_common_scaled_count);
    }
}