1 /*
2 * differential.cpp
3 * cufflinks
4 *
5 * Created by Cole Trapnell on 3/15/10.
6 * Copyright 2010 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #include <algorithm>
11 #include <functional>
12 #include <numeric>
13 #include <boost/numeric/ublas/vector.hpp>
14 #include <boost/numeric/ublas/matrix.hpp>
15 #include <boost/numeric/ublas/matrix_proxy.hpp>
16 #include <boost/math/special_functions/gamma.hpp>
17 #include <boost/math/distributions/chi_squared.hpp>
18 #include <boost/math/distributions/fisher_f.hpp>
19
20 #include "abundances.h"
21 #include "differential.h"
22 #include "clustering.h"
23 #include "differential.h"
24 #include "sampling.h"
25
26 using namespace std;
27
28 double min_read_count = 10;
29
30 #if ENABLE_THREADS
31 boost::mutex _launcher_lock;
32 boost::mutex locus_thread_pool_lock;
33 int locus_curr_threads = 0;
34 int locus_num_threads = 0;
35
decr_pool_count()36 void decr_pool_count()
37 {
38 locus_thread_pool_lock.lock();
39 locus_curr_threads--;
40 locus_thread_pool_lock.unlock();
41 }
42 #endif
43
SampleDifference()44 SampleDifference::SampleDifference():
45 sample_1(-1),
46 sample_2(-1),
47 value_1(0.0),
48 value_2(0.0),
49 test_stat(0.0),
50 p_value(1.0),
51 corrected_p(1.0),
52 test_status(NOTEST),
53 significant(false) {}
54
55
find_locus(const string & locus_id)56 TestLauncher::launcher_sample_table::iterator TestLauncher::find_locus(const string& locus_id)
57 {
58 launcher_sample_table::iterator itr = _samples.begin();
59 for(; itr != _samples.end(); ++itr)
60 {
61 if (itr->first == locus_id)
62 return itr;
63 }
64 return _samples.end();
65 }
66
register_locus(const string & locus_id)67 void TestLauncher::register_locus(const string& locus_id)
68 {
69 #if ENABLE_THREADS
70 boost::mutex::scoped_lock lock(_launcher_lock);
71 #endif
72
73 launcher_sample_table::iterator itr = find_locus(locus_id);
74 if (itr == _samples.end())
75 {
76 pair<launcher_sample_table::iterator, bool> p;
77 vector<boost::shared_ptr<SampleAbundances> >abs(_orig_workers);
78 _samples.push_back(make_pair(locus_id, abs));
79 }
80 }
81
abundance_avail(const string & locus_id,boost::shared_ptr<SampleAbundances> ab,size_t factory_id)82 void TestLauncher::abundance_avail(const string& locus_id,
83 boost::shared_ptr<SampleAbundances> ab,
84 size_t factory_id)
85 {
86 #if ENABLE_THREADS
87 boost::mutex::scoped_lock lock(_launcher_lock);
88 #endif
89 launcher_sample_table::iterator itr = find_locus(locus_id);
90 if (itr == _samples.end())
91 {
92 assert(false);
93 }
94 itr->second[factory_id] = ab;
95 //itr->second(factory_id] = ab;
96 }
97
98 // Note: this routine should be called under lock - it doesn't
99 // acquire the lock itself.
all_samples_reported_in(vector<boost::shared_ptr<SampleAbundances>> & abundances)100 bool TestLauncher::all_samples_reported_in(vector<boost::shared_ptr<SampleAbundances> >& abundances)
101 {
102 BOOST_FOREACH (boost::shared_ptr<SampleAbundances> ab, abundances)
103 {
104 if (!ab)
105 {
106 return false;
107 }
108 }
109 return true;
110 }
111
112 #if ENABLE_THREADS
113 boost::mutex test_storage_lock; // don't modify the above struct without locking here
114 #endif
115
116 // Note: this routine should be called under lock - it doesn't
117 // acquire the lock itself.
perform_testing(vector<boost::shared_ptr<SampleAbundances>> abundances)118 void TestLauncher::perform_testing(vector<boost::shared_ptr<SampleAbundances> > abundances)
119 {
120 //#if ENABLE_THREADS
121 // _launcher_lock.lock();
122 //#endif
123 assert (abundances.size() == _orig_workers);
124
125 // Just verify that all the loci from each factory match up.
126 for (size_t i = 1; i < abundances.size(); ++i)
127 {
128 const SampleAbundances& curr = *(abundances[i]);
129 const SampleAbundances& prev = *(abundances[i-1]);
130
131 assert (curr.locus_tag == prev.locus_tag);
132
133 const AbundanceGroup& s1 = curr.transcripts;
134 const AbundanceGroup& s2 = prev.transcripts;
135
136 assert (s1.abundances().size() == s2.abundances().size());
137
138 for (size_t j = 0; j < s1.abundances().size(); ++j)
139 {
140 assert (s1.abundances()[j]->description() == s2.abundances()[j]->description());
141 }
142 }
143
144 test_differential(abundances.front()->locus_tag, abundances, _contrasts, *_tests, *_tracking);
145 //#if ENABLE_THREADS
146 // _launcher_lock.unlock();
147 //#endif
148 }
149
record_tracking_data(vector<boost::shared_ptr<SampleAbundances>> & abundances)150 void TestLauncher::record_tracking_data(vector<boost::shared_ptr<SampleAbundances> >& abundances)
151 {
152 assert (abundances.size() == _orig_workers);
153
154 // Just verify that all the loci from each factory match up.
155 for (size_t i = 1; i < abundances.size(); ++i)
156 {
157 const SampleAbundances& curr = *(abundances[i]);
158 const SampleAbundances& prev = *(abundances[i-1]);
159
160 assert (curr.locus_tag == prev.locus_tag);
161
162 const AbundanceGroup& s1 = curr.transcripts;
163 const AbundanceGroup& s2 = prev.transcripts;
164
165 assert (s1.abundances().size() == s2.abundances().size());
166
167 for (size_t j = 0; j < s1.abundances().size(); ++j)
168 {
169 assert (s1.abundances()[j]->description() == s2.abundances()[j]->description());
170 }
171 }
172
173 #if ENABLE_THREADS
174 test_storage_lock.lock();
175 #endif
176
177 // Add all the transcripts, CDS groups, TSS groups, and genes to their
178 // respective FPKM tracking table. Whether this is a time series or an
179 // all pairs comparison, we should be calculating and reporting FPKMs for
180 // all objects in all samples
181 for (size_t i = 0; i < abundances.size(); ++i)
182 {
183 const AbundanceGroup& ab_group = abundances[i]->transcripts;
184 //fprintf(stderr, "[%d] count = %lg\n",i, ab_group.num_fragments());
185 BOOST_FOREACH (boost::shared_ptr<Abundance> ab, ab_group.abundances())
186 {
187 add_to_tracking_table(i, *ab, _tracking->isoform_fpkm_tracking);
188 //assert (_tracking->isoform_fpkm_tracking.num_fragments_by_replicate().empty() == false);
189 }
190
191 BOOST_FOREACH (AbundanceGroup& ab, abundances[i]->cds)
192 {
193 add_to_tracking_table(i, ab, _tracking->cds_fpkm_tracking);
194 }
195
196 BOOST_FOREACH (AbundanceGroup& ab, abundances[i]->primary_transcripts)
197 {
198 add_to_tracking_table(i, ab, _tracking->tss_group_fpkm_tracking);
199 }
200
201 BOOST_FOREACH (AbundanceGroup& ab, abundances[i]->genes)
202 {
203 add_to_tracking_table(i, ab, _tracking->gene_fpkm_tracking);
204 }
205 }
206
207 #if ENABLE_THREADS
208 test_storage_lock.unlock();
209 #endif
210
211 }
212
test_finished_loci()213 vector<vector<boost::shared_ptr<SampleAbundances> > > TestLauncher::test_finished_loci()
214 {
215 #if ENABLE_THREADS
216 _launcher_lock.lock();
217 #endif
218
219 vector<vector<boost::shared_ptr<SampleAbundances> > > samples_for_testing;
220
221 launcher_sample_table::iterator itr = _samples.begin();
222 while(itr != _samples.end())
223 {
224 if (all_samples_reported_in(itr->second))
225 {
226 if (_p_bar)
227 {
228 //verbose_msg("Estimating expression in locus [%s]\n", itr->second.front()->locus_tag.c_str());
229 _p_bar->update(itr->second.front()->locus_tag.c_str(), 1);
230 }
231 record_tracking_data(itr->second);
232 if (!_track_only)
233 samples_for_testing.push_back(itr->second);
234
235 // Removes the samples that have already been tested and transferred to the tracking tables,
236 itr = _samples.erase(itr);
237 }
238 else
239 {
240
241 ++itr;
242 }
243 }
244
245 #if ENABLE_THREADS
246 _launcher_lock.unlock();
247 #endif
248 return samples_for_testing;
249 // for (size_t i = 0; i < samples_for_testing.size(); ++i)
250 // {
251 // vector<boost::shared_ptr<SampleAbundances> > samples_i = samples_for_testing[i];
252 // perform_testing(samples_i);
253 // }
254 }
255
256 //// Sampling-based test:
test_diffexp(const FPKMContext & curr,const FPKMContext & prev)257 SampleDifference test_diffexp(const FPKMContext& curr,
258 const FPKMContext& prev)
259 {
260 bool performed_test = false;
261
262 SampleDifference test = SampleDifference();
263
264 test.p_value = 1.0;
265 test.differential = 0.0;
266 test.test_stat = 0.0;
267 test.test_status = NOTEST;
268 test.value_1 = 0;
269 test.value_2 = 0;
270 test.significant = 0;
271 test.corrected_p = 1.0;
272
273 double p_value = 1.0;
274
275 double differential = 0.0;
276
277 // static const long double min_gamma_params = 1e-20;
278
279 vector<double> null_log_ratio_samples;
280
281 static const size_t num_null_ratio_samples = 10000;
282
283 boost::random::mt19937 rng(random_seed);
284
285 if ((curr.FPKM != 0 || prev.FPKM != 0) && (prev.fpkm_samples.size() > 0 && curr.fpkm_samples.size() > 0))
286 {
287 boost::random::uniform_int_distribution<> prev_sampler(0, prev.fpkm_samples.size()-1);
288 boost::random::uniform_int_distribution<> curr_sampler(0, curr.fpkm_samples.size()-1);
289
290 vector<double> prev_rep_samples;
291 vector<double> curr_rep_samples;
292
293
294 for (size_t i = 0; i != curr.tracking_info_per_rep.size(); ++i)
295 {
296 if (curr.tracking_info_per_rep[i].status == NUMERIC_LOW_DATA)
297 continue;
298 curr_rep_samples.push_back(curr.tracking_info_per_rep[i].fpkm);
299 }
300
301 for (size_t i = 0; i != prev.tracking_info_per_rep.size(); ++i)
302 {
303 if (prev.tracking_info_per_rep[i].status == NUMERIC_LOW_DATA)
304 continue;
305 prev_rep_samples.push_back(prev.tracking_info_per_rep[i].fpkm);
306 }
307
308
309 double curr_fpkm = accumulate(curr_rep_samples.begin(), curr_rep_samples.end(), 0.0);
310 if (curr_rep_samples.size() > 0)
311 curr_fpkm /= curr_rep_samples.size();
312
313 double prev_fpkm = accumulate(prev_rep_samples.begin(), prev_rep_samples.end(), 0.0);
314 if (prev_rep_samples.size() > 0)
315 prev_fpkm /= prev_rep_samples.size();
316
317
318 if (curr_fpkm > 0.0 && prev_fpkm > 0.0)
319 differential = log2(curr_fpkm) - log2(prev_fpkm);
320 else if (curr_fpkm)
321 differential = numeric_limits<double>::infinity();
322 else if (prev_fpkm)
323 differential = -numeric_limits<double>::infinity();
324
325 // set the asymptotic delta method test stat for backward compatibility
326 double curr_log_fpkm_var = (curr.FPKM_variance) / (curr.FPKM * curr.FPKM);
327 double prev_log_fpkm_var = (prev.FPKM_variance) / (prev.FPKM * prev.FPKM);
328 double numerator = log(curr.FPKM / prev.FPKM);
329 double denominator = sqrt(prev_log_fpkm_var + curr_log_fpkm_var);
330 if (denominator > 0.0)
331 test.test_stat = numerator / denominator;
332 else if (numerator > 0)
333 test.test_stat = numeric_limits<double>::infinity();
334 else if (numerator < 0)
335 test.test_stat = -numeric_limits<double>::infinity();
336 else
337 test.test_stat = 0;
338
339
340 test.test_stat = numerator / denominator;
341
342
343 // Draw from prev fpkm_samples to make the first half of the null
344 for (size_t i = 0; i < num_null_ratio_samples; ++i)
345 {
346 double curr_set_sample = 0.0;
347 for (size_t k = 0; k < curr_rep_samples.size(); ++k)
348 {
349 int next_sample_idx = prev_sampler(rng);
350 if (next_sample_idx >= 0 && next_sample_idx < prev.fpkm_samples.size())
351 curr_set_sample += prev.fpkm_samples[next_sample_idx] / (double)curr_rep_samples.size();
352 }
353
354 double prev_set_sample = 0.0;
355 for (size_t k = 0; k < prev_rep_samples.size(); ++k)
356 {
357 int next_sample_idx = prev_sampler(rng);
358 if (next_sample_idx >= 0 && next_sample_idx < prev.fpkm_samples.size())
359 prev_set_sample += prev.fpkm_samples[next_sample_idx] / (double)prev_rep_samples.size();
360 }
361
362 double null_ratio_sample = 0.0;
363 if (curr_set_sample > 0.0 && prev_set_sample > 0.0)
364 null_ratio_sample = log2(curr_set_sample) - log2(prev_set_sample);
365 else if (curr_set_sample > 0.0)
366 null_ratio_sample = numeric_limits<double>::infinity();
367 else if (prev_set_sample)
368 null_ratio_sample = -numeric_limits<double>::infinity();
369
370 null_log_ratio_samples.push_back(null_ratio_sample);
371 }
372
373 // Draw from curr's fpkm_samples to make the other half of the null
374 for (size_t i = 0; i < num_null_ratio_samples; ++i)
375 {
376 double curr_set_sample = 0.0;
377 for (size_t k = 0; k < curr_rep_samples.size(); ++k)
378 {
379 int next_sample_idx = curr_sampler(rng);
380 if (next_sample_idx >= 0 && next_sample_idx < curr.fpkm_samples.size())
381 curr_set_sample += curr.fpkm_samples[next_sample_idx] / (double)curr_rep_samples.size();
382 }
383
384 double prev_set_sample = 0.0;
385 for (size_t k = 0; k < prev_rep_samples.size(); ++k)
386 {
387 int next_sample_idx = curr_sampler(rng);
388 if (next_sample_idx >= 0 && next_sample_idx < curr.fpkm_samples.size())
389 prev_set_sample += curr.fpkm_samples[next_sample_idx] / (double)prev_rep_samples.size();
390 }
391
392 double null_ratio_sample = 0.0;
393 if (curr_set_sample > 0.0 && prev_set_sample > 0.0)
394 null_ratio_sample = log2(curr_set_sample) - log2(prev_set_sample);
395 else if (curr_set_sample > 0.0)
396 null_ratio_sample = numeric_limits<double>::infinity();
397 else if (prev_set_sample)
398 null_ratio_sample = -numeric_limits<double>::infinity();
399
400 null_log_ratio_samples.push_back(null_ratio_sample);
401 }
402
403
404 sort(null_log_ratio_samples.begin(), null_log_ratio_samples.end());
405
406 double lower_tail_val;
407 double upper_tail_val;
408 if (differential > 0)
409 {
410 upper_tail_val = differential;
411 lower_tail_val = -differential;
412 }
413 else
414 {
415 upper_tail_val = -differential;
416 lower_tail_val = differential;
417 }
418
419 vector<double>::iterator lower_tail_null_range_iter = upper_bound(null_log_ratio_samples.begin(), null_log_ratio_samples.end(), lower_tail_val);
420 vector<double>::iterator upper_tail_null_range_iter = upper_bound(null_log_ratio_samples.begin(), null_log_ratio_samples.end(), upper_tail_val);
421 size_t num_smaller_lower_tail = lower_tail_null_range_iter - null_log_ratio_samples.begin();
422 size_t num_smaller_upper_tail = upper_tail_null_range_iter - null_log_ratio_samples.begin();
423
424 size_t num_samples_more_extreme = (null_log_ratio_samples.size() - num_smaller_upper_tail) + num_smaller_lower_tail;
425
426 p_value = num_samples_more_extreme / (double)null_log_ratio_samples.size();
427 if (p_value == 0)
428 p_value = 1.0 / (double)null_log_ratio_samples.size();
429 if (p_value > 1)
430 p_value = 1.0;
431
432
433
434 performed_test = true;
435
436 //test = SampleDifference(sample1, sample2, prev.FPKM, curr.FPKM, stat, p_value, transcript_group_id);
437 test.p_value = p_value;
438 test.differential = differential;
439 //test.test_stat = stat;
440 test.value_1 = prev.FPKM;
441 test.value_2 = curr.FPKM;
442 }
443 else
444 {
445 test.p_value = 1.0;
446 test.test_stat = 0.0;
447 test.value_1 = prev.FPKM;
448 test.value_2 = curr.FPKM;
449 test.differential = 0;
450 performed_test = false;
451 }
452
453 test.test_status = performed_test ? OK : NOTEST;
454 return test;
455 }
456
457 SampleDiffMetaDataTable meta_data_table;
458 #if ENABLE_THREADS
459 boost::mutex meta_data_lock;
460 #endif
461
get_metadata(const string description)462 boost::shared_ptr<SampleDifferenceMetaData> get_metadata(const string description)
463 {
464 #if ENABLE_THREADS
465 boost::mutex::scoped_lock lock(meta_data_lock);
466 #endif
467 pair<SampleDiffMetaDataTable::iterator, bool> p;
468 p = meta_data_table.insert(make_pair(description, new SampleDifferenceMetaData()));
469 return p.first->second;
470 }
471
472 // This performs between-group tests on isoforms or TSS groupings in a single
473 // locus, on two different samples.
get_de_tests(const string & description,const FPKMContext & prev_abundance,const FPKMContext & curr_abundance,bool enough_reads)474 SampleDifference get_de_tests(const string& description,
475 const FPKMContext& prev_abundance,
476 const FPKMContext& curr_abundance,
477 //SampleDiffs& de_tests,
478 bool enough_reads)
479 {
480 int total_iso_de_tests = 0;
481
482 SampleDifference test;
483
484 const FPKMContext& r1 = curr_abundance;
485 const FPKMContext& r2 = prev_abundance;
486
487 if (curr_abundance.status == NUMERIC_FAIL ||
488 prev_abundance.status == NUMERIC_FAIL ||
489 prev_abundance.status == NUMERIC_HI_DATA ||
490 curr_abundance.status == NUMERIC_HI_DATA)
491 {
492 test = test_diffexp(r1, r2);
493 test.test_stat = 0;
494 test.p_value = 1.0;
495 test.differential = 0.0;
496 if (curr_abundance.status == NUMERIC_FAIL ||
497 prev_abundance.status == NUMERIC_FAIL)
498 {
499 test.test_status = FAIL;
500 }
501 else if (prev_abundance.status == NUMERIC_HI_DATA ||
502 curr_abundance.status == NUMERIC_HI_DATA)
503 {
504 test.test_status = HIDATA;
505 }
506 }
507 else if (curr_abundance.status == NUMERIC_LOW_DATA ||
508 prev_abundance.status == NUMERIC_LOW_DATA)
509 {
510 // perform the test, but mark it as not significant and don't add it to the
511 // pile. This way we don't penalize for multiple testing, but users can still
512 // see the fold change.
513 test = test_diffexp(r1, r2);
514 test.test_stat = 0;
515 test.p_value = 1.0;
516 //test.differential = 0.0;
517
518 test.test_status = LOWDATA;
519 }
520 else // at least one is OK, the other might be LOW_DATA
521 {
522 test = test_diffexp(r1, r2);
523 if (test.test_status == OK && enough_reads)
524 {
525 total_iso_de_tests++;
526 }
527 else
528 {
529 test.test_status = NOTEST;
530 test.test_stat = 0;
531 test.p_value = 1.0;
532 //test.differential = 0.0;
533 }
534 // if (test_diffexp(r1, r2, test))
535 // {
536 // total_iso_de_tests++;
537 // }
538 // else
539 // {
540 // test.test_stat = 0;
541 // test.p_value = 1.0;
542 // test.differential = 0.0;
543 // }
544 // if (enough_reads)
545 // test.test_status = OK;
546 // else
547 // test.test_status = NOTEST;
548
549 }
550
551
552 //inserted.first->second = test;
553
554 //return make_pair(total_iso_de_tests, inserted.first);
555 return test;
556 }
557
test_js(const AbundanceGroup & prev_abundance,const AbundanceGroup & curr_abundance,double & js,double & p_val)558 bool test_js(const AbundanceGroup& prev_abundance,
559 const AbundanceGroup& curr_abundance,
560 double& js,
561 double& p_val)
562 {
563 vector<Eigen::VectorXd> sample_kappas;
564 // Eigen::VectorXd curr_kappas(Eigen::VectorXd::Zero(curr_abundance.abundances().size()));
565 // for (size_t i = 0; i < curr_abundance.abundances().size(); ++i)
566 // {
567 // curr_kappas(i) = curr_abundance.abundances()[i]->kappa();
568 // }
569 //
570 // Eigen::VectorXd prev_kappas(Eigen::VectorXd::Zero(prev_abundance.abundances().size()));
571 // for (size_t i = 0; i < prev_abundance.abundances().size(); ++i)
572 // {
573 // prev_kappas(i) = prev_abundance.abundances()[i]->kappa();
574 // }
575
576
577 ////////////
578
579 Eigen::VectorXd prev_kappas = Eigen::VectorXd::Zero(prev_abundance.abundances().size());
580 for (size_t i = 0; i < prev_abundance.abundances().size(); ++i)
581 {
582 FPKMPerReplicateTable ab_i_by_rep = prev_abundance.abundances()[i]->FPKM_by_replicate();
583 for (FPKMPerReplicateTable::const_iterator itr = ab_i_by_rep.begin(); itr != ab_i_by_rep.end(); ++itr)
584 {
585 prev_kappas(i) += itr->second;
586 }
587 prev_kappas(i) /= ab_i_by_rep.size();
588 }
589
590 if (prev_kappas.sum() > 0)
591 prev_kappas /= prev_kappas.sum();
592
593 Eigen::VectorXd curr_kappas = Eigen::VectorXd::Zero(curr_abundance.abundances().size());
594 for (size_t i = 0; i < curr_abundance.abundances().size(); ++i)
595 {
596 FPKMPerReplicateTable ab_i_by_rep = curr_abundance.abundances()[i]->FPKM_by_replicate();
597 for (FPKMPerReplicateTable::const_iterator itr = ab_i_by_rep.begin(); itr != ab_i_by_rep.end(); ++itr)
598 {
599 curr_kappas(i) += itr->second;
600 }
601 curr_kappas(i) /= ab_i_by_rep.size();
602 }
603
604 if (curr_kappas.sum() > 0)
605 curr_kappas /= curr_kappas.sum();
606
607 ////////////
608
609 sample_kappas.push_back(prev_kappas);
610 sample_kappas.push_back(curr_kappas);
611
612 js = jensen_shannon_distance(sample_kappas);
613
614 if (isinf(js) || isnan(js))
615 return false;
616
617 vector<double> null_js_samples;
618
619 static const size_t num_null_js_samples = 10000;
620
621 boost::random::mt19937 rng(random_seed);
622 boost::random::uniform_int_distribution<> prev_sampler(0, prev_abundance.member_fpkm_samples().size()-1);
623 boost::random::uniform_int_distribution<> curr_sampler(0, curr_abundance.member_fpkm_samples().size()-1);
624
625 for (size_t i = 0; i < num_null_js_samples; ++i)
626 {
627 // Draw k values, from prev's fpkm_samples to make the first half of the null, where k is the number of replicates in *curr*, and compute their average
628 Eigen::VectorXd curr_set_sample = Eigen::VectorXd::Zero(curr_abundance.abundances().size());
629 for (size_t k = 0; k < curr_abundance.rg_props().size(); ++k)
630 {
631 int next_sample_idx = prev_sampler(rng);
632 if (next_sample_idx >= 0 && next_sample_idx < prev_abundance.member_fpkm_samples().size())
633 curr_set_sample += prev_abundance.member_fpkm_samples()[next_sample_idx] / (double)curr_abundance.rg_props().size();
634 }
635 double total_curr_set_sample_fpkm = curr_set_sample.sum();
636 if (total_curr_set_sample_fpkm > 0.0)
637 curr_set_sample /= total_curr_set_sample_fpkm;
638
639 // Draw k values, from prev's fpkm_samples to make the first half of the null, where k is the number of replicates in *prev*, and compute their average
640 Eigen::VectorXd prev_set_sample = Eigen::VectorXd::Zero(prev_abundance.abundances().size());
641 for (size_t k = 0; k < prev_abundance.rg_props().size(); ++k)
642 {
643 int next_sample_idx = prev_sampler(rng);
644 if (next_sample_idx >= 0 && next_sample_idx < prev_abundance.member_fpkm_samples().size())
645 prev_set_sample += prev_abundance.member_fpkm_samples()[next_sample_idx] / (double)prev_abundance.rg_props().size();
646 }
647 double total_prev_set_sample_fpkm = prev_set_sample.sum();
648 if (total_prev_set_sample_fpkm > 0.0)
649 prev_set_sample /= total_prev_set_sample_fpkm;
650
651 vector<Eigen::VectorXd> sample_kappas;
652
653 sample_kappas.push_back(curr_set_sample);
654 sample_kappas.push_back(prev_set_sample);
655
656 double js_sample = jensen_shannon_distance(sample_kappas);
657
658 //cerr << curr_set_sample.transpose() << " vs. " << prev_set_sample.transpose() << " : " << js_sample << endl;
659
660 null_js_samples.push_back(js_sample);
661 }
662
663 for (size_t i = 0; i < num_null_js_samples; ++i)
664 {
665 // Draw k values, from curr's fpkm_samples to make the first half of the null, where k is the number of replicates in *curr*, and compute their average
666 Eigen::VectorXd curr_set_sample = Eigen::VectorXd::Zero(curr_abundance.abundances().size());
667 for (size_t k = 0; k < curr_abundance.rg_props().size(); ++k)
668 {
669 int next_sample_idx = curr_sampler(rng);
670 if (next_sample_idx >= 0 && next_sample_idx < curr_abundance.member_fpkm_samples().size())
671 curr_set_sample += curr_abundance.member_fpkm_samples()[next_sample_idx] / (double)curr_abundance.rg_props().size();
672 }
673 double total_curr_set_sample_fpkm = curr_set_sample.sum();
674 if (total_curr_set_sample_fpkm > 0.0)
675 curr_set_sample /= total_curr_set_sample_fpkm;
676
677 // Draw k values, from curr's fpkm_samples to make the first half of the null, where k is the number of replicates in *prev*, and compute their average
678 Eigen::VectorXd prev_set_sample = Eigen::VectorXd::Zero(curr_abundance.abundances().size());
679 for (size_t k = 0; k < prev_abundance.rg_props().size(); ++k)
680 {
681 int next_sample_idx = curr_sampler(rng);
682 if (next_sample_idx >= 0 && next_sample_idx < curr_abundance.member_fpkm_samples().size())
683 prev_set_sample += curr_abundance.member_fpkm_samples()[next_sample_idx] / (double)prev_abundance.rg_props().size();
684 }
685 double total_prev_set_sample_fpkm = prev_set_sample.sum();
686 if (total_prev_set_sample_fpkm > 0.0)
687 prev_set_sample /= total_prev_set_sample_fpkm;
688
689 vector<Eigen::VectorXd> sample_kappas;
690
691 sample_kappas.push_back(curr_set_sample);
692 sample_kappas.push_back(prev_set_sample);
693
694 double js_sample = jensen_shannon_distance(sample_kappas);
695
696 null_js_samples.push_back(js_sample);
697 }
698
699 sort(null_js_samples.begin(), null_js_samples.end());
700
701 double upper_tail_val = js;
702
703 vector<double>::iterator upper_tail_null_range_iter = upper_bound(null_js_samples.begin(), null_js_samples.end(), upper_tail_val);
704
705 size_t num_smaller_upper_tail = upper_tail_null_range_iter - null_js_samples.begin();
706
707 size_t num_samples_more_extreme = (null_js_samples.size() - num_smaller_upper_tail);
708
709 p_val = num_samples_more_extreme / (double)null_js_samples.size();
710 if (p_val == 0)
711 p_val = 1.0 / (double)null_js_samples.size();
712 if (p_val > 1)
713 p_val = 1.0;
714
715 return true;
716 }
717
718
719 // This performs within-group tests on a set of isoforms or a set of TSS groups.
720 // This is a way of looking for meaningful differential splicing or differential
721 // promoter use.
get_ds_tests(const AbundanceGroup & prev_abundance,const AbundanceGroup & curr_abundance,bool enough_reads)722 SampleDifference get_ds_tests(const AbundanceGroup& prev_abundance,
723 const AbundanceGroup& curr_abundance,
724 // SampleDiffs& diff_tests,
725 bool enough_reads)
726 {
727 SampleDifference test;
728
729 test.p_value = 1.0;
730 test.differential = 0.0;
731 test.test_stat = 0.0;
732 test.test_status = NOTEST;
733 test.value_1 = 0;
734 test.value_2 = 0;
735 test.significant = 0;
736 test.corrected_p = 1.0;
737
738
739 AbundanceStatus prev_status = curr_abundance.status();
740 AbundanceStatus curr_status = prev_abundance.status();
741
742 if (prev_abundance.abundances().size() == 1 ||
743 prev_abundance.num_fragments() == 0 ||
744 curr_abundance.num_fragments() == 0 ||
745 prev_abundance.member_fpkm_samples().size() == 0 ||
746 curr_abundance.member_fpkm_samples().size() == 0)
747 {
748 test.p_value = 1;
749 test.value_1 = 0;
750 test.value_2 = 0;
751 test.differential = 0;
752 test.test_status = NOTEST;
753 }
754 else if (prev_abundance.abundances().size() > 1 &&
755 /*prev_abundance.has_member_with_status(NUMERIC_LOW_DATA) == false &&
756 filtered_curr.has_member_with_status(NUMERIC_LOW_DATA) == false &&*/
757 prev_status == NUMERIC_OK && prev_abundance.num_fragments() > 0 &&
758 curr_status == NUMERIC_OK && curr_abundance.num_fragments() > 0)
759 {
760 vector<ublas::vector<double> > sample_kappas;
761 ublas::vector<double> curr_kappas(curr_abundance.abundances().size());
762
763 for (size_t i = 0; i < curr_abundance.abundances().size(); ++i)
764 {
765 curr_kappas(i) = curr_abundance.abundances()[i]->kappa();
766 }
767
768 ublas::vector<double> prev_kappas(prev_abundance.abundances().size());
769 for (size_t i = 0; i < prev_abundance.abundances().size(); ++i)
770 {
771 prev_kappas(i) = prev_abundance.abundances()[i]->kappa();
772 }
773
774 sample_kappas.push_back(prev_kappas);
775 sample_kappas.push_back(curr_kappas);
776
777 double js = 0.0;
778 double p_val = 1.0;
779
780 bool success;
781 success = test_js(prev_abundance, curr_abundance, js, p_val);
782
783 if (js == 0.0 || success == false)
784 {
785 test.test_stat = 0;
786 test.p_value = 1.0;
787 test.value_1 = 0;
788 test.value_2 = 0;
789 test.differential = 0;
790 test.test_status = NOTEST;
791 }
792 else
793 {
794 //test.test_stat = 0;
795 test.p_value = p_val;
796 test.value_1 = 0;
797 test.value_2 = 0;
798 test.differential = js;
799 test.test_status = enough_reads ? OK : NOTEST;
800 }
801 }
802 else // we won't even bother with the JS-based testing in LOWDATA cases.
803 {
804 if (prev_status == NUMERIC_OK && curr_status == NUMERIC_OK &&
805 prev_abundance.has_member_with_status(NUMERIC_LOW_DATA) == false &&
806 curr_abundance.has_member_with_status(NUMERIC_LOW_DATA) == false)
807 test.test_status = NOTEST;
808 else if (prev_status == NUMERIC_FAIL || curr_status == NUMERIC_FAIL)
809 test.test_status = FAIL;
810 else
811 test.test_status = LOWDATA;
812
813 test.test_stat = 0;
814 test.p_value = 0.0;
815 test.differential = 0.0;
816 }
817
818 return test;
819 }
820
make_ref_tag(const string & ref,char classcode)821 string make_ref_tag(const string& ref, char classcode)
822 {
823 char tag_buf[1024];
824
825 sprintf(tag_buf,
826 "%s(%c)",
827 ref.c_str(),
828 classcode);
829
830 return string(tag_buf);
831 }
832
bundle_locus_tag(const RefSequenceTable & rt,const HitBundle & bundle)833 string bundle_locus_tag(const RefSequenceTable& rt,
834 const HitBundle& bundle)
835 {
836 char locus_buf[1024];
837 RefID bundle_chr_id = bundle.ref_id();
838 assert (bundle_chr_id != 0);
839 const char* chr_name = rt.get_name(bundle_chr_id);
840
841 sprintf(locus_buf,
842 "%s:%d-%d",
843 chr_name,
844 bundle.left(),
845 bundle.right());
846
847 return string(locus_buf);
848 }
849
850 struct LocusVarianceInfo
851 {
852 int factory_id;
853 double count_mean;
854 double count_empir_var;
855 double locus_count_fitted_var;
856 double isoform_fitted_var_sum;
857 double cross_replicate_js;
858 int num_transcripts;
859 double bayes_gamma_trace;
860 double empir_gamma_trace;
861 vector<double> gamma;
862 vector<double> gamma_var;
863 vector<double> gamma_bootstrap_var;
864 vector<string> transcript_ids;
865 vector<double> count_sharing;
866 double locus_salient_frags;
867 double locus_total_frags;
868
869 };
870
871 #if ENABLE_THREADS
872 boost::mutex variance_info_lock; // don't modify the above struct without locking here
873 #endif
874
875 vector<LocusVarianceInfo> locus_variance_info_table;
876
877 // We'll use this tracking table to collect per replicate counts for each
878 // transcript, so we can re-fit the variance model.
879 FPKMTrackingTable transcript_count_tracking;
880
881
sample_worker(bool non_empty,boost::shared_ptr<HitBundle> bundle,const RefSequenceTable & rt,ReplicatedBundleFactory & sample_factory,boost::shared_ptr<SampleAbundances> abundance,size_t factory_id,boost::shared_ptr<TestLauncher> launcher,bool calculate_variance)882 void sample_worker(bool non_empty,
883 boost::shared_ptr<HitBundle> bundle,
884 const RefSequenceTable& rt,
885 ReplicatedBundleFactory& sample_factory,
886 boost::shared_ptr<SampleAbundances> abundance,
887 size_t factory_id,
888 boost::shared_ptr<TestLauncher> launcher,
889 bool calculate_variance)
890 {
891 #if ENABLE_THREADS
892 boost::this_thread::at_thread_exit(decr_pool_count);
893 #endif
894
895 char bundle_label_buf[2048];
896 sprintf(bundle_label_buf,
897 "%s:%d-%d",
898 rt.get_name(bundle->ref_id()),
899 bundle->left(),
900 bundle->right());
901 string locus_tag = bundle_label_buf;
902
903 if (!non_empty || (bias_run && bundle->ref_scaffolds().size() != 1)) // Only learn on single isoforms
904 {
905 #if !ENABLE_THREADS
906 // If Cuffdiff was built without threads, we need to manually invoke
907 // the testing functor, which will check to see if all the workers
908 // are done, and if so, perform the cross sample testing.
909 launcher->abundance_avail(locus_tag, abundance, factory_id);
910 vector<vector<boost::shared_ptr<SampleAbundances> > > to_be_tested = launcher->test_finished_loci();
911 launcher->perform_testing(to_be_tested);
912 //launcher();
913 #endif
914 return;
915 }
916
917 abundance->cluster_mass = bundle->mass();
918
919 launcher->register_locus(locus_tag);
920
921 abundance->locus_tag = locus_tag;
922
923 bool perform_cds_analysis = false;
924 bool perform_tss_analysis = false;
925
926 BOOST_FOREACH(boost::shared_ptr<Scaffold> s, bundle->ref_scaffolds())
927 {
928 if (s->annotated_tss_id() != "")
929 {
930 perform_tss_analysis = final_est_run;
931 }
932 if (s->annotated_protein_id() != "")
933 {
934 perform_cds_analysis = final_est_run;
935 }
936 }
937
938 std::vector<boost::shared_ptr<BundleFactory> > factories = sample_factory.factories();
939 vector<boost::shared_ptr<PrecomputedExpressionBundleFactory> > hit_factories;
940 for (size_t i = 0; i < factories.size(); ++i)
941 {
942 boost::shared_ptr<BundleFactory> pFac = factories[i];
943 boost::shared_ptr<PrecomputedExpressionBundleFactory> pBundleFac = boost::dynamic_pointer_cast<PrecomputedExpressionBundleFactory> (pFac);
944 if (pBundleFac)
945 {
946 // If we get here, this factory refers to a pre computed expression object.
947
948 hit_factories.push_back(pBundleFac);
949 }
950 }
951
952 if (hit_factories.size() == factories.size())
953 {
954 merge_precomputed_expression_worker(boost::cref(locus_tag),
955 hit_factories,
956 boost::ref(*abundance),
957 bundle,
958 perform_cds_analysis,
959 perform_tss_analysis,
960 calculate_variance);
961 }
962 else if (hit_factories.empty())
963 {
964 set<boost::shared_ptr<ReadGroupProperties const> > rg_props;
965 for (size_t i = 0; i < sample_factory.factories().size(); ++i)
966 {
967 boost::shared_ptr<BundleFactory> bf = sample_factory.factories()[i];
968 rg_props.insert(bf->read_group_properties());
969 }
970
971 sample_abundance_worker(boost::cref(locus_tag),
972 boost::cref(rg_props),
973 boost::ref(*abundance),
974 bundle,
975 perform_cds_analysis,
976 perform_tss_analysis,
977 calculate_variance);
978 }
979 else
980 {
981 fprintf (stderr, "Error: mixing pre-computed input with SAM/BAM files not yet supported!\n");
982 exit(1);
983 }
984 ///////////////////////////////////////////////
985
986
987 BOOST_FOREACH(boost::shared_ptr<Scaffold> ref_scaff, bundle->ref_scaffolds())
988 {
989 ref_scaff->clear_hits();
990 }
991
992 launcher->abundance_avail(locus_tag, abundance, factory_id);
993 vector<vector<boost::shared_ptr<SampleAbundances> > > to_be_tested = launcher->test_finished_loci();
994
995 for (size_t i = 0; i < to_be_tested.size(); ++i)
996 launcher->perform_testing(to_be_tested[i]);
997
998 #if !ENABLE_THREADS
999 // If Cuffdiff was built without threads, we need to manually invoke
1000 // the testing functor, which will check to see if all the workers
1001 // are done, and if so, perform the cross sample testing.
1002 //launcher->test_finished_loci();
1003 #endif
1004
1005 }
1006
dump_locus_variance_info(const string & filename)1007 void dump_locus_variance_info(const string& filename)
1008 {
1009 #if ENABLE_THREADS
1010 variance_info_lock.lock();
1011 #endif
1012
1013 FILE* fdump = fopen(filename.c_str(), "w");
1014
1015 fprintf(fdump,
1016 "condition\tdescription\tlocus_counts\tempir_var\tlocus_fit_var\tsum_iso_fit_var\tcross_replicate_js\tnum_transcripts\tbayes_gamma_trace\tempir_gamma_trace\tcount_mean\tgamma_var\tlocus_salient_frags\tlocus_total_frags\tcount_sharing\n");
1017 BOOST_FOREACH (LocusVarianceInfo& L, locus_variance_info_table)
1018 {
1019 for (size_t i = 0; i < L.gamma.size(); ++i)
1020 {
1021 fprintf(fdump, "%d\t%s\t%lg\t%lg\t%lg\t%lg\t%lg\t%d\t%lg\t%lg\t%lg\t%lg\t%lg\t%lg\t%lg\n", L.factory_id, L.transcript_ids[i].c_str(), L.count_mean, L.count_empir_var, L.locus_count_fitted_var, L.isoform_fitted_var_sum, L.cross_replicate_js, L.num_transcripts, L.bayes_gamma_trace, L.empir_gamma_trace,L.gamma[i],L.gamma_var[i], L.locus_salient_frags, L.locus_total_frags, L.count_sharing[i]);
1022 }
1023
1024 }
1025
1026 #if ENABLE_THREADS
1027 variance_info_lock.unlock();
1028 #endif
1029 }
1030
filter_group_for_js_testing(vector<vector<AbundanceGroup>> & source_groups)1031 void filter_group_for_js_testing(vector<vector<AbundanceGroup> >& source_groups)
1032 {
1033 if (source_groups.empty())
1034 return;
1035
1036 // iterate over transcript groups
1037 for (size_t ab_group_idx = 0; ab_group_idx < source_groups[0].size(); ++ab_group_idx)
1038 {
1039 vector<bool> to_keep(source_groups[0][ab_group_idx].abundances().size(), false);
1040 // iterate over samples
1041 for (size_t sample_idx = 0; sample_idx < source_groups.size(); ++sample_idx)
1042 {
1043 // iterate over each member of the current abundance group
1044 AbundanceGroup& ab_group = source_groups[sample_idx][ab_group_idx];
1045 for (size_t ab_idx = 0; ab_idx < ab_group.abundances().size(); ++ab_idx)
1046 {
1047 const Abundance& ab = *(ab_group.abundances()[ab_idx]);
1048 if (ab.num_fragments() && ab.effective_length())
1049 {
1050 double frags_per_kb = ab.num_fragments() / (ab.effective_length() / 1000.0);
1051 if (frags_per_kb >= min_read_count)
1052 to_keep[ab_idx] = true;
1053 }
1054
1055 }
1056 }
1057
1058 // Now that we know which ones we want to keep, get rid of the rest
1059 for (size_t sample_idx = 0; sample_idx < source_groups.size(); ++sample_idx)
1060 {
1061 AbundanceGroup& ab_group = source_groups[sample_idx][ab_group_idx];
1062 AbundanceGroup f = ab_group;
1063 ab_group.filter_group(to_keep, f);
1064 ab_group = f;
1065 }
1066 }
1067 }
1068
1069 // The two functions below just clear the FPKM samples and other data used to estimate
1070 // the FPKM distributions during testing. Once testing for a locus is complete,
1071 // we can throw that stuff out. We do so to save memory over the run.
clear_samples_from_fpkm_tracking_table(const string & locus_desc,FPKMTrackingTable & fpkm_tracking)1072 void clear_samples_from_fpkm_tracking_table(const string& locus_desc, FPKMTrackingTable& fpkm_tracking)
1073 {
1074 FPKMTrackingTable::iterator itr = fpkm_tracking.find(locus_desc);
1075 if (itr == fpkm_tracking.end())
1076 return;
1077
1078 for (size_t i = 0; i < itr->second.fpkm_series.size(); ++i)
1079 {
1080 vector<double>& fpkm_samples = itr->second.fpkm_series[i].fpkm_samples;
1081 fpkm_samples.clear();
1082 std::vector<double>().swap(fpkm_samples);
1083 //itr->second.fpkm_series[i].fpkm_samples.swap(vector<double>(itr->second.fpkm_series[i].fpkm_samples));
1084 }
1085 }
1086
clear_samples_from_tracking_table(boost::shared_ptr<SampleAbundances> sample,Tracking & tracking)1087 void clear_samples_from_tracking_table(boost::shared_ptr<SampleAbundances> sample, Tracking& tracking)
1088 {
1089 for (size_t k = 0; k < sample->transcripts.abundances().size(); ++k)
1090 {
1091 const Abundance& abundance = *(sample->transcripts.abundances()[k]);
1092 const string& desc = abundance.description();
1093 clear_samples_from_fpkm_tracking_table(desc, tracking.isoform_fpkm_tracking);
1094 }
1095
1096 for (size_t k = 0; k < sample->cds.size(); ++k)
1097 {
1098 const Abundance& abundance = sample->cds[k];
1099 const string& desc = abundance.description();
1100 clear_samples_from_fpkm_tracking_table(desc, tracking.cds_fpkm_tracking);
1101 }
1102
1103 for (size_t k = 0; k < sample->primary_transcripts.size(); ++k)
1104 {
1105 const Abundance& abundance = sample->primary_transcripts[k];
1106 const string& desc = abundance.description();
1107 clear_samples_from_fpkm_tracking_table(desc, tracking.tss_group_fpkm_tracking);
1108 }
1109
1110 for (size_t k = 0; k < sample->genes.size(); ++k)
1111 {
1112 const Abundance& abundance = sample->genes[k];
1113 const string& desc = abundance.description();
1114 clear_samples_from_fpkm_tracking_table(desc, tracking.gene_fpkm_tracking);
1115 }
1116 }
1117
1118 int total_tests = 0;
test_differential(const string & locus_tag,const vector<boost::shared_ptr<SampleAbundances>> & samples,const vector<pair<size_t,size_t>> & contrasts,Tests & tests,Tracking & tracking)1119 void test_differential(const string& locus_tag,
1120 const vector<boost::shared_ptr<SampleAbundances> >& samples,
1121 const vector<pair<size_t, size_t> >& contrasts,
1122 Tests& tests,
1123 Tracking& tracking)
1124 {
1125 if (samples.empty())
1126 return;
1127
1128 if (no_differential == true)
1129 {
1130 //#if ENABLE_THREADS
1131 // test_storage_lock.unlock();
1132 //#endif
1133 for (size_t i = 0; i < samples.size(); ++i)
1134 {
1135 clear_samples_from_tracking_table(samples[i], tracking);
1136 }
1137 return;
1138 }
1139
1140 vector<vector<AbundanceGroup> > filtered_primary_trans_groups;
1141 vector<vector<AbundanceGroup> > filtered_promoter_groups;
1142 vector<vector<AbundanceGroup> > filtered_cds_groups;
1143
1144 for (size_t i = 0; i < samples.size(); ++i)
1145 {
1146 filtered_primary_trans_groups.push_back(samples[i]->primary_transcripts);
1147 filtered_promoter_groups.push_back(samples[i]->gene_primary_transcripts);
1148 filtered_cds_groups.push_back(samples[i]->gene_cds);
1149 }
1150
1151 filter_group_for_js_testing(filtered_primary_trans_groups);
1152 filter_group_for_js_testing(filtered_promoter_groups);
1153 filter_group_for_js_testing(filtered_cds_groups);
1154
1155
1156 // Perform pairwise significance testing between samples. If this is a
1157 // time series, only test between successive pairs of samples, as supplied
1158 // by the user.
1159
1160 for (size_t contrast_idx = 0; contrast_idx < contrasts.size(); ++contrast_idx)
1161 {
1162 size_t i = contrasts[contrast_idx].first;
1163 size_t j = contrasts[contrast_idx].second;
1164 // bool enough_reads = (samples[i]->cluster_mass >= min_read_count ||
1165 // samples[j]->cluster_mass >= min_read_count);
1166 assert (samples[i]->transcripts.abundances().size() ==
1167 samples[j]->transcripts.abundances().size());
1168 for (size_t k = 0; k < samples[i]->transcripts.abundances().size(); ++k)
1169 {
1170 const Abundance& curr_abundance = *(samples[j]->transcripts.abundances()[k]);
1171 const Abundance& prev_abundance = *(samples[i]->transcripts.abundances()[k]);
1172 const string& desc = curr_abundance.description();
1173 FPKMTrackingTable::iterator itr = tracking.isoform_fpkm_tracking.find(desc);
1174 assert (itr != tracking.isoform_fpkm_tracking.end());
1175
1176 bool enough_reads = false;
1177 if (curr_abundance.num_fragments() && curr_abundance.effective_length())
1178 {
1179 double frags_per_kb = curr_abundance.num_fragments() / (curr_abundance.effective_length() / 1000.0);
1180 if (frags_per_kb >= min_read_count)
1181 enough_reads = true;
1182 }
1183 if (prev_abundance.num_fragments() && prev_abundance.effective_length())
1184 {
1185 double frags_per_kb = prev_abundance.num_fragments() / (prev_abundance.effective_length() / 1000.0);
1186 if (frags_per_kb >= min_read_count)
1187 enough_reads = true;
1188 }
1189
1190 // if (enough_reads)
1191 // {
1192 // if (is_badly_fit(curr_abundance) || is_badly_fit(prev_abundance))
1193 // enough_reads = false;
1194 // }
1195
1196 SampleDifference test;
1197 test = get_de_tests(desc,
1198 itr->second.fpkm_series[j],
1199 itr->second.fpkm_series[i],
1200 //tests.isoform_de_tests[i][j],
1201 enough_reads);
1202 #if ENABLE_THREADS
1203 test_storage_lock.lock();
1204 #endif
1205 pair<SampleDiffs::iterator, bool> inserted;
1206 inserted = tests.isoform_de_tests[i][j].insert(make_pair(desc,
1207 test));
1208
1209 boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
1210
1211 meta_data->gene_ids = curr_abundance.gene_id();
1212 meta_data->gene_names = curr_abundance.gene_name();
1213 meta_data->protein_ids = curr_abundance.protein_id();
1214 meta_data->locus_desc = curr_abundance.locus_tag();
1215 meta_data->description = curr_abundance.description();
1216 inserted.first->second.meta_data = meta_data;
1217 #if ENABLE_THREADS
1218 test_storage_lock.unlock();
1219 #endif
1220 }
1221
1222 for (size_t k = 0; k < samples[i]->cds.size(); ++k)
1223 {
1224 const Abundance& curr_abundance = samples[i]->cds[k];
1225 const Abundance& prev_abundance = samples[j]->cds[k];
1226
1227 const string& desc = curr_abundance.description();
1228 FPKMTrackingTable::iterator itr = tracking.cds_fpkm_tracking.find(desc);
1229 assert (itr != tracking.cds_fpkm_tracking.end());
1230
1231 bool enough_reads = false;
1232 if (curr_abundance.num_fragments() && curr_abundance.effective_length())
1233 {
1234 double frags_per_kb = curr_abundance.num_fragments() / (curr_abundance.effective_length() / 1000.0);
1235 if (frags_per_kb >= min_read_count)
1236 enough_reads = true;
1237 }
1238 if (prev_abundance.num_fragments() && prev_abundance.effective_length())
1239 {
1240 double frags_per_kb = prev_abundance.num_fragments() / (prev_abundance.effective_length() / 1000.0);
1241 if (frags_per_kb >= min_read_count)
1242 enough_reads = true;
1243 }
1244
1245 // if (enough_reads)
1246 // {
1247 // if (is_badly_fit(curr_abundance) || is_badly_fit(prev_abundance))
1248 // enough_reads = false;
1249 // }
1250
1251
1252 SampleDifference test;
1253 test = get_de_tests(desc,
1254 itr->second.fpkm_series[j],
1255 itr->second.fpkm_series[i],
1256 //tests.cds_de_tests[i][j],
1257 enough_reads);
1258 #if ENABLE_THREADS
1259 test_storage_lock.lock();
1260 #endif
1261
1262 pair<SampleDiffs::iterator, bool> inserted;
1263 inserted = tests.cds_de_tests[i][j].insert(make_pair(desc,
1264 test));
1265
1266 boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
1267
1268 meta_data->gene_ids = curr_abundance.gene_id();
1269 meta_data->gene_names = curr_abundance.gene_name();
1270 meta_data->protein_ids = curr_abundance.protein_id();
1271 meta_data->locus_desc = curr_abundance.locus_tag();
1272 meta_data->description = curr_abundance.description();
1273 inserted.first->second.meta_data = meta_data;
1274 #if ENABLE_THREADS
1275 test_storage_lock.unlock();
1276 #endif
1277 }
1278
1279 for (size_t k = 0; k < samples[i]->primary_transcripts.size(); ++k)
1280 {
1281 const Abundance& curr_abundance = samples[i]->primary_transcripts[k];
1282 const Abundance& prev_abundance = samples[j]->primary_transcripts[k];
1283
1284 const string& desc = curr_abundance.description();
1285 FPKMTrackingTable::iterator itr = tracking.tss_group_fpkm_tracking.find(desc);
1286 assert (itr != tracking.tss_group_fpkm_tracking.end());
1287
1288 bool enough_reads = false;
1289 if (curr_abundance.num_fragments() && curr_abundance.effective_length())
1290 {
1291 double frags_per_kb = curr_abundance.num_fragments() / (curr_abundance.effective_length() / 1000.0);
1292 if (frags_per_kb >= min_read_count)
1293 enough_reads = true;
1294 }
1295 if (prev_abundance.num_fragments() && prev_abundance.effective_length())
1296 {
1297 double frags_per_kb = prev_abundance.num_fragments() / (prev_abundance.effective_length() / 1000.0);
1298 if (frags_per_kb >= min_read_count)
1299 enough_reads = true;
1300 }
1301
1302 // if (enough_reads)
1303 // {
1304 // if (is_badly_fit(curr_abundance) || is_badly_fit(prev_abundance))
1305 // enough_reads = false;
1306 // }
1307
1308
1309 SampleDifference test;
1310 test = get_de_tests(desc,
1311 itr->second.fpkm_series[j],
1312 itr->second.fpkm_series[i],
1313 //tests.tss_group_de_tests[i][j],
1314 enough_reads);
1315 #if ENABLE_THREADS
1316 test_storage_lock.lock();
1317 #endif
1318 pair<SampleDiffs::iterator, bool> inserted;
1319 inserted = tests.tss_group_de_tests[i][j].insert(make_pair(desc,
1320 test));
1321
1322
1323 boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
1324
1325 meta_data->gene_ids = curr_abundance.gene_id();
1326 meta_data->gene_names = curr_abundance.gene_name();
1327 meta_data->protein_ids = curr_abundance.protein_id();
1328 meta_data->locus_desc = curr_abundance.locus_tag();
1329 meta_data->description = curr_abundance.description();
1330 inserted.first->second.meta_data = meta_data;
1331 #if ENABLE_THREADS
1332 test_storage_lock.unlock();
1333 #endif
1334 }
1335
1336 for (size_t k = 0; k < samples[i]->genes.size(); ++k)
1337 {
1338 const AbundanceGroup& curr_abundance = samples[i]->genes[k];
1339 const AbundanceGroup& prev_abundance = samples[j]->genes[k];
1340 const string& desc = curr_abundance.description();
1341 FPKMTrackingTable::iterator itr = tracking.gene_fpkm_tracking.find(desc);
1342 assert (itr != tracking.gene_fpkm_tracking.end());
1343
1344 bool enough_reads = false;
1345 if (curr_abundance.num_fragments() && curr_abundance.effective_length())
1346 {
1347 double frags_per_kb = curr_abundance.num_fragments() / (curr_abundance.effective_length() / 1000.0);
1348 if (frags_per_kb >= min_read_count)
1349 enough_reads = true;
1350 }
1351 if (prev_abundance.num_fragments() && prev_abundance.effective_length())
1352 {
1353 double frags_per_kb = prev_abundance.num_fragments() / (prev_abundance.effective_length() / 1000.0);
1354 if (frags_per_kb >= min_read_count)
1355 enough_reads = true;
1356 }
1357
1358 SampleDifference test;
1359 test = get_de_tests(desc,
1360 itr->second.fpkm_series[j],
1361 itr->second.fpkm_series[i],
1362 //tests.gene_de_tests[i][j],
1363 enough_reads);
1364 #if ENABLE_THREADS
1365 test_storage_lock.lock();
1366 #endif
1367 pair<SampleDiffs::iterator, bool> inserted;
1368 inserted = tests.gene_de_tests[i][j].insert(make_pair(desc,
1369 test));
1370
1371 boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
1372
1373 meta_data->gene_ids = curr_abundance.gene_id();
1374 meta_data->gene_names = curr_abundance.gene_name();
1375 meta_data->protein_ids = curr_abundance.protein_id();
1376 meta_data->locus_desc = curr_abundance.locus_tag();
1377 meta_data->description = curr_abundance.description();
1378 inserted.first->second.meta_data = meta_data;
1379 #if ENABLE_THREADS
1380 test_storage_lock.unlock();
1381 #endif
1382 }
1383
1384 // Skip all the JS based testing for genes with an isoform switch?
1385 if (no_js_tests)
1386 continue;
1387
1388 // FIXME: the code below will not properly test for differential
1389 // splicing/promoter use when a gene (e.g.) occupies two
1390 // disjoint bundles. We need to store the covariance matrices (etc)
1391 // in the FPKMContexts to handle that case properly.
1392
1393 // Differential promoter use
1394 for (size_t k = 0; k < samples[i]->gene_primary_transcripts.size(); ++k)
1395 {
1396 const AbundanceGroup& curr_abundance = filtered_promoter_groups[i][k];
1397 const AbundanceGroup& prev_abundance = filtered_promoter_groups[j][k];
1398 const string& desc = curr_abundance.description();
1399
1400 bool enough_reads = (curr_abundance.FPKM_by_replicate().size() >= min_reps_for_js_test &&
1401 prev_abundance.FPKM_by_replicate().size() >= min_reps_for_js_test);
1402 SampleDifference test;
1403 test = get_ds_tests(prev_abundance,
1404 curr_abundance,
1405 enough_reads);
1406
1407 // The filtered group might be empty, so let's grab metadata from
1408 // the unfiltered group
1409 boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
1410
1411 meta_data->gene_ids = samples[i]->gene_primary_transcripts[k].gene_id();
1412 meta_data->gene_names = samples[i]->gene_primary_transcripts[k].gene_name();
1413 meta_data->protein_ids = samples[i]->gene_primary_transcripts[k].protein_id();
1414 meta_data->locus_desc = samples[i]->gene_primary_transcripts[k].locus_tag();
1415 meta_data->description = samples[i]->gene_primary_transcripts[k].description();
1416
1417 test.meta_data = meta_data;
1418
1419 #if ENABLE_THREADS
1420 test_storage_lock.lock();
1421 #endif
1422
1423 pair<SampleDiffs::iterator, bool> inserted;
1424 inserted = tests.diff_promoter_tests[i][j].insert(make_pair(desc,test));
1425 inserted.first->second = test;
1426
1427 #if ENABLE_THREADS
1428 test_storage_lock.unlock();
1429 #endif
1430 }
1431
1432 // Differential coding sequence output
1433 for (size_t k = 0; k < samples[i]->gene_cds.size(); ++k)
1434 {
1435 const AbundanceGroup& curr_abundance = filtered_cds_groups[i][k];
1436 const AbundanceGroup& prev_abundance = filtered_cds_groups[j][k];
1437 const string& desc = curr_abundance.description();
1438
1439 bool enough_reads = (curr_abundance.FPKM_by_replicate().size() >= min_reps_for_js_test &&
1440 prev_abundance.FPKM_by_replicate().size() >= min_reps_for_js_test);
1441 SampleDifference test;
1442 test = get_ds_tests(prev_abundance,
1443 curr_abundance,
1444 enough_reads);
1445
1446 // The filtered group might be empty, so let's grab metadata from
1447 // the unfiltered group
1448 boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
1449
1450 meta_data->gene_ids = samples[i]->gene_cds[k].gene_id();
1451 meta_data->gene_names = samples[i]->gene_cds[k].gene_name();
1452 meta_data->protein_ids = samples[i]->gene_cds[k].protein_id();
1453 meta_data->locus_desc = samples[i]->gene_cds[k].locus_tag();
1454 meta_data->description = samples[i]->gene_cds[k].description();
1455
1456 test.meta_data = meta_data;
1457
1458 #if ENABLE_THREADS
1459 test_storage_lock.lock();
1460 #endif
1461 pair<SampleDiffs::iterator, bool> inserted;
1462 inserted = tests.diff_cds_tests[i][j].insert(make_pair(desc,test));
1463 inserted.first->second = test;
1464
1465 #if ENABLE_THREADS
1466 test_storage_lock.unlock();
1467 #endif
1468 }
1469
1470 // Differential splicing of primary transcripts
1471 for (size_t k = 0; k < samples[i]->primary_transcripts.size(); ++k)
1472 {
1473 const AbundanceGroup& curr_abundance = filtered_primary_trans_groups[i][k];
1474 const AbundanceGroup& prev_abundance = filtered_primary_trans_groups[j][k];
1475 const string& desc = curr_abundance.description();
1476
1477 bool enough_reads = (curr_abundance.FPKM_by_replicate().size() >= min_reps_for_js_test &&
1478 prev_abundance.FPKM_by_replicate().size() >= min_reps_for_js_test);
1479
1480 SampleDifference test;
1481 test = get_ds_tests(prev_abundance,
1482 curr_abundance,
1483 enough_reads);
1484
1485 // The filtered group might be empty, so let's grab metadata from
1486 // the unfiltered group
1487 boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
1488
1489 meta_data->gene_ids = samples[i]->primary_transcripts[k].gene_id();
1490 meta_data->gene_names = samples[i]->primary_transcripts[k].gene_name();
1491 meta_data->protein_ids = samples[i]->primary_transcripts[k].protein_id();
1492 meta_data->locus_desc = samples[i]->primary_transcripts[k].locus_tag();
1493 meta_data->description = samples[i]->primary_transcripts[k].description();
1494
1495 test.meta_data = meta_data;
1496
1497 #if ENABLE_THREADS
1498 test_storage_lock.lock();
1499 #endif
1500 pair<SampleDiffs::iterator, bool> inserted;
1501 inserted = tests.diff_splicing_tests[i][j].insert(make_pair(desc,test));
1502 inserted.first->second = test;
1503
1504 #if ENABLE_THREADS
1505 test_storage_lock.unlock();
1506 #endif
1507 }
1508
1509 }
1510
1511 for (size_t i = 0; i < samples.size(); ++i)
1512 {
1513 clear_samples_from_tracking_table(samples[i], tracking);
1514 }
1515 }
1516
print_checked_params_table(const vector<boost::shared_ptr<ReadGroupProperties>> & all_read_groups)1517 void print_checked_params_table(const vector<boost::shared_ptr<ReadGroupProperties> >& all_read_groups)
1518 {
1519 string param_check_error_out_filename = output_dir + "/checked_params.txt";
1520 FILE* param_check_error_out_file = fopen(param_check_error_out_filename.c_str(), "w");
1521
1522 fprintf(param_check_error_out_file, "file\tdefault frag length mean\tdefault frag length std dev\tbias correction\tbias mode\tmultiread correction\tmax-mle-iterations\tmin-mle-accuracy\tmax-bundle-frags\tmax-frag-multihits\tno-effective-length-correction\tno-length-correction\n");
1523 for (size_t i = 0; i < all_read_groups.size(); ++i)
1524 {
1525 const CheckedParameters& cp_i = all_read_groups[i]->checked_parameters();
1526 fprintf(param_check_error_out_file, "%s\t", all_read_groups[i]->file_path().c_str());
1527 fprintf(param_check_error_out_file, "%lg\t", cp_i.frag_len_mean);
1528 fprintf(param_check_error_out_file, "%lg\t", cp_i.frag_len_std_dev);
1529 fprintf(param_check_error_out_file, "%s\t", cp_i.corr_bias ? "yes" : "no");
1530 fprintf(param_check_error_out_file, "%d\t", cp_i.frag_bias_mode);
1531 fprintf(param_check_error_out_file, "%s\t", cp_i.corr_multireads ? "yes" : "no");
1532 fprintf(param_check_error_out_file, "%lg\t", cp_i.max_mle_iterations);
1533 fprintf(param_check_error_out_file, "%lg\t", cp_i.min_mle_accuracy);
1534 fprintf(param_check_error_out_file, "%lg\t", cp_i.max_bundle_frags);
1535 fprintf(param_check_error_out_file, "%lg\t", cp_i.max_frags_multihits);
1536 fprintf(param_check_error_out_file, "%s\t", cp_i.no_effective_length_correction ? "yes" : "no");
1537 fprintf(param_check_error_out_file, "%s\t", cp_i.no_length_correction? "yes" : "no");
1538 fprintf(param_check_error_out_file, "\n");
1539 }
1540 fclose(param_check_error_out_file);
1541
1542 }
1543
validate_cross_sample_parameters(const vector<boost::shared_ptr<ReadGroupProperties>> & all_read_groups)1544 void validate_cross_sample_parameters(const vector<boost::shared_ptr<ReadGroupProperties> >& all_read_groups)
1545 {
1546 bool dump_params = false;
1547 for (size_t i = 1; i < all_read_groups.size(); ++i)
1548 {
1549 const CheckedParameters& cp_i = all_read_groups[i - 1]->checked_parameters();
1550 const CheckedParameters& cp_j = all_read_groups[i]->checked_parameters();
1551
1552 if (cp_i.ref_gtf_crc != cp_j.ref_gtf_crc)
1553 {
1554 fprintf(stderr, "Error reference gene annotation differs between samples!\n");
1555 fprintf(stderr, "\t%s\t%s:\t%u!\n", all_read_groups[i - 1]->file_path().c_str(), cp_i.ref_gtf_file_path.c_str(), cp_i.ref_gtf_crc);
1556 fprintf(stderr, "\t%s\t%s:\t%u!\n", all_read_groups[i]->file_path().c_str(), cp_j.ref_gtf_file_path.c_str(), cp_j.ref_gtf_crc);
1557 exit(1);
1558 }
1559
1560 if (cp_i != cp_j)
1561 {
1562 dump_params = true;
1563 fprintf(stderr, "Warning: quantification parameters differ between CXB files!\n\tSee %s/checked_params.txt for more info\n", output_dir.c_str());
1564 break;
1565 // fprintf(stderr, "CXB files:\n");
1566 // fprintf(stderr, "%s:\n", all_read_groups[i - 1]->file_path().c_str());
1567 // fprintf(stderr, "\tdefault-frag-length-mean:\t%lg\n", cp_i.frag_len_mean);
1568 // fprintf(stderr, "\tdefault-frag-length-std-dev:\t%lg\n", cp_i.frag_len_std_dev);
1569 // fprintf(stderr, "\tbias correction:\t%s\n", cp_i.corr_bias ? "yes" : "no");
1570 // fprintf(stderr, "\tbias mode:\t%d\n", cp_i.frag_bias_mode);
1571 // fprintf(stderr, "\tmultiread correction:\t%s\n", cp_i.corr_multireads ? "yes" : "no");
1572 // fprintf(stderr, "\tmax-mle-iterations:\t%lg\n", cp_i.max_mle_iterations);
1573 // fprintf(stderr, "\tmin-mle-accuracy:\t%lg\n", cp_i.min_mle_accuracy);
1574 // fprintf(stderr, "\tmax-bundle-frags:\t%lg\n", cp_i.max_bundle_frags);
1575 // fprintf(stderr, "\tmax-frag-multihits:\t%lg\n", cp_i.max_frags_multihits);
1576 // fprintf(stderr, "\tno-effective-length-correction:\t%s\n", cp_i.no_effective_length_correction ? "yes" : "no");
1577 // fprintf(stderr, "\tno-length-correction:\t%s\n", cp_i.no_length_correction? "yes" : "no");
1578 //
1579 // fprintf(stderr, "%s\n", all_read_groups[i]->file_path().c_str());
1580 // fprintf(stderr, "\tdefault-frag-length-mean:\t%lg\n", cp_j.frag_len_mean);
1581 // fprintf(stderr, "\tdefault-frag-length-std-dev:\t%lg\n", cp_j.frag_len_std_dev);
1582 // // TODO: add CRCs for reference GTF, mask file, norm standards file if using.
1583 // fprintf(stderr, "\tbias correction:\t%s\n", cp_j.corr_bias ? "yes" : "no");
1584 // fprintf(stderr, "\tbias mode:\t%d\n", cp_j.frag_bias_mode);
1585 // fprintf(stderr, "\tmultiread correction:\t%s\n", cp_j.corr_multireads ? "yes" : "no");
1586 // fprintf(stderr, "\tmax-mle-iterations:\t%lg\n", cp_j.max_mle_iterations);
1587 // fprintf(stderr, "\tmin-mle-accuracy:\t%lg\n", cp_j.min_mle_accuracy);
1588 // fprintf(stderr, "\tmax-bundle-frags:\t%lg\n", cp_j.max_bundle_frags);
1589 // fprintf(stderr, "\tmax-frag-multihits:\t%lg\n", cp_j.max_frags_multihits);
1590 // fprintf(stderr, "\tno-effective-length-correction:\t%s\n", cp_j.no_effective_length_correction ? "yes" : "no");
1591 // fprintf(stderr, "\tno-length-correction:\t%s\n", cp_j.no_length_correction? "yes" : "no");
1592
1593 }
1594
1595 }
1596 if (dump_params)
1597 print_checked_params_table(all_read_groups);
1598 }
1599
perform_testing(vector<boost::shared_ptr<SampleAbundances>> abundances)1600 void TrackingDataWriter::perform_testing(vector<boost::shared_ptr<SampleAbundances> > abundances)
1601 {
1602 #if ENABLE_THREADS
1603 _launcher_lock.lock();
1604 test_storage_lock.lock();
1605 #endif
1606 if (headers_written == false)
1607 {
1608 write_header_output();
1609 headers_written = true;
1610 }
1611
1612 write_output(abundances);
1613 //test_differential(abundances.front()->locus_tag, abundances, _contrasts, *_tests, *_tracking);
1614 #if ENABLE_THREADS
1615 test_storage_lock.unlock();
1616 _launcher_lock.unlock();
1617 #endif
1618
1619 }
1620
1621