1 /*
2  *  differential.cpp
3  *  cufflinks
4  *
5  *  Created by Cole Trapnell on 3/15/10.
6  *  Copyright 2010 Cole Trapnell. All rights reserved.
7  *
8  */
9 
10 #include <algorithm>
11 #include <functional>
12 #include <numeric>
13 #include <boost/numeric/ublas/vector.hpp>
14 #include <boost/numeric/ublas/matrix.hpp>
15 #include <boost/numeric/ublas/matrix_proxy.hpp>
16 #include <boost/math/special_functions/gamma.hpp>
17 #include <boost/math/distributions/chi_squared.hpp>
18 #include <boost/math/distributions/fisher_f.hpp>
19 
20 #include "abundances.h"
21 #include "differential.h"
22 #include "clustering.h"
23 #include "differential.h"
24 #include "sampling.h"
25 
26 using namespace std;
27 
28 double min_read_count = 10;
29 
30 #if ENABLE_THREADS
31 boost::mutex _launcher_lock;
32 boost::mutex locus_thread_pool_lock;
33 int locus_curr_threads = 0;
34 int locus_num_threads = 0;
35 
decr_pool_count()36 void decr_pool_count()
37 {
38 	locus_thread_pool_lock.lock();
39 	locus_curr_threads--;
40 	locus_thread_pool_lock.unlock();
41 }
42 #endif
43 
SampleDifference()44 SampleDifference::SampleDifference():
45     sample_1(-1),
46     sample_2(-1),
47     value_1(0.0),
48     value_2(0.0),
49     test_stat(0.0),
50     p_value(1.0),
51     corrected_p(1.0),
52     test_status(NOTEST),
53     significant(false) {}
54 
55 
find_locus(const string & locus_id)56 TestLauncher::launcher_sample_table::iterator TestLauncher::find_locus(const string& locus_id)
57 {
58     launcher_sample_table::iterator itr = _samples.begin();
59     for(; itr != _samples.end(); ++itr)
60     {
61         if (itr->first == locus_id)
62             return itr;
63     }
64     return _samples.end();
65 }
66 
register_locus(const string & locus_id)67 void TestLauncher::register_locus(const string& locus_id)
68 {
69 #if ENABLE_THREADS
70 	boost::mutex::scoped_lock lock(_launcher_lock);
71 #endif
72 
73     launcher_sample_table::iterator itr = find_locus(locus_id);
74     if (itr == _samples.end())
75     {
76         pair<launcher_sample_table::iterator, bool> p;
77         vector<boost::shared_ptr<SampleAbundances> >abs(_orig_workers);
78         _samples.push_back(make_pair(locus_id, abs));
79     }
80 }
81 
abundance_avail(const string & locus_id,boost::shared_ptr<SampleAbundances> ab,size_t factory_id)82 void TestLauncher::abundance_avail(const string& locus_id,
83                                    boost::shared_ptr<SampleAbundances> ab,
84                                    size_t factory_id)
85 {
86 #if ENABLE_THREADS
87 	boost::mutex::scoped_lock lock(_launcher_lock);
88 #endif
89     launcher_sample_table::iterator itr = find_locus(locus_id);
90     if (itr == _samples.end())
91     {
92         assert(false);
93     }
94     itr->second[factory_id] = ab;
95     //itr->second(factory_id] = ab;
96 }
97 
98 // Note: this routine should be called under lock - it doesn't
99 // acquire the lock itself.
all_samples_reported_in(vector<boost::shared_ptr<SampleAbundances>> & abundances)100 bool TestLauncher::all_samples_reported_in(vector<boost::shared_ptr<SampleAbundances> >& abundances)
101 {
102     BOOST_FOREACH (boost::shared_ptr<SampleAbundances> ab, abundances)
103     {
104         if (!ab)
105         {
106             return false;
107         }
108     }
109     return true;
110 }
111 
112 #if ENABLE_THREADS
113 boost::mutex test_storage_lock; // don't modify the above struct without locking here
114 #endif
115 
116 // Note: this routine should be called under lock - it doesn't
117 // acquire the lock itself.
perform_testing(vector<boost::shared_ptr<SampleAbundances>> abundances)118 void TestLauncher::perform_testing(vector<boost::shared_ptr<SampleAbundances> > abundances)
119 {
120 //#if ENABLE_THREADS
121 //	_launcher_lock.lock();
122 //#endif
123     assert (abundances.size() == _orig_workers);
124 
125     // Just verify that all the loci from each factory match up.
126     for (size_t i = 1; i < abundances.size(); ++i)
127     {
128         const SampleAbundances& curr = *(abundances[i]);
129         const SampleAbundances& prev = *(abundances[i-1]);
130 
131         assert (curr.locus_tag == prev.locus_tag);
132 
133         const AbundanceGroup& s1 = curr.transcripts;
134         const AbundanceGroup& s2 =  prev.transcripts;
135 
136         assert (s1.abundances().size() == s2.abundances().size());
137 
138         for (size_t j = 0; j < s1.abundances().size(); ++j)
139         {
140             assert (s1.abundances()[j]->description() == s2.abundances()[j]->description());
141         }
142     }
143 
144     test_differential(abundances.front()->locus_tag, abundances, _contrasts, *_tests, *_tracking);
145 //#if ENABLE_THREADS
146 //	_launcher_lock.unlock();
147 //#endif
148 }
149 
record_tracking_data(vector<boost::shared_ptr<SampleAbundances>> & abundances)150 void TestLauncher::record_tracking_data(vector<boost::shared_ptr<SampleAbundances> >& abundances)
151 {
152     assert (abundances.size() == _orig_workers);
153 
154     // Just verify that all the loci from each factory match up.
155     for (size_t i = 1; i < abundances.size(); ++i)
156     {
157         const SampleAbundances& curr = *(abundances[i]);
158         const SampleAbundances& prev = *(abundances[i-1]);
159 
160         assert (curr.locus_tag == prev.locus_tag);
161 
162         const AbundanceGroup& s1 = curr.transcripts;
163         const AbundanceGroup& s2 =  prev.transcripts;
164 
165         assert (s1.abundances().size() == s2.abundances().size());
166 
167         for (size_t j = 0; j < s1.abundances().size(); ++j)
168         {
169             assert (s1.abundances()[j]->description() == s2.abundances()[j]->description());
170         }
171     }
172 
173 #if ENABLE_THREADS
174 	test_storage_lock.lock();
175 #endif
176 
177     // Add all the transcripts, CDS groups, TSS groups, and genes to their
178     // respective FPKM tracking table.  Whether this is a time series or an
179     // all pairs comparison, we should be calculating and reporting FPKMs for
180     // all objects in all samples
181 	for (size_t i = 0; i < abundances.size(); ++i)
182 	{
183 		const AbundanceGroup& ab_group = abundances[i]->transcripts;
184         //fprintf(stderr, "[%d] count = %lg\n",i,  ab_group.num_fragments());
185 		BOOST_FOREACH (boost::shared_ptr<Abundance> ab, ab_group.abundances())
186 		{
187 			add_to_tracking_table(i, *ab, _tracking->isoform_fpkm_tracking);
188             //assert (_tracking->isoform_fpkm_tracking.num_fragments_by_replicate().empty() == false);
189 		}
190 
191 		BOOST_FOREACH (AbundanceGroup& ab, abundances[i]->cds)
192 		{
193 			add_to_tracking_table(i, ab, _tracking->cds_fpkm_tracking);
194 		}
195 
196 		BOOST_FOREACH (AbundanceGroup& ab, abundances[i]->primary_transcripts)
197 		{
198 			add_to_tracking_table(i, ab, _tracking->tss_group_fpkm_tracking);
199 		}
200 
201 		BOOST_FOREACH (AbundanceGroup& ab, abundances[i]->genes)
202 		{
203 			add_to_tracking_table(i, ab, _tracking->gene_fpkm_tracking);
204 		}
205 	}
206 
207 #if ENABLE_THREADS
208     test_storage_lock.unlock();
209 #endif
210 
211 }
212 
test_finished_loci()213 vector<vector<boost::shared_ptr<SampleAbundances> > > TestLauncher::test_finished_loci()
214 {
215 #if ENABLE_THREADS
216 	_launcher_lock.lock();
217 #endif
218 
219     vector<vector<boost::shared_ptr<SampleAbundances> > > samples_for_testing;
220 
221     launcher_sample_table::iterator itr = _samples.begin();
222     while(itr != _samples.end())
223     {
224         if (all_samples_reported_in(itr->second))
225         {
226             if (_p_bar)
227             {
228                 //verbose_msg("Estimating expression in locus [%s]\n", itr->second.front()->locus_tag.c_str());
229                 _p_bar->update(itr->second.front()->locus_tag.c_str(), 1);
230             }
231             record_tracking_data(itr->second);
232             if (!_track_only)
233                 samples_for_testing.push_back(itr->second);
234 
235             // Removes the samples that have already been tested and transferred to the tracking tables,
236             itr = _samples.erase(itr);
237         }
238         else
239         {
240 
241             ++itr;
242         }
243     }
244 
245 #if ENABLE_THREADS
246 	_launcher_lock.unlock();
247 #endif
248     return samples_for_testing;
249 //    for (size_t i = 0; i < samples_for_testing.size(); ++i)
250 //    {
251 //        vector<boost::shared_ptr<SampleAbundances> > samples_i = samples_for_testing[i];
252 //        perform_testing(samples_i);
253 //    }
254 }
255 
256 //// Sampling-based test:
test_diffexp(const FPKMContext & curr,const FPKMContext & prev)257 SampleDifference test_diffexp(const FPKMContext& curr,
258                               const FPKMContext& prev)
259 {
260 	bool performed_test = false;
261 
262     SampleDifference test = SampleDifference();
263 
264     test.p_value = 1.0;
265     test.differential = 0.0;
266     test.test_stat = 0.0;
267     test.test_status = NOTEST;
268     test.value_1 = 0;
269     test.value_2 = 0;
270     test.significant = 0;
271     test.corrected_p = 1.0;
272 
273     double p_value = 1.0;
274 
275     double differential = 0.0;
276 
277     // static const long double min_gamma_params = 1e-20;
278 
279     vector<double> null_log_ratio_samples;
280 
281     static const size_t num_null_ratio_samples = 10000;
282 
283     boost::random::mt19937 rng(random_seed);
284 
285     if ((curr.FPKM != 0 || prev.FPKM != 0) && (prev.fpkm_samples.size() > 0 && curr.fpkm_samples.size() > 0))
286     {
287         boost::random::uniform_int_distribution<> prev_sampler(0, prev.fpkm_samples.size()-1);
288         boost::random::uniform_int_distribution<> curr_sampler(0, curr.fpkm_samples.size()-1);
289 
290         vector<double> prev_rep_samples;
291         vector<double> curr_rep_samples;
292 
293 
294         for (size_t i = 0; i != curr.tracking_info_per_rep.size(); ++i)
295         {
296             if (curr.tracking_info_per_rep[i].status == NUMERIC_LOW_DATA)
297                 continue;
298             curr_rep_samples.push_back(curr.tracking_info_per_rep[i].fpkm);
299         }
300 
301         for (size_t i = 0; i != prev.tracking_info_per_rep.size(); ++i)
302         {
303             if (prev.tracking_info_per_rep[i].status == NUMERIC_LOW_DATA)
304                 continue;
305             prev_rep_samples.push_back(prev.tracking_info_per_rep[i].fpkm);
306         }
307 
308 
309         double curr_fpkm = accumulate(curr_rep_samples.begin(), curr_rep_samples.end(), 0.0);
310         if (curr_rep_samples.size() > 0)
311             curr_fpkm /= curr_rep_samples.size();
312 
313         double prev_fpkm = accumulate(prev_rep_samples.begin(), prev_rep_samples.end(), 0.0);
314         if (prev_rep_samples.size() > 0)
315             prev_fpkm /= prev_rep_samples.size();
316 
317 
318         if (curr_fpkm > 0.0 && prev_fpkm > 0.0)
319             differential = log2(curr_fpkm) - log2(prev_fpkm);
320         else if (curr_fpkm)
321             differential = numeric_limits<double>::infinity();
322         else if (prev_fpkm)
323             differential = -numeric_limits<double>::infinity();
324 
325         // set the asymptotic delta method test stat for backward compatibility
326         double curr_log_fpkm_var = (curr.FPKM_variance) / (curr.FPKM * curr.FPKM);
327         double prev_log_fpkm_var = (prev.FPKM_variance) / (prev.FPKM * prev.FPKM);
328         double numerator = log(curr.FPKM / prev.FPKM);
329         double denominator = sqrt(prev_log_fpkm_var + curr_log_fpkm_var);
330         if (denominator > 0.0)
331             test.test_stat = numerator / denominator;
332         else if (numerator > 0)
333             test.test_stat = numeric_limits<double>::infinity();
334         else if (numerator < 0)
335             test.test_stat = -numeric_limits<double>::infinity();
336         else
337             test.test_stat = 0;
338 
339 
340         test.test_stat = numerator / denominator;
341 
342 
343         // Draw from prev fpkm_samples to make the first half of the null
344         for (size_t i = 0; i < num_null_ratio_samples; ++i)
345         {
346             double curr_set_sample = 0.0;
347             for (size_t k = 0; k < curr_rep_samples.size(); ++k)
348             {
349                 int next_sample_idx = prev_sampler(rng);
350                 if (next_sample_idx >= 0 && next_sample_idx < prev.fpkm_samples.size())
351                     curr_set_sample += prev.fpkm_samples[next_sample_idx] / (double)curr_rep_samples.size();
352             }
353 
354             double prev_set_sample = 0.0;
355             for (size_t k = 0; k < prev_rep_samples.size(); ++k)
356             {
357                 int next_sample_idx = prev_sampler(rng);
358                 if (next_sample_idx >= 0 && next_sample_idx < prev.fpkm_samples.size())
359                     prev_set_sample += prev.fpkm_samples[next_sample_idx] / (double)prev_rep_samples.size();
360             }
361 
362             double null_ratio_sample = 0.0;
363             if (curr_set_sample > 0.0 && prev_set_sample > 0.0)
364                 null_ratio_sample = log2(curr_set_sample) - log2(prev_set_sample);
365             else if (curr_set_sample > 0.0)
366                 null_ratio_sample = numeric_limits<double>::infinity();
367             else if (prev_set_sample)
368                 null_ratio_sample = -numeric_limits<double>::infinity();
369 
370             null_log_ratio_samples.push_back(null_ratio_sample);
371         }
372 
373         // Draw from curr's fpkm_samples to make the other half of the null
374         for (size_t i = 0; i < num_null_ratio_samples; ++i)
375         {
376             double curr_set_sample = 0.0;
377             for (size_t k = 0; k < curr_rep_samples.size(); ++k)
378             {
379                 int next_sample_idx = curr_sampler(rng);
380                 if (next_sample_idx >= 0 && next_sample_idx < curr.fpkm_samples.size())
381                     curr_set_sample += curr.fpkm_samples[next_sample_idx] / (double)curr_rep_samples.size();
382             }
383 
384             double prev_set_sample = 0.0;
385             for (size_t k = 0; k < prev_rep_samples.size(); ++k)
386             {
387                 int next_sample_idx = curr_sampler(rng);
388                 if (next_sample_idx >= 0 && next_sample_idx < curr.fpkm_samples.size())
389                     prev_set_sample += curr.fpkm_samples[next_sample_idx] / (double)prev_rep_samples.size();
390             }
391 
392             double null_ratio_sample = 0.0;
393             if (curr_set_sample > 0.0 && prev_set_sample > 0.0)
394                 null_ratio_sample = log2(curr_set_sample) - log2(prev_set_sample);
395             else if (curr_set_sample > 0.0)
396                 null_ratio_sample = numeric_limits<double>::infinity();
397             else if (prev_set_sample)
398                 null_ratio_sample = -numeric_limits<double>::infinity();
399 
400             null_log_ratio_samples.push_back(null_ratio_sample);
401         }
402 
403 
404         sort(null_log_ratio_samples.begin(), null_log_ratio_samples.end());
405 
406         double lower_tail_val;
407         double upper_tail_val;
408         if (differential > 0)
409         {
410             upper_tail_val = differential;
411             lower_tail_val = -differential;
412         }
413         else
414         {
415             upper_tail_val = -differential;
416             lower_tail_val = differential;
417         }
418 
419         vector<double>::iterator lower_tail_null_range_iter = upper_bound(null_log_ratio_samples.begin(), null_log_ratio_samples.end(), lower_tail_val);
420         vector<double>::iterator upper_tail_null_range_iter = upper_bound(null_log_ratio_samples.begin(), null_log_ratio_samples.end(), upper_tail_val);
421         size_t num_smaller_lower_tail = lower_tail_null_range_iter - null_log_ratio_samples.begin();
422         size_t num_smaller_upper_tail = upper_tail_null_range_iter - null_log_ratio_samples.begin();
423 
424         size_t num_samples_more_extreme = (null_log_ratio_samples.size() - num_smaller_upper_tail) + num_smaller_lower_tail;
425 
426         p_value = num_samples_more_extreme / (double)null_log_ratio_samples.size();
427         if (p_value == 0)
428             p_value = 1.0 / (double)null_log_ratio_samples.size();
429         if (p_value > 1)
430             p_value = 1.0;
431 
432 
433 
434         performed_test = true;
435 
436         //test = SampleDifference(sample1, sample2, prev.FPKM, curr.FPKM, stat, p_value, transcript_group_id);
437         test.p_value = p_value;
438         test.differential = differential;
439         //test.test_stat = stat;
440         test.value_1 = prev.FPKM;
441         test.value_2 = curr.FPKM;
442     }
443     else
444     {
445         test.p_value = 1.0;
446         test.test_stat = 0.0;
447         test.value_1 = prev.FPKM;
448         test.value_2 = curr.FPKM;
449         test.differential = 0;
450         performed_test = false;
451     }
452 
453 	test.test_status = performed_test ? OK : NOTEST;
454 	return test;
455 }
456 
457 SampleDiffMetaDataTable meta_data_table;
458 #if ENABLE_THREADS
459 boost::mutex meta_data_lock;
460 #endif
461 
get_metadata(const string description)462 boost::shared_ptr<SampleDifferenceMetaData> get_metadata(const string description)
463 {
464 #if ENABLE_THREADS
465     boost::mutex::scoped_lock lock(meta_data_lock);
466 #endif
467     pair<SampleDiffMetaDataTable::iterator, bool> p;
468     p = meta_data_table.insert(make_pair(description, new SampleDifferenceMetaData()));
469     return p.first->second;
470 }
471 
472 // This performs between-group tests on isoforms or TSS groupings in a single
473 // locus, on two different samples.
get_de_tests(const string & description,const FPKMContext & prev_abundance,const FPKMContext & curr_abundance,bool enough_reads)474 SampleDifference get_de_tests(const string& description,
475                  const FPKMContext& prev_abundance,
476 				 const FPKMContext& curr_abundance,
477 				 //SampleDiffs& de_tests,
478 				 bool enough_reads)
479 {
480 	int total_iso_de_tests = 0;
481 
482 	SampleDifference test;
483 
484     const FPKMContext& r1 = curr_abundance;
485     const FPKMContext& r2 = prev_abundance;
486 
487 	if (curr_abundance.status == NUMERIC_FAIL ||
488         prev_abundance.status == NUMERIC_FAIL ||
489         prev_abundance.status == NUMERIC_HI_DATA ||
490         curr_abundance.status == NUMERIC_HI_DATA)
491     {
492         test = test_diffexp(r1, r2);
493         test.test_stat = 0;
494 		test.p_value = 1.0;
495 		test.differential = 0.0;
496         if (curr_abundance.status == NUMERIC_FAIL ||
497             prev_abundance.status == NUMERIC_FAIL)
498         {
499             test.test_status = FAIL;
500         }
501         else if (prev_abundance.status == NUMERIC_HI_DATA ||
502                  curr_abundance.status == NUMERIC_HI_DATA)
503         {
504             test.test_status = HIDATA;
505         }
506     }
507     else if (curr_abundance.status == NUMERIC_LOW_DATA ||
508              prev_abundance.status == NUMERIC_LOW_DATA)
509     {
510         // perform the test, but mark it as not significant and don't add it to the
511         // pile. This way we don't penalize for multiple testing, but users can still
512         // see the fold change.
513 		test = test_diffexp(r1, r2);
514         test.test_stat = 0;
515         test.p_value = 1.0;
516         //test.differential = 0.0;
517 
518 		test.test_status = LOWDATA;
519     }
520     else // at least one is OK, the other might be LOW_DATA
521 	{
522         test = test_diffexp(r1, r2);
523         if (test.test_status == OK && enough_reads)
524         {
525             total_iso_de_tests++;
526         }
527         else
528         {
529             test.test_status = NOTEST;
530 			test.test_stat = 0;
531 			test.p_value = 1.0;
532 			//test.differential = 0.0;
533         }
534 //		if (test_diffexp(r1, r2, test))
535 //		{
536 //			total_iso_de_tests++;
537 //		}
538 //		else
539 //		{
540 //			test.test_stat = 0;
541 //			test.p_value = 1.0;
542 //			test.differential = 0.0;
543 //		}
544 //		if (enough_reads)
545 //			test.test_status = OK;
546 //		else
547 //			test.test_status = NOTEST;
548 
549 	}
550 
551 
552 	//inserted.first->second = test;
553 
554 	//return make_pair(total_iso_de_tests, inserted.first);
555     return test;
556 }
557 
test_js(const AbundanceGroup & prev_abundance,const AbundanceGroup & curr_abundance,double & js,double & p_val)558 bool test_js(const AbundanceGroup& prev_abundance,
559              const AbundanceGroup& curr_abundance,
560              double& js,
561              double& p_val)
562 {
563     vector<Eigen::VectorXd> sample_kappas;
564 //    Eigen::VectorXd curr_kappas(Eigen::VectorXd::Zero(curr_abundance.abundances().size()));
565 //    for (size_t i = 0; i < curr_abundance.abundances().size(); ++i)
566 //    {
567 //        curr_kappas(i) = curr_abundance.abundances()[i]->kappa();
568 //    }
569 //
570 //    Eigen::VectorXd prev_kappas(Eigen::VectorXd::Zero(prev_abundance.abundances().size()));
571 //    for (size_t i = 0; i < prev_abundance.abundances().size(); ++i)
572 //    {
573 //        prev_kappas(i) = prev_abundance.abundances()[i]->kappa();
574 //    }
575 
576 
577     ////////////
578 
579     Eigen::VectorXd prev_kappas = Eigen::VectorXd::Zero(prev_abundance.abundances().size());
580     for (size_t i = 0; i < prev_abundance.abundances().size(); ++i)
581     {
582         FPKMPerReplicateTable ab_i_by_rep = prev_abundance.abundances()[i]->FPKM_by_replicate();
583         for (FPKMPerReplicateTable::const_iterator itr = ab_i_by_rep.begin(); itr != ab_i_by_rep.end(); ++itr)
584         {
585             prev_kappas(i) += itr->second;
586         }
587         prev_kappas(i) /= ab_i_by_rep.size();
588     }
589 
590     if (prev_kappas.sum() > 0)
591         prev_kappas /= prev_kappas.sum();
592 
593     Eigen::VectorXd curr_kappas = Eigen::VectorXd::Zero(curr_abundance.abundances().size());
594     for (size_t i = 0; i < curr_abundance.abundances().size(); ++i)
595     {
596         FPKMPerReplicateTable ab_i_by_rep = curr_abundance.abundances()[i]->FPKM_by_replicate();
597         for (FPKMPerReplicateTable::const_iterator itr = ab_i_by_rep.begin(); itr != ab_i_by_rep.end(); ++itr)
598         {
599             curr_kappas(i) += itr->second;
600         }
601         curr_kappas(i) /= ab_i_by_rep.size();
602     }
603 
604     if (curr_kappas.sum() > 0)
605         curr_kappas /= curr_kappas.sum();
606 
607     ////////////
608 
609     sample_kappas.push_back(prev_kappas);
610     sample_kappas.push_back(curr_kappas);
611 
612     js = jensen_shannon_distance(sample_kappas);
613 
614     if (isinf(js) || isnan(js))
615         return false;
616 
617     vector<double> null_js_samples;
618 
619     static const size_t num_null_js_samples = 10000;
620 
621     boost::random::mt19937 rng(random_seed);
622     boost::random::uniform_int_distribution<> prev_sampler(0, prev_abundance.member_fpkm_samples().size()-1);
623     boost::random::uniform_int_distribution<> curr_sampler(0, curr_abundance.member_fpkm_samples().size()-1);
624 
625     for (size_t i = 0; i < num_null_js_samples; ++i)
626     {
627         // Draw k values, from prev's fpkm_samples to make the first half of the null, where k is the number of replicates in *curr*, and compute their average
628         Eigen::VectorXd curr_set_sample = Eigen::VectorXd::Zero(curr_abundance.abundances().size());
629         for (size_t k = 0; k < curr_abundance.rg_props().size(); ++k)
630         {
631             int next_sample_idx = prev_sampler(rng);
632             if (next_sample_idx >= 0 && next_sample_idx < prev_abundance.member_fpkm_samples().size())
633                 curr_set_sample += prev_abundance.member_fpkm_samples()[next_sample_idx] / (double)curr_abundance.rg_props().size();
634         }
635         double total_curr_set_sample_fpkm = curr_set_sample.sum();
636         if (total_curr_set_sample_fpkm > 0.0)
637             curr_set_sample /= total_curr_set_sample_fpkm;
638 
639         // Draw k values, from prev's fpkm_samples to make the first half of the null, where k is the number of replicates in *prev*, and compute their average
640         Eigen::VectorXd prev_set_sample = Eigen::VectorXd::Zero(prev_abundance.abundances().size());
641         for (size_t k = 0; k < prev_abundance.rg_props().size(); ++k)
642         {
643             int next_sample_idx = prev_sampler(rng);
644             if (next_sample_idx >= 0 && next_sample_idx < prev_abundance.member_fpkm_samples().size())
645                 prev_set_sample += prev_abundance.member_fpkm_samples()[next_sample_idx] / (double)prev_abundance.rg_props().size();
646         }
647         double total_prev_set_sample_fpkm = prev_set_sample.sum();
648         if (total_prev_set_sample_fpkm > 0.0)
649             prev_set_sample /= total_prev_set_sample_fpkm;
650 
651         vector<Eigen::VectorXd> sample_kappas;
652 
653         sample_kappas.push_back(curr_set_sample);
654         sample_kappas.push_back(prev_set_sample);
655 
656         double js_sample = jensen_shannon_distance(sample_kappas);
657 
658         //cerr << curr_set_sample.transpose() << " vs. " << prev_set_sample.transpose() << " : " << js_sample << endl;
659 
660         null_js_samples.push_back(js_sample);
661     }
662 
663     for (size_t i = 0; i < num_null_js_samples; ++i)
664     {
665         // Draw k values, from curr's fpkm_samples to make the first half of the null, where k is the number of replicates in *curr*, and compute their average
666         Eigen::VectorXd curr_set_sample = Eigen::VectorXd::Zero(curr_abundance.abundances().size());
667         for (size_t k = 0; k < curr_abundance.rg_props().size(); ++k)
668         {
669             int next_sample_idx = curr_sampler(rng);
670             if (next_sample_idx >= 0 && next_sample_idx < curr_abundance.member_fpkm_samples().size())
671                 curr_set_sample += curr_abundance.member_fpkm_samples()[next_sample_idx] / (double)curr_abundance.rg_props().size();
672         }
673         double total_curr_set_sample_fpkm = curr_set_sample.sum();
674         if (total_curr_set_sample_fpkm > 0.0)
675             curr_set_sample /= total_curr_set_sample_fpkm;
676 
677         // Draw k values, from curr's fpkm_samples to make the first half of the null, where k is the number of replicates in *prev*, and compute their average
678         Eigen::VectorXd prev_set_sample = Eigen::VectorXd::Zero(curr_abundance.abundances().size());
679         for (size_t k = 0; k < prev_abundance.rg_props().size(); ++k)
680         {
681             int next_sample_idx = curr_sampler(rng);
682             if (next_sample_idx >= 0 && next_sample_idx < curr_abundance.member_fpkm_samples().size())
683                 prev_set_sample += curr_abundance.member_fpkm_samples()[next_sample_idx] / (double)prev_abundance.rg_props().size();
684         }
685         double total_prev_set_sample_fpkm = prev_set_sample.sum();
686         if (total_prev_set_sample_fpkm > 0.0)
687             prev_set_sample /= total_prev_set_sample_fpkm;
688 
689         vector<Eigen::VectorXd> sample_kappas;
690 
691         sample_kappas.push_back(curr_set_sample);
692         sample_kappas.push_back(prev_set_sample);
693 
694         double js_sample = jensen_shannon_distance(sample_kappas);
695 
696         null_js_samples.push_back(js_sample);
697     }
698 
699     sort(null_js_samples.begin(), null_js_samples.end());
700 
701     double upper_tail_val = js;
702 
703     vector<double>::iterator upper_tail_null_range_iter = upper_bound(null_js_samples.begin(), null_js_samples.end(), upper_tail_val);
704 
705     size_t num_smaller_upper_tail = upper_tail_null_range_iter - null_js_samples.begin();
706 
707     size_t num_samples_more_extreme = (null_js_samples.size() - num_smaller_upper_tail);
708 
709     p_val = num_samples_more_extreme / (double)null_js_samples.size();
710     if (p_val == 0)
711         p_val = 1.0 / (double)null_js_samples.size();
712     if (p_val > 1)
713         p_val = 1.0;
714 
715     return true;
716 }
717 
718 
719 // This performs within-group tests on a set of isoforms or a set of TSS groups.
720 // This is a way of looking for meaningful differential splicing or differential
721 // promoter use.
get_ds_tests(const AbundanceGroup & prev_abundance,const AbundanceGroup & curr_abundance,bool enough_reads)722 SampleDifference get_ds_tests(const AbundanceGroup& prev_abundance,
723                               const AbundanceGroup& curr_abundance,
724 //                              SampleDiffs& diff_tests,
725                               bool enough_reads)
726 {
727 	SampleDifference test;
728 
729 	test.p_value = 1.0;
730     test.differential = 0.0;
731     test.test_stat = 0.0;
732     test.test_status = NOTEST;
733     test.value_1 = 0;
734     test.value_2 = 0;
735     test.significant = 0;
736     test.corrected_p = 1.0;
737 
738 
739 	AbundanceStatus prev_status = curr_abundance.status();
740 	AbundanceStatus curr_status = prev_abundance.status();
741 
742     if (prev_abundance.abundances().size() == 1 ||
743         prev_abundance.num_fragments() == 0 ||
744         curr_abundance.num_fragments() == 0 ||
745         prev_abundance.member_fpkm_samples().size() == 0 ||
746         curr_abundance.member_fpkm_samples().size() == 0)
747     {
748         test.p_value = 1;
749         test.value_1 = 0;
750         test.value_2 = 0;
751         test.differential = 0;
752         test.test_status = NOTEST;
753     }
754 	else if (prev_abundance.abundances().size() > 1 &&
755         /*prev_abundance.has_member_with_status(NUMERIC_LOW_DATA) == false &&
756         filtered_curr.has_member_with_status(NUMERIC_LOW_DATA) == false &&*/
757         prev_status == NUMERIC_OK && prev_abundance.num_fragments() > 0 &&
758 		curr_status == NUMERIC_OK && curr_abundance.num_fragments() > 0)
759 	{
760 		vector<ublas::vector<double> > sample_kappas;
761 		ublas::vector<double> curr_kappas(curr_abundance.abundances().size());
762 
763         for (size_t i = 0; i < curr_abundance.abundances().size(); ++i)
764 		{
765 			curr_kappas(i) = curr_abundance.abundances()[i]->kappa();
766 		}
767 
768 		ublas::vector<double> prev_kappas(prev_abundance.abundances().size());
769         for (size_t i = 0; i < prev_abundance.abundances().size(); ++i)
770 		{
771 			prev_kappas(i) = prev_abundance.abundances()[i]->kappa();
772 		}
773 
774 		sample_kappas.push_back(prev_kappas);
775 		sample_kappas.push_back(curr_kappas);
776 
777         double js = 0.0;
778         double p_val = 1.0;
779 
780         bool success;
781         success = test_js(prev_abundance, curr_abundance, js, p_val);
782 
783 		if (js == 0.0 || success == false)
784 		{
785 			test.test_stat = 0;
786 			test.p_value = 1.0;
787 			test.value_1 = 0;
788 			test.value_2 = 0;
789 			test.differential = 0;
790 			test.test_status = NOTEST;
791 		}
792 		else
793 		{
794             //test.test_stat = 0;
795 			test.p_value = p_val;
796 			test.value_1 = 0;
797 			test.value_2 = 0;
798 			test.differential = js;
799 			test.test_status = enough_reads ? OK : NOTEST;
800         }
801 	}
802 	else // we won't even bother with the JS-based testing in LOWDATA cases.
803 	{
804         if (prev_status == NUMERIC_OK && curr_status == NUMERIC_OK &&
805             prev_abundance.has_member_with_status(NUMERIC_LOW_DATA) == false &&
806             curr_abundance.has_member_with_status(NUMERIC_LOW_DATA) == false)
807             test.test_status = NOTEST;
808         else if (prev_status == NUMERIC_FAIL || curr_status == NUMERIC_FAIL)
809             test.test_status = FAIL;
810         else
811             test.test_status = LOWDATA;
812 
813 		test.test_stat = 0;
814 		test.p_value = 0.0;
815 		test.differential = 0.0;
816 	}
817 
818     return test;
819 }
820 
make_ref_tag(const string & ref,char classcode)821 string make_ref_tag(const string& ref, char classcode)
822 {
823 	char tag_buf[1024];
824 
825 	sprintf(tag_buf,
826 			"%s(%c)",
827 			ref.c_str(),
828 			classcode);
829 
830 	return string(tag_buf);
831 }
832 
bundle_locus_tag(const RefSequenceTable & rt,const HitBundle & bundle)833 string bundle_locus_tag(const RefSequenceTable& rt,
834 						const HitBundle& bundle)
835 {
836 	char locus_buf[1024];
837 	RefID bundle_chr_id = bundle.ref_id();
838 	assert (bundle_chr_id != 0);
839 	const char* chr_name = rt.get_name(bundle_chr_id);
840 
841 	sprintf(locus_buf,
842 			"%s:%d-%d",
843 			chr_name,
844 			bundle.left(),
845 			bundle.right());
846 
847 	return string(locus_buf);
848 }
849 
850 struct LocusVarianceInfo
851 {
852     int factory_id;
853     double count_mean;
854     double count_empir_var;
855     double locus_count_fitted_var;
856     double isoform_fitted_var_sum;
857     double cross_replicate_js;
858     int num_transcripts;
859     double bayes_gamma_trace;
860     double empir_gamma_trace;
861     vector<double> gamma;
862     vector<double> gamma_var;
863     vector<double> gamma_bootstrap_var;
864     vector<string> transcript_ids;
865     vector<double> count_sharing;
866     double locus_salient_frags;
867     double locus_total_frags;
868 
869 };
870 
871 #if ENABLE_THREADS
872 boost::mutex variance_info_lock; // don't modify the above struct without locking here
873 #endif
874 
875 vector<LocusVarianceInfo> locus_variance_info_table;
876 
877 // We'll use this tracking table to collect per replicate counts for each
878 // transcript, so we can re-fit the variance model.
879 FPKMTrackingTable transcript_count_tracking;
880 
881 
sample_worker(bool non_empty,boost::shared_ptr<HitBundle> bundle,const RefSequenceTable & rt,ReplicatedBundleFactory & sample_factory,boost::shared_ptr<SampleAbundances> abundance,size_t factory_id,boost::shared_ptr<TestLauncher> launcher,bool calculate_variance)882 void sample_worker(bool non_empty,
883                         boost::shared_ptr<HitBundle> bundle,
884                         const RefSequenceTable& rt,
885                         ReplicatedBundleFactory& sample_factory,
886                         boost::shared_ptr<SampleAbundances> abundance,
887                         size_t factory_id,
888                         boost::shared_ptr<TestLauncher> launcher,
889                         bool calculate_variance)
890 {
891 #if ENABLE_THREADS
892 	boost::this_thread::at_thread_exit(decr_pool_count);
893 #endif
894 
895     char bundle_label_buf[2048];
896     sprintf(bundle_label_buf,
897             "%s:%d-%d",
898             rt.get_name(bundle->ref_id()),
899             bundle->left(),
900             bundle->right());
901     string locus_tag = bundle_label_buf;
902 
903     if (!non_empty || (bias_run && bundle->ref_scaffolds().size() != 1)) // Only learn on single isoforms
904     {
905 #if !ENABLE_THREADS
906         // If Cuffdiff was built without threads, we need to manually invoke
907         // the testing functor, which will check to see if all the workers
908         // are done, and if so, perform the cross sample testing.
909         launcher->abundance_avail(locus_tag, abundance, factory_id);
910         vector<vector<boost::shared_ptr<SampleAbundances> > > to_be_tested = launcher->test_finished_loci();
911         launcher->perform_testing(to_be_tested);
912         //launcher();
913 #endif
914     	return;
915     }
916 
917     abundance->cluster_mass = bundle->mass();
918 
919     launcher->register_locus(locus_tag);
920 
921     abundance->locus_tag = locus_tag;
922 
923     bool perform_cds_analysis = false;
924     bool perform_tss_analysis = false;
925 
926     BOOST_FOREACH(boost::shared_ptr<Scaffold> s, bundle->ref_scaffolds())
927     {
928         if (s->annotated_tss_id() != "")
929         {
930             perform_tss_analysis = final_est_run;
931         }
932         if (s->annotated_protein_id() != "")
933         {
934             perform_cds_analysis = final_est_run;
935         }
936     }
937 
938     std::vector<boost::shared_ptr<BundleFactory> > factories = sample_factory.factories();
939     vector<boost::shared_ptr<PrecomputedExpressionBundleFactory> > hit_factories;
940     for (size_t i = 0; i < factories.size(); ++i)
941     {
942         boost::shared_ptr<BundleFactory> pFac = factories[i];
943         boost::shared_ptr<PrecomputedExpressionBundleFactory> pBundleFac = boost::dynamic_pointer_cast<PrecomputedExpressionBundleFactory> (pFac);
944         if (pBundleFac)
945         {
946             // If we get here, this factory refers to a pre computed expression object.
947 
948             hit_factories.push_back(pBundleFac);
949         }
950     }
951 
952     if (hit_factories.size() == factories.size())
953     {
954         merge_precomputed_expression_worker(boost::cref(locus_tag),
955                                             hit_factories,
956                                             boost::ref(*abundance),
957                                             bundle,
958                                             perform_cds_analysis,
959                                             perform_tss_analysis,
960                                             calculate_variance);
961     }
962     else if (hit_factories.empty())
963     {
964         set<boost::shared_ptr<ReadGroupProperties const> > rg_props;
965         for (size_t i = 0; i < sample_factory.factories().size(); ++i)
966         {
967             boost::shared_ptr<BundleFactory> bf = sample_factory.factories()[i];
968             rg_props.insert(bf->read_group_properties());
969         }
970 
971         sample_abundance_worker(boost::cref(locus_tag),
972                                 boost::cref(rg_props),
973                                 boost::ref(*abundance),
974                                 bundle,
975                                 perform_cds_analysis,
976                                 perform_tss_analysis,
977                                 calculate_variance);
978     }
979     else
980     {
981         fprintf (stderr, "Error: mixing pre-computed input with SAM/BAM files not yet supported!\n");
982         exit(1);
983     }
984     ///////////////////////////////////////////////
985 
986 
987     BOOST_FOREACH(boost::shared_ptr<Scaffold> ref_scaff,  bundle->ref_scaffolds())
988     {
989         ref_scaff->clear_hits();
990     }
991 
992     launcher->abundance_avail(locus_tag, abundance, factory_id);
993     vector<vector<boost::shared_ptr<SampleAbundances> > > to_be_tested = launcher->test_finished_loci();
994 
995     for (size_t i = 0; i < to_be_tested.size(); ++i)
996         launcher->perform_testing(to_be_tested[i]);
997 
998 #if !ENABLE_THREADS
999     // If Cuffdiff was built without threads, we need to manually invoke
1000     // the testing functor, which will check to see if all the workers
1001     // are done, and if so, perform the cross sample testing.
1002     //launcher->test_finished_loci();
1003 #endif
1004 
1005 }
1006 
dump_locus_variance_info(const string & filename)1007 void dump_locus_variance_info(const string& filename)
1008 {
1009 #if ENABLE_THREADS
1010     variance_info_lock.lock();
1011 #endif
1012 
1013     FILE* fdump = fopen(filename.c_str(), "w");
1014 
1015     fprintf(fdump,
1016             "condition\tdescription\tlocus_counts\tempir_var\tlocus_fit_var\tsum_iso_fit_var\tcross_replicate_js\tnum_transcripts\tbayes_gamma_trace\tempir_gamma_trace\tcount_mean\tgamma_var\tlocus_salient_frags\tlocus_total_frags\tcount_sharing\n");
1017     BOOST_FOREACH (LocusVarianceInfo& L, locus_variance_info_table)
1018     {
1019         for (size_t i = 0; i < L.gamma.size(); ++i)
1020         {
1021             fprintf(fdump, "%d\t%s\t%lg\t%lg\t%lg\t%lg\t%lg\t%d\t%lg\t%lg\t%lg\t%lg\t%lg\t%lg\t%lg\n", L.factory_id, L.transcript_ids[i].c_str(), L.count_mean, L.count_empir_var, L.locus_count_fitted_var, L.isoform_fitted_var_sum, L.cross_replicate_js, L.num_transcripts, L.bayes_gamma_trace, L.empir_gamma_trace,L.gamma[i],L.gamma_var[i], L.locus_salient_frags, L.locus_total_frags, L.count_sharing[i]);
1022         }
1023 
1024     }
1025 
1026 #if ENABLE_THREADS
1027     variance_info_lock.unlock();
1028 #endif
1029 }
1030 
filter_group_for_js_testing(vector<vector<AbundanceGroup>> & source_groups)1031 void filter_group_for_js_testing(vector<vector<AbundanceGroup> >& source_groups)
1032 {
1033     if (source_groups.empty())
1034         return;
1035 
1036     // iterate over transcript groups
1037     for (size_t ab_group_idx = 0; ab_group_idx < source_groups[0].size(); ++ab_group_idx)
1038     {
1039         vector<bool> to_keep(source_groups[0][ab_group_idx].abundances().size(), false);
1040         // iterate over samples
1041         for (size_t sample_idx = 0;  sample_idx < source_groups.size(); ++sample_idx)
1042         {
1043             // iterate over each member of the current abundance group
1044             AbundanceGroup& ab_group = source_groups[sample_idx][ab_group_idx];
1045             for (size_t ab_idx = 0; ab_idx < ab_group.abundances().size(); ++ab_idx)
1046             {
1047                 const Abundance& ab = *(ab_group.abundances()[ab_idx]);
1048                 if (ab.num_fragments() && ab.effective_length())
1049                 {
1050                     double frags_per_kb = ab.num_fragments() / (ab.effective_length() / 1000.0);
1051                     if (frags_per_kb >= min_read_count)
1052                         to_keep[ab_idx] = true;
1053                 }
1054 
1055             }
1056         }
1057 
1058         // Now that we know which ones we want to keep, get rid of the rest
1059         for (size_t sample_idx = 0;  sample_idx < source_groups.size(); ++sample_idx)
1060         {
1061             AbundanceGroup& ab_group = source_groups[sample_idx][ab_group_idx];
1062             AbundanceGroup f = ab_group;
1063             ab_group.filter_group(to_keep, f);
1064             ab_group = f;
1065         }
1066     }
1067 }
1068 
1069 // The two functions below just clear the FPKM samples and other data used to estimate
1070 // the FPKM distributions during testing.  Once testing for a locus is complete,
1071 // we can throw that stuff out.  We do so to save memory over the run.
clear_samples_from_fpkm_tracking_table(const string & locus_desc,FPKMTrackingTable & fpkm_tracking)1072 void clear_samples_from_fpkm_tracking_table(const string& locus_desc, FPKMTrackingTable& fpkm_tracking)
1073 {
1074     FPKMTrackingTable::iterator itr = fpkm_tracking.find(locus_desc);
1075     if (itr == fpkm_tracking.end())
1076         return;
1077 
1078     for (size_t i = 0; i < itr->second.fpkm_series.size(); ++i)
1079     {
1080         vector<double>& fpkm_samples = itr->second.fpkm_series[i].fpkm_samples;
1081         fpkm_samples.clear();
1082         std::vector<double>().swap(fpkm_samples);
1083         //itr->second.fpkm_series[i].fpkm_samples.swap(vector<double>(itr->second.fpkm_series[i].fpkm_samples));
1084     }
1085 }
1086 
clear_samples_from_tracking_table(boost::shared_ptr<SampleAbundances> sample,Tracking & tracking)1087 void clear_samples_from_tracking_table(boost::shared_ptr<SampleAbundances> sample, Tracking& tracking)
1088 {
1089     for (size_t k = 0; k < sample->transcripts.abundances().size(); ++k)
1090     {
1091         const Abundance& abundance = *(sample->transcripts.abundances()[k]);
1092         const string& desc = abundance.description();
1093         clear_samples_from_fpkm_tracking_table(desc, tracking.isoform_fpkm_tracking);
1094     }
1095 
1096     for (size_t k = 0; k < sample->cds.size(); ++k)
1097     {
1098         const Abundance& abundance = sample->cds[k];
1099         const string& desc = abundance.description();
1100         clear_samples_from_fpkm_tracking_table(desc, tracking.cds_fpkm_tracking);
1101     }
1102 
1103     for (size_t k = 0; k < sample->primary_transcripts.size(); ++k)
1104     {
1105         const Abundance& abundance = sample->primary_transcripts[k];
1106         const string& desc = abundance.description();
1107         clear_samples_from_fpkm_tracking_table(desc, tracking.tss_group_fpkm_tracking);
1108     }
1109 
1110     for (size_t k = 0; k < sample->genes.size(); ++k)
1111     {
1112         const Abundance& abundance = sample->genes[k];
1113         const string& desc = abundance.description();
1114         clear_samples_from_fpkm_tracking_table(desc, tracking.gene_fpkm_tracking);
1115     }
1116 }
1117 
1118 int total_tests = 0;
test_differential(const string & locus_tag,const vector<boost::shared_ptr<SampleAbundances>> & samples,const vector<pair<size_t,size_t>> & contrasts,Tests & tests,Tracking & tracking)1119 void test_differential(const string& locus_tag,
1120                        const vector<boost::shared_ptr<SampleAbundances> >& samples,
1121                        const vector<pair<size_t, size_t> >& contrasts,
1122 					   Tests& tests,
1123 					   Tracking& tracking)
1124 {
1125 	if (samples.empty())
1126 		return;
1127 
1128     if (no_differential == true)
1129     {
1130 //#if ENABLE_THREADS
1131 //        test_storage_lock.unlock();
1132 //#endif
1133         for (size_t i = 0; i < samples.size(); ++i)
1134         {
1135             clear_samples_from_tracking_table(samples[i], tracking);
1136         }
1137         return;
1138     }
1139 
1140     vector<vector<AbundanceGroup> > filtered_primary_trans_groups;
1141     vector<vector<AbundanceGroup> > filtered_promoter_groups;
1142     vector<vector<AbundanceGroup> > filtered_cds_groups;
1143 
1144     for (size_t i = 0; i < samples.size(); ++i)
1145     {
1146         filtered_primary_trans_groups.push_back(samples[i]->primary_transcripts);
1147         filtered_promoter_groups.push_back(samples[i]->gene_primary_transcripts);
1148         filtered_cds_groups.push_back(samples[i]->gene_cds);
1149     }
1150 
1151     filter_group_for_js_testing(filtered_primary_trans_groups);
1152     filter_group_for_js_testing(filtered_promoter_groups);
1153     filter_group_for_js_testing(filtered_cds_groups);
1154 
1155 
1156     // Perform pairwise significance testing between samples. If this is a
1157     // time series, only test between successive pairs of samples, as supplied
1158     // by the user.
1159 
1160     for (size_t contrast_idx = 0; contrast_idx < contrasts.size(); ++contrast_idx)
1161     {
1162         size_t i = contrasts[contrast_idx].first;
1163         size_t j = contrasts[contrast_idx].second;
1164         //            bool enough_reads = (samples[i]->cluster_mass >= min_read_count ||
1165         //                                 samples[j]->cluster_mass >= min_read_count);
1166         assert (samples[i]->transcripts.abundances().size() ==
1167                 samples[j]->transcripts.abundances().size());
1168         for (size_t k = 0; k < samples[i]->transcripts.abundances().size(); ++k)
1169         {
1170             const Abundance& curr_abundance = *(samples[j]->transcripts.abundances()[k]);
1171             const Abundance& prev_abundance = *(samples[i]->transcripts.abundances()[k]);
1172             const string& desc = curr_abundance.description();
1173             FPKMTrackingTable::iterator itr = tracking.isoform_fpkm_tracking.find(desc);
1174             assert (itr != tracking.isoform_fpkm_tracking.end());
1175 
1176             bool enough_reads = false;
1177             if (curr_abundance.num_fragments() && curr_abundance.effective_length())
1178             {
1179                 double frags_per_kb = curr_abundance.num_fragments() / (curr_abundance.effective_length() / 1000.0);
1180                 if (frags_per_kb >= min_read_count)
1181                     enough_reads = true;
1182             }
1183             if (prev_abundance.num_fragments() && prev_abundance.effective_length())
1184             {
1185                 double frags_per_kb = prev_abundance.num_fragments() / (prev_abundance.effective_length() / 1000.0);
1186                 if (frags_per_kb >= min_read_count)
1187                     enough_reads = true;
1188             }
1189 
1190 //            if (enough_reads)
1191 //            {
1192 //                if (is_badly_fit(curr_abundance) || is_badly_fit(prev_abundance))
1193 //                    enough_reads = false;
1194 //            }
1195 
1196             SampleDifference test;
1197             test = get_de_tests(desc,
1198                                 itr->second.fpkm_series[j],
1199                                 itr->second.fpkm_series[i],
1200                                 //tests.isoform_de_tests[i][j],
1201                                 enough_reads);
1202 #if ENABLE_THREADS
1203             test_storage_lock.lock();
1204 #endif
1205             pair<SampleDiffs::iterator, bool> inserted;
1206             inserted = tests.isoform_de_tests[i][j].insert(make_pair(desc,
1207                                                                      test));
1208 
1209             boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
1210 
1211             meta_data->gene_ids = curr_abundance.gene_id();
1212             meta_data->gene_names = curr_abundance.gene_name();
1213             meta_data->protein_ids = curr_abundance.protein_id();
1214             meta_data->locus_desc = curr_abundance.locus_tag();
1215             meta_data->description = curr_abundance.description();
1216             inserted.first->second.meta_data = meta_data;
1217 #if ENABLE_THREADS
1218             test_storage_lock.unlock();
1219 #endif
1220         }
1221 
1222         for (size_t k = 0; k < samples[i]->cds.size(); ++k)
1223         {
1224             const Abundance& curr_abundance = samples[i]->cds[k];
1225             const Abundance& prev_abundance = samples[j]->cds[k];
1226 
1227             const string& desc = curr_abundance.description();
1228             FPKMTrackingTable::iterator itr = tracking.cds_fpkm_tracking.find(desc);
1229             assert (itr != tracking.cds_fpkm_tracking.end());
1230 
1231             bool enough_reads = false;
1232             if (curr_abundance.num_fragments() && curr_abundance.effective_length())
1233             {
1234                 double frags_per_kb = curr_abundance.num_fragments() / (curr_abundance.effective_length() / 1000.0);
1235                 if (frags_per_kb >= min_read_count)
1236                     enough_reads = true;
1237             }
1238             if (prev_abundance.num_fragments() && prev_abundance.effective_length())
1239             {
1240                 double frags_per_kb = prev_abundance.num_fragments() / (prev_abundance.effective_length() / 1000.0);
1241                 if (frags_per_kb >= min_read_count)
1242                     enough_reads = true;
1243             }
1244 
1245 //            if (enough_reads)
1246 //            {
1247 //                if (is_badly_fit(curr_abundance) || is_badly_fit(prev_abundance))
1248 //                    enough_reads = false;
1249 //            }
1250 
1251 
1252             SampleDifference test;
1253             test = get_de_tests(desc,
1254                                 itr->second.fpkm_series[j],
1255                                 itr->second.fpkm_series[i],
1256                                 //tests.cds_de_tests[i][j],
1257                                 enough_reads);
1258 #if ENABLE_THREADS
1259             test_storage_lock.lock();
1260 #endif
1261 
1262             pair<SampleDiffs::iterator, bool> inserted;
1263             inserted = tests.cds_de_tests[i][j].insert(make_pair(desc,
1264                                                                  test));
1265 
1266             boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
1267 
1268             meta_data->gene_ids = curr_abundance.gene_id();
1269             meta_data->gene_names = curr_abundance.gene_name();
1270             meta_data->protein_ids = curr_abundance.protein_id();
1271             meta_data->locus_desc = curr_abundance.locus_tag();
1272             meta_data->description = curr_abundance.description();
1273             inserted.first->second.meta_data = meta_data;
1274 #if ENABLE_THREADS
1275             test_storage_lock.unlock();
1276 #endif
1277         }
1278 
1279         for (size_t k = 0; k < samples[i]->primary_transcripts.size(); ++k)
1280         {
1281             const Abundance& curr_abundance = samples[i]->primary_transcripts[k];
1282             const Abundance& prev_abundance = samples[j]->primary_transcripts[k];
1283 
1284             const string& desc = curr_abundance.description();
1285             FPKMTrackingTable::iterator itr = tracking.tss_group_fpkm_tracking.find(desc);
1286             assert (itr != tracking.tss_group_fpkm_tracking.end());
1287 
1288             bool enough_reads = false;
1289             if (curr_abundance.num_fragments() && curr_abundance.effective_length())
1290             {
1291                 double frags_per_kb = curr_abundance.num_fragments() / (curr_abundance.effective_length() / 1000.0);
1292                 if (frags_per_kb >= min_read_count)
1293                     enough_reads = true;
1294             }
1295             if (prev_abundance.num_fragments() && prev_abundance.effective_length())
1296             {
1297                 double frags_per_kb = prev_abundance.num_fragments() / (prev_abundance.effective_length() / 1000.0);
1298                 if (frags_per_kb >= min_read_count)
1299                     enough_reads = true;
1300             }
1301 
1302 //            if (enough_reads)
1303 //            {
1304 //                if (is_badly_fit(curr_abundance) || is_badly_fit(prev_abundance))
1305 //                    enough_reads = false;
1306 //            }
1307 
1308 
1309             SampleDifference test;
1310             test = get_de_tests(desc,
1311                                 itr->second.fpkm_series[j],
1312                                 itr->second.fpkm_series[i],
1313                                 //tests.tss_group_de_tests[i][j],
1314                                 enough_reads);
1315 #if ENABLE_THREADS
1316             test_storage_lock.lock();
1317 #endif
1318             pair<SampleDiffs::iterator, bool> inserted;
1319             inserted = tests.tss_group_de_tests[i][j].insert(make_pair(desc,
1320                                                                        test));
1321 
1322 
1323             boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
1324 
1325             meta_data->gene_ids = curr_abundance.gene_id();
1326             meta_data->gene_names = curr_abundance.gene_name();
1327             meta_data->protein_ids = curr_abundance.protein_id();
1328             meta_data->locus_desc = curr_abundance.locus_tag();
1329             meta_data->description = curr_abundance.description();
1330             inserted.first->second.meta_data = meta_data;
1331 #if ENABLE_THREADS
1332             test_storage_lock.unlock();
1333 #endif
1334         }
1335 
1336         for (size_t k = 0; k < samples[i]->genes.size(); ++k)
1337         {
1338             const AbundanceGroup& curr_abundance = samples[i]->genes[k];
1339             const AbundanceGroup& prev_abundance = samples[j]->genes[k];
1340             const string& desc = curr_abundance.description();
1341             FPKMTrackingTable::iterator itr = tracking.gene_fpkm_tracking.find(desc);
1342             assert (itr != tracking.gene_fpkm_tracking.end());
1343 
1344             bool enough_reads = false;
1345             if (curr_abundance.num_fragments() && curr_abundance.effective_length())
1346             {
1347                 double frags_per_kb = curr_abundance.num_fragments() / (curr_abundance.effective_length() / 1000.0);
1348                 if (frags_per_kb >= min_read_count)
1349                     enough_reads = true;
1350             }
1351             if (prev_abundance.num_fragments() && prev_abundance.effective_length())
1352             {
1353                 double frags_per_kb = prev_abundance.num_fragments() / (prev_abundance.effective_length() / 1000.0);
1354                 if (frags_per_kb >= min_read_count)
1355                     enough_reads = true;
1356             }
1357 
1358             SampleDifference test;
1359             test = get_de_tests(desc,
1360                                 itr->second.fpkm_series[j],
1361                                 itr->second.fpkm_series[i],
1362                                 //tests.gene_de_tests[i][j],
1363                                 enough_reads);
1364 #if ENABLE_THREADS
1365             test_storage_lock.lock();
1366 #endif
1367             pair<SampleDiffs::iterator, bool> inserted;
1368             inserted = tests.gene_de_tests[i][j].insert(make_pair(desc,
1369                                                                   test));
1370 
1371             boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
1372 
1373             meta_data->gene_ids = curr_abundance.gene_id();
1374             meta_data->gene_names = curr_abundance.gene_name();
1375             meta_data->protein_ids = curr_abundance.protein_id();
1376             meta_data->locus_desc = curr_abundance.locus_tag();
1377             meta_data->description = curr_abundance.description();
1378             inserted.first->second.meta_data = meta_data;
1379 #if ENABLE_THREADS
1380             test_storage_lock.unlock();
1381 #endif
1382         }
1383 
1384         // Skip all the JS based testing for genes with an isoform switch?
1385         if (no_js_tests)
1386             continue;
1387 
1388         // FIXME: the code below will not properly test for differential
1389         // splicing/promoter use when a gene (e.g.) occupies two
1390         // disjoint bundles.  We need to store the covariance matrices (etc)
1391         // in the FPKMContexts to handle that case properly.
1392 
1393         // Differential promoter use
1394         for (size_t k = 0; k < samples[i]->gene_primary_transcripts.size(); ++k)
1395         {
1396             const AbundanceGroup& curr_abundance = filtered_promoter_groups[i][k];
1397             const AbundanceGroup& prev_abundance = filtered_promoter_groups[j][k];
1398             const string& desc = curr_abundance.description();
1399 
1400             bool enough_reads = (curr_abundance.FPKM_by_replicate().size() >= min_reps_for_js_test &&
1401                                  prev_abundance.FPKM_by_replicate().size() >= min_reps_for_js_test);
1402             SampleDifference test;
1403             test = get_ds_tests(prev_abundance,
1404                                 curr_abundance,
1405                                 enough_reads);
1406 
1407             // The filtered group might be empty, so let's grab metadata from
1408             // the unfiltered group
1409             boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
1410 
1411             meta_data->gene_ids = samples[i]->gene_primary_transcripts[k].gene_id();
1412             meta_data->gene_names = samples[i]->gene_primary_transcripts[k].gene_name();
1413             meta_data->protein_ids = samples[i]->gene_primary_transcripts[k].protein_id();
1414             meta_data->locus_desc = samples[i]->gene_primary_transcripts[k].locus_tag();
1415             meta_data->description = samples[i]->gene_primary_transcripts[k].description();
1416 
1417             test.meta_data = meta_data;
1418 
1419 #if ENABLE_THREADS
1420             test_storage_lock.lock();
1421 #endif
1422 
1423             pair<SampleDiffs::iterator, bool> inserted;
1424             inserted = tests.diff_promoter_tests[i][j].insert(make_pair(desc,test));
1425             inserted.first->second = test;
1426 
1427 #if ENABLE_THREADS
1428             test_storage_lock.unlock();
1429 #endif
1430         }
1431 
1432         // Differential coding sequence output
1433         for (size_t k = 0; k < samples[i]->gene_cds.size(); ++k)
1434         {
1435             const AbundanceGroup& curr_abundance = filtered_cds_groups[i][k];
1436             const AbundanceGroup& prev_abundance = filtered_cds_groups[j][k];
1437             const string& desc = curr_abundance.description();
1438 
1439             bool enough_reads =  (curr_abundance.FPKM_by_replicate().size() >= min_reps_for_js_test &&
1440                                   prev_abundance.FPKM_by_replicate().size() >= min_reps_for_js_test);
1441             SampleDifference test;
1442             test = get_ds_tests(prev_abundance,
1443                                 curr_abundance,
1444                                 enough_reads);
1445 
1446             // The filtered group might be empty, so let's grab metadata from
1447             // the unfiltered group
1448             boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
1449 
1450             meta_data->gene_ids = samples[i]->gene_cds[k].gene_id();
1451             meta_data->gene_names = samples[i]->gene_cds[k].gene_name();
1452             meta_data->protein_ids = samples[i]->gene_cds[k].protein_id();
1453             meta_data->locus_desc = samples[i]->gene_cds[k].locus_tag();
1454             meta_data->description = samples[i]->gene_cds[k].description();
1455 
1456             test.meta_data = meta_data;
1457 
1458 #if ENABLE_THREADS
1459             test_storage_lock.lock();
1460 #endif
1461             pair<SampleDiffs::iterator, bool> inserted;
1462             inserted = tests.diff_cds_tests[i][j].insert(make_pair(desc,test));
1463             inserted.first->second = test;
1464 
1465 #if ENABLE_THREADS
1466             test_storage_lock.unlock();
1467 #endif
1468         }
1469 
1470         // Differential splicing of primary transcripts
1471         for (size_t k = 0; k < samples[i]->primary_transcripts.size(); ++k)
1472         {
1473             const AbundanceGroup& curr_abundance = filtered_primary_trans_groups[i][k];
1474             const AbundanceGroup& prev_abundance = filtered_primary_trans_groups[j][k];
1475             const string& desc = curr_abundance.description();
1476 
1477             bool enough_reads = (curr_abundance.FPKM_by_replicate().size() >= min_reps_for_js_test &&
1478                                  prev_abundance.FPKM_by_replicate().size() >= min_reps_for_js_test);
1479 
1480             SampleDifference test;
1481             test = get_ds_tests(prev_abundance,
1482                                 curr_abundance,
1483                                 enough_reads);
1484 
1485             // The filtered group might be empty, so let's grab metadata from
1486             // the unfiltered group
1487             boost::shared_ptr<SampleDifferenceMetaData> meta_data = get_metadata(desc);
1488 
1489             meta_data->gene_ids = samples[i]->primary_transcripts[k].gene_id();
1490             meta_data->gene_names = samples[i]->primary_transcripts[k].gene_name();
1491             meta_data->protein_ids = samples[i]->primary_transcripts[k].protein_id();
1492             meta_data->locus_desc = samples[i]->primary_transcripts[k].locus_tag();
1493             meta_data->description = samples[i]->primary_transcripts[k].description();
1494 
1495             test.meta_data = meta_data;
1496 
1497 #if ENABLE_THREADS
1498             test_storage_lock.lock();
1499 #endif
1500             pair<SampleDiffs::iterator, bool> inserted;
1501             inserted = tests.diff_splicing_tests[i][j].insert(make_pair(desc,test));
1502             inserted.first->second = test;
1503 
1504 #if ENABLE_THREADS
1505             test_storage_lock.unlock();
1506 #endif
1507         }
1508 
1509     }
1510 
1511     for (size_t i = 0; i < samples.size(); ++i)
1512     {
1513         clear_samples_from_tracking_table(samples[i], tracking);
1514     }
1515 }
1516 
print_checked_params_table(const vector<boost::shared_ptr<ReadGroupProperties>> & all_read_groups)1517 void print_checked_params_table(const vector<boost::shared_ptr<ReadGroupProperties> >& all_read_groups)
1518 {
1519     string param_check_error_out_filename = output_dir + "/checked_params.txt";
1520     FILE* param_check_error_out_file = fopen(param_check_error_out_filename.c_str(), "w");
1521 
1522     fprintf(param_check_error_out_file, "file\tdefault frag length mean\tdefault frag length std dev\tbias correction\tbias mode\tmultiread correction\tmax-mle-iterations\tmin-mle-accuracy\tmax-bundle-frags\tmax-frag-multihits\tno-effective-length-correction\tno-length-correction\n");
1523     for (size_t i = 0; i < all_read_groups.size(); ++i)
1524     {
1525         const CheckedParameters& cp_i = all_read_groups[i]->checked_parameters();
1526         fprintf(param_check_error_out_file, "%s\t", all_read_groups[i]->file_path().c_str());
1527         fprintf(param_check_error_out_file, "%lg\t", cp_i.frag_len_mean);
1528         fprintf(param_check_error_out_file, "%lg\t", cp_i.frag_len_std_dev);
1529         fprintf(param_check_error_out_file, "%s\t", cp_i.corr_bias ? "yes" : "no");
1530         fprintf(param_check_error_out_file, "%d\t", cp_i.frag_bias_mode);
1531         fprintf(param_check_error_out_file, "%s\t", cp_i.corr_multireads ? "yes" : "no");
1532         fprintf(param_check_error_out_file, "%lg\t", cp_i.max_mle_iterations);
1533         fprintf(param_check_error_out_file, "%lg\t", cp_i.min_mle_accuracy);
1534         fprintf(param_check_error_out_file, "%lg\t", cp_i.max_bundle_frags);
1535         fprintf(param_check_error_out_file, "%lg\t", cp_i.max_frags_multihits);
1536         fprintf(param_check_error_out_file, "%s\t", cp_i.no_effective_length_correction ? "yes" : "no");
1537         fprintf(param_check_error_out_file, "%s\t", cp_i.no_length_correction? "yes" : "no");
1538         fprintf(param_check_error_out_file, "\n");
1539     }
1540     fclose(param_check_error_out_file);
1541 
1542 }
1543 
validate_cross_sample_parameters(const vector<boost::shared_ptr<ReadGroupProperties>> & all_read_groups)1544 void validate_cross_sample_parameters(const vector<boost::shared_ptr<ReadGroupProperties> >& all_read_groups)
1545 {
1546     bool dump_params = false;
1547     for (size_t i = 1; i < all_read_groups.size(); ++i)
1548     {
1549         const CheckedParameters& cp_i = all_read_groups[i - 1]->checked_parameters();
1550         const CheckedParameters& cp_j = all_read_groups[i]->checked_parameters();
1551 
1552         if (cp_i.ref_gtf_crc != cp_j.ref_gtf_crc)
1553         {
1554             fprintf(stderr, "Error reference gene annotation differs between samples!\n");
1555             fprintf(stderr, "\t%s\t%s:\t%u!\n", all_read_groups[i - 1]->file_path().c_str(), cp_i.ref_gtf_file_path.c_str(), cp_i.ref_gtf_crc);
1556             fprintf(stderr, "\t%s\t%s:\t%u!\n", all_read_groups[i]->file_path().c_str(), cp_j.ref_gtf_file_path.c_str(), cp_j.ref_gtf_crc);
1557             exit(1);
1558         }
1559 
1560         if (cp_i != cp_j)
1561         {
1562             dump_params = true;
1563             fprintf(stderr, "Warning: quantification parameters differ between CXB files!\n\tSee %s/checked_params.txt for more info\n", output_dir.c_str());
1564             break;
1565 //            fprintf(stderr, "CXB files:\n");
1566 //            fprintf(stderr, "%s:\n", all_read_groups[i - 1]->file_path().c_str());
1567 //            fprintf(stderr, "\tdefault-frag-length-mean:\t%lg\n", cp_i.frag_len_mean);
1568 //            fprintf(stderr, "\tdefault-frag-length-std-dev:\t%lg\n", cp_i.frag_len_std_dev);
1569 //            fprintf(stderr, "\tbias correction:\t%s\n", cp_i.corr_bias ? "yes" : "no");
1570 //            fprintf(stderr, "\tbias mode:\t%d\n", cp_i.frag_bias_mode);
1571 //            fprintf(stderr, "\tmultiread correction:\t%s\n", cp_i.corr_multireads ? "yes" : "no");
1572 //            fprintf(stderr, "\tmax-mle-iterations:\t%lg\n", cp_i.max_mle_iterations);
1573 //            fprintf(stderr, "\tmin-mle-accuracy:\t%lg\n", cp_i.min_mle_accuracy);
1574 //            fprintf(stderr, "\tmax-bundle-frags:\t%lg\n", cp_i.max_bundle_frags);
1575 //            fprintf(stderr, "\tmax-frag-multihits:\t%lg\n", cp_i.max_frags_multihits);
1576 //            fprintf(stderr, "\tno-effective-length-correction:\t%s\n", cp_i.no_effective_length_correction ? "yes" : "no");
1577 //            fprintf(stderr, "\tno-length-correction:\t%s\n", cp_i.no_length_correction? "yes" : "no");
1578 //
1579 //            fprintf(stderr, "%s\n", all_read_groups[i]->file_path().c_str());
1580 //            fprintf(stderr, "\tdefault-frag-length-mean:\t%lg\n", cp_j.frag_len_mean);
1581 //            fprintf(stderr, "\tdefault-frag-length-std-dev:\t%lg\n", cp_j.frag_len_std_dev);
1582 //            // TODO: add CRCs for reference GTF, mask file, norm standards file if using.
1583 //            fprintf(stderr, "\tbias correction:\t%s\n", cp_j.corr_bias ? "yes" : "no");
1584 //            fprintf(stderr, "\tbias mode:\t%d\n", cp_j.frag_bias_mode);
1585 //            fprintf(stderr, "\tmultiread correction:\t%s\n", cp_j.corr_multireads ? "yes" : "no");
1586 //            fprintf(stderr, "\tmax-mle-iterations:\t%lg\n", cp_j.max_mle_iterations);
1587 //            fprintf(stderr, "\tmin-mle-accuracy:\t%lg\n", cp_j.min_mle_accuracy);
1588 //            fprintf(stderr, "\tmax-bundle-frags:\t%lg\n", cp_j.max_bundle_frags);
1589 //            fprintf(stderr, "\tmax-frag-multihits:\t%lg\n", cp_j.max_frags_multihits);
1590 //            fprintf(stderr, "\tno-effective-length-correction:\t%s\n", cp_j.no_effective_length_correction ? "yes" : "no");
1591 //            fprintf(stderr, "\tno-length-correction:\t%s\n", cp_j.no_length_correction? "yes" : "no");
1592 
1593         }
1594 
1595     }
1596     if (dump_params)
1597         print_checked_params_table(all_read_groups);
1598 }
1599 
perform_testing(vector<boost::shared_ptr<SampleAbundances>> abundances)1600 void TrackingDataWriter::perform_testing(vector<boost::shared_ptr<SampleAbundances> > abundances)
1601 {
1602 #if ENABLE_THREADS
1603     _launcher_lock.lock();
1604     test_storage_lock.lock();
1605 #endif
1606     if (headers_written == false)
1607     {
1608         write_header_output();
1609         headers_written = true;
1610     }
1611 
1612     write_output(abundances);
1613     //test_differential(abundances.front()->locus_tag, abundances, _contrasts, *_tests, *_tracking);
1614 #if ENABLE_THREADS
1615     test_storage_lock.unlock();
1616     _launcher_lock.unlock();
1617 #endif
1618 
1619 }
1620 
1621