1 /*
2 * cuffdiff.cpp
3 * cufflinks
4 *
5 * Created by Cole Trapnell on 10/21/09.
6 * Copyright 2009 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #include <stdlib.h>
11 #include <getopt.h>
12 #include <string>
13 #include <numeric>
14 #include <cfloat>
15 #include <iostream>
16 #include <fstream>
17
18 #include "common.h"
19 #include "hits.h"
20 #include "bundles.h"
21 #include "abundances.h"
22 #include "tokenize.h"
23 #include "biascorrection.h"
24 #include "update_check.h"
25
26 #include <boost/thread.hpp>
27 #include <boost/version.hpp>
28 #include <boost/graph/adjacency_list.hpp>
29 #include <boost/graph/graph_traits.hpp>
30 #include <boost/numeric/ublas/matrix.hpp>
31 #include <boost/numeric/ublas/matrix_proxy.hpp>
32 #include <boost/numeric/ublas/vector.hpp>
33 #include <boost/numeric/ublas/vector_proxy.hpp>
34 #include <boost/numeric/ublas/io.hpp>
35 #include <boost/algorithm/string.hpp>
36
37 #include "replicates.h"
38 #include "tracking.h"
39
40 // Need at least this many reads in a locus to do any testing on it
41
42 vector<string> sample_labels;
43
44 using namespace std;
45 using namespace boost;
46
47 // We leave out the short codes for options that don't take an argument
48 #if ENABLE_THREADS
49 const char *short_options = "m:p:s:c:I:j:L:M:o:b:TNqvuF:C:";
50 #else
51 const char *short_options = "m:s:c:I:j:L:M:o:b:TNqvuF:C:";
52 #endif
53
54
55
56 static struct option long_options[] = {
57 {"frag-len-mean", required_argument, 0, 'm'},
58 {"frag-len-std-dev", required_argument, 0, 's'},
59 {"seed", required_argument, 0, OPT_RANDOM_SEED},
60 {"mask-file", required_argument, 0, 'M'},
61 {"output-dir", required_argument, 0, 'o'},
62 {"verbose", no_argument, 0, 'v'},
63 {"quiet", no_argument, 0, 'q'},
64 {"frag-bias-correct", required_argument, 0, 'b'},
65 {"multi-read-correct", no_argument, 0, 'u'},
66 #if ENABLE_THREADS
67 {"num-threads", required_argument, 0, 'p'},
68 #endif
69 {"library-type", required_argument, 0, OPT_LIBRARY_TYPE},
70 {"seed", required_argument, 0, OPT_RANDOM_SEED},
71 {"no-collapse-cond-prob", no_argument, 0, OPT_COLLAPSE_COND_PROB},
72 {"max-mle-iterations", required_argument, 0, OPT_MLE_MAX_ITER},
73 {"min-mle-accuracy", required_argument, 0, OPT_MLE_MIN_ACC},
74 {"bias-mode", required_argument, 0, OPT_BIAS_MODE},
75 {"no-update-check", no_argument, 0, OPT_NO_UPDATE_CHECK},
76
77 // Some options for testing different stats policies
78 {"max-bundle-frags", required_argument, 0, OPT_MAX_FRAGS_PER_BUNDLE},
79 {"max-frag-multihits", required_argument, 0, OPT_FRAG_MAX_MULTIHITS},
80 {"no-effective-length-correction", no_argument, 0, OPT_NO_EFFECTIVE_LENGTH_CORRECTION},
81 {"no-length-correction", no_argument, 0, OPT_NO_LENGTH_CORRECTION},
82 {0, 0, 0, 0} // terminator
83 };
84
print_usage()85 void print_usage()
86 {
87 fprintf(stderr, "cuffquant v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
88 fprintf(stderr, "-----------------------------\n");
89
90 //NOTE: SPACES ONLY, bozo
91 fprintf(stderr, "Usage: cuffquant [options] <transcripts.gtf> <sample1_hits.sam> <sample2_hits.sam> [... sampleN_hits.sam]\n");
92 fprintf(stderr, " Supply replicate SAMs as comma separated lists for each condition: sample1_rep1.sam,sample1_rep2.sam,...sample1_repM.sam\n");
93 fprintf(stderr, "General Options:\n");
94 fprintf(stderr, " -o/--output-dir write all output files to this directory [ default: ./ ]\n");
95 fprintf(stderr, " -M/--mask-file ignore all alignment within transcripts in this file [ default: NULL ]\n");
96 //fprintf(stderr, " --norm-standards-file Housekeeping/spike genes to normalize libraries [ default: NULL ]\n"); // NOT YET DOCUMENTED, keep secret for now
97 fprintf(stderr, " -b/--frag-bias-correct use bias correction - reference fasta required [ default: NULL ]\n");
98 fprintf(stderr, " -u/--multi-read-correct use 'rescue method' for multi-reads [ default: FALSE ]\n");
99 #if ENABLE_THREADS
100 fprintf(stderr, " -p/--num-threads number of threads used during quantification [ default: 1 ]\n");
101 #endif
102 fprintf(stderr, " --library-type Library prep used for input reads [ default: below ]\n");
103
104 fprintf(stderr, "\nAdvanced Options:\n");
105 fprintf(stderr, " -m/--frag-len-mean average fragment length (unpaired reads only) [ default: 200 ]\n");
106 fprintf(stderr, " -s/--frag-len-std-dev fragment length std deviation (unpaired reads only) [ default: 80 ]\n");
107 fprintf(stderr, " -c/--min-alignment-count minimum number of alignments in a locus for testing [ default: 10 ]\n");
108 fprintf(stderr, " --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]\n");
109 fprintf(stderr, " -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]\n");
110 fprintf(stderr, " -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]\n");
111 fprintf(stderr, " --seed value of random number generator seed [ default: 0 ]\n");
112 fprintf(stderr, " --no-update-check do not contact server to check for update availability[ default: FALSE ]\n");
113 fprintf(stderr, " --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]\n");
114 fprintf(stderr, " --max-frag-multihits Maximum number of alignments allowed per fragment [ default: unlim ]\n");
115 fprintf(stderr, " --no-effective-length-correction No effective length correction [ default: FALSE ]\n");
116 fprintf(stderr, " --no-length-correction No length correction [ default: FALSE ]\n");
117
118 fprintf(stderr, "\nDebugging use only:\n");
119 fprintf(stderr, " --read-skip-fraction Skip a random subset of reads this size [ default: 0.0 ]\n");
120 fprintf(stderr, " --no-read-pairs Break all read pairs [ default: FALSE ]\n");
121 fprintf(stderr, " --trim-read-length Trim reads to be this long (keep 5' end) [ default: none ]\n");
122 fprintf(stderr, " --no-scv-correction Disable SCV correction [ default: FALSE ]\n");
123 print_library_table();
124 }
125
parse_options(int argc,char ** argv)126 int parse_options(int argc, char** argv)
127 {
128 int option_index = 0;
129 int next_option;
130 string sample_label_list;
131 string dispersion_method_str;
132 string lib_norm_method_str;
133 do {
134 next_option = getopt_long_only(argc, argv, short_options, long_options, &option_index);
135 if (next_option == -1) /* Done with options. */
136 break;
137 switch (next_option) {
138 case 0:
139 /* If this option set a flag, do nothing else now. */
140 if (long_options[option_index].flag != 0)
141 break;
142 break;
143
144 case 'm':
145 user_provided_fld = true;
146 def_frag_len_mean = (uint32_t)parseInt(0, "-m/--frag-len-mean arg must be at least 0", print_usage);
147 break;
148 case 's':
149 user_provided_fld = true;
150 def_frag_len_std_dev = (uint32_t)parseInt(0, "-s/--frag-len-std-dev arg must be at least 0", print_usage);
151 break;
152 case 'p':
153 num_threads = (uint32_t)parseInt(1, "-p/--num-threads arg must be at least 1", print_usage);
154 break;
155 case 'L':
156 sample_label_list = optarg;
157 break;
158 case OPT_NUM_IMP_SAMPLES:
159 num_importance_samples = parseInt(1, "--num-importance-samples must be at least 1", print_usage);
160 break;
161 case OPT_MLE_MAX_ITER:
162 max_mle_iterations = parseInt(1, "--max-mle-iterations must be at least 1", print_usage);
163 break;
164 case OPT_BIAS_MODE:
165 if (!strcmp(optarg, "site"))
166 bias_mode = SITE;
167 else if (!strcmp(optarg, "pos"))
168 bias_mode = POS;
169 else if (!strcmp(optarg, "pos_vlmm"))
170 bias_mode = POS_VLMM;
171 else if (!strcmp(optarg, "vlmm"))
172 bias_mode = VLMM;
173 else if (!strcmp(optarg, "pos_site"))
174 bias_mode = POS_SITE;
175 else
176 {
177 fprintf(stderr, "Unknown bias mode.\n");
178 exit(1);
179 }
180 break;
181 case 'M':
182 {
183 mask_gtf_filename = optarg;
184 break;
185 }
186 case OPT_NORM_STANDARDS_FILE:
187 {
188 norm_standards_filename = optarg;
189 break;
190 }
191 case 'v':
192 {
193 if (cuff_quiet)
194 {
195 fprintf(stderr, "Warning: Can't be both verbose and quiet! Setting verbose only.\n");
196 }
197 cuff_quiet = false;
198 cuff_verbose = true;
199 break;
200 }
201 case 'q':
202 {
203 if (cuff_verbose)
204 {
205 fprintf(stderr, "Warning: Can't be both verbose and quiet! Setting quiet only.\n");
206 }
207 cuff_verbose = false;
208 cuff_quiet = true;
209 break;
210 }
211 case 'o':
212 {
213 output_dir = optarg;
214 break;
215 }
216 case 'b':
217 {
218 fasta_dir = optarg;
219 corr_bias = true;
220 break;
221 }
222
223 case 'u':
224 {
225 corr_multi = true;
226 break;
227 }
228 case OPT_LIBRARY_TYPE:
229 {
230 library_type = optarg;
231 break;
232 }
233 case OPT_NO_UPDATE_CHECK:
234 {
235 no_update_check = true;
236 break;
237 }
238 case OPT_RANDOM_SEED:
239 {
240 random_seed = parseInt(0, "--seed must be at least 0", print_usage);
241 break;
242 }
243 case OPT_COLLAPSE_COND_PROB:
244 {
245 cond_prob_collapse = false;
246 break;
247 }
248 case OPT_USE_COMPAT_MASS:
249 {
250 use_compat_mass = true;
251 break;
252 }
253 case OPT_USE_TOTAL_MASS:
254 {
255 use_total_mass = true;
256 break;
257 }
258 case OPT_MAX_FRAGS_PER_BUNDLE:
259 {
260 max_frags_per_bundle = parseInt(0, "--max-bundle-frags must be at least 0", print_usage);
261 break;
262 }
263 case OPT_READ_SKIP_FRACTION:
264 {
265 read_skip_fraction = parseFloat(0, 1.0, "--read-skip-fraction must be between 0 and 1.0", print_usage);
266 break;
267 }
268 case OPT_NO_READ_PAIRS:
269 {
270 no_read_pairs = true;
271 break;
272 }
273 case OPT_TRIM_READ_LENGTH:
274 {
275 trim_read_length = parseInt(0, "--trim-read-length must be at least 1", print_usage);
276 break;
277 }
278 case OPT_FRAG_MAX_MULTIHITS:
279 {
280 max_frag_multihits = parseInt(1, "--max-frag-multihits must be at least 1", print_usage);
281 break;
282 }
283 case OPT_NO_EFFECTIVE_LENGTH_CORRECTION:
284 {
285 no_effective_length_correction = true;
286 break;
287 }
288 case OPT_NO_LENGTH_CORRECTION:
289 {
290 no_length_correction = true;
291 break;
292 }
293 case OPT_LIB_NORM_METHOD:
294 {
295 lib_norm_method_str = optarg;
296 break;
297 }
298
299 default:
300 print_usage();
301 return 1;
302 }
303 } while(next_option != -1);
304
305 if (library_type != "")
306 {
307 map<string, ReadGroupProperties>::iterator lib_itr =
308 library_type_table.find(library_type);
309 if (lib_itr == library_type_table.end())
310 {
311 fprintf(stderr, "Error: Library type %s not supported\n", library_type.c_str());
312 exit(1);
313 }
314 else
315 {
316 if (library_type == "transfrags")
317 {
318 allow_junk_filtering = false;
319 }
320 global_read_properties = &lib_itr->second;
321 }
322 }
323 else
324 {
325
326 }
327
328 // Set the count dispersion method to use
329 if (dispersion_method_str == "")
330 {
331 dispersion_method_str = default_dispersion_method;
332 }
333
334 map<string, DispersionMethod>::iterator disp_itr =
335 dispersion_method_table.find(dispersion_method_str);
336 if (disp_itr == dispersion_method_table.end())
337 {
338 fprintf(stderr, "Error: Dispersion method %s not supported\n", dispersion_method_str.c_str());
339 exit(1);
340 }
341 else
342 {
343 dispersion_method = disp_itr->second;
344 }
345
346 // Set the library size normalization method to use
347 if (lib_norm_method_str == "")
348 {
349 lib_norm_method_str = default_lib_norm_method;
350 }
351
352 map<string, LibNormalizationMethod>::iterator lib_norm_itr =
353 lib_norm_method_table.find(lib_norm_method_str);
354 if (lib_norm_itr == lib_norm_method_table.end())
355 {
356 fprintf(stderr, "Error: Normalization method %s not supported\n", lib_norm_method_str.c_str());
357 exit(1);
358 }
359 else
360 {
361 lib_norm_method = lib_norm_itr->second;
362 }
363
364
365
366 if (use_total_mass && use_compat_mass)
367 {
368 fprintf (stderr, "Error: please supply only one of --compatibile-hits-norm and --total-hits-norm\n");
369 exit(1);
370 }
371
372 tokenize(sample_label_list, ",", sample_labels);
373
374 allow_junk_filtering = false;
375
376 return 0;
377 }
378
379 struct Outfiles
380 {
381 FILE* isoform_fpkm_tracking_out;
382 FILE* tss_group_fpkm_tracking_out;
383 FILE* gene_fpkm_tracking_out;
384 FILE* cds_fpkm_tracking_out;
385
386 FILE* isoform_count_tracking_out;
387 FILE* tss_group_count_tracking_out;
388 FILE* gene_count_tracking_out;
389 FILE* cds_count_tracking_out;
390
391 FILE* run_info_out;
392 FILE* read_group_info_out;
393 FILE* bias_out;
394 FILE* var_model_out;
395 };
396
print_FPKM_tracking(FILE * fout,const FPKMTrackingTable & tracking)397 void print_FPKM_tracking(FILE* fout,
398 const FPKMTrackingTable& tracking)
399 {
400 fprintf(fout,"tracking_id\tclass_code\tnearest_ref_id\tgene_id\tgene_short_name\ttss_id\tlocus\tlength\tcoverage");
401 FPKMTrackingTable::const_iterator first_itr = tracking.begin();
402 if (first_itr != tracking.end())
403 {
404 const FPKMTracking& track = first_itr->second;
405 const vector<FPKMContext>& fpkms = track.fpkm_series;
406 for (size_t i = 0; i < fpkms.size(); ++i)
407 {
408 fprintf(fout, "\t%s_FPKM\t%s_conf_lo\t%s_conf_hi\t%s_status", sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str());
409 }
410 }
411 fprintf(fout, "\n");
412 for (FPKMTrackingTable::const_iterator itr = tracking.begin(); itr != tracking.end(); ++itr)
413 {
414 const string& description = itr->first;
415 const FPKMTracking& track = itr->second;
416 const vector<FPKMContext>& fpkms = track.fpkm_series;
417
418 AbundanceStatus status = NUMERIC_OK;
419 BOOST_FOREACH (const FPKMContext& c, fpkms)
420 {
421 if (c.status == NUMERIC_FAIL)
422 status = NUMERIC_FAIL;
423 }
424
425 string all_gene_ids = cat_strings(track.gene_ids);
426 if (all_gene_ids == "")
427 all_gene_ids = "-";
428
429 string all_gene_names = cat_strings(track.gene_names);
430 if (all_gene_names == "")
431 all_gene_names = "-";
432
433 string all_tss_ids = cat_strings(track.tss_ids);
434 if (all_tss_ids == "")
435 all_tss_ids = "-";
436
437 char length_buff[33] = "-";
438 if (track.length)
439 sprintf(length_buff, "%d", track.length);
440
441 fprintf(fout, "%s\t%c\t%s\t%s\t%s\t%s\t%s\t%s\t%s",
442 description.c_str(),
443 track.classcode ? track.classcode : '-',
444 track.ref_match.c_str(),
445 all_gene_ids.c_str(),
446 all_gene_names.c_str(),
447 all_tss_ids.c_str(),
448 track.locus_tag.c_str(),
449 length_buff,
450 "-");
451
452 for (size_t i = 0; i < fpkms.size(); ++i)
453 {
454 double fpkm = fpkms[i].FPKM;
455 //double std_dev = sqrt(fpkms[i].FPKM_variance);
456 double fpkm_conf_hi = fpkms[i].FPKM_conf_hi;
457 double fpkm_conf_lo = fpkms[i].FPKM_conf_lo;
458 const char* status_str = "OK";
459
460 if (fpkms[i].status == NUMERIC_OK)
461 {
462 status_str = "OK";
463 }
464 else if (fpkms[i].status == NUMERIC_FAIL)
465 {
466 status_str = "FAIL";
467 }
468 else if (fpkms[i].status == NUMERIC_LOW_DATA)
469 {
470 status_str = "LOWDATA";
471 }
472 else if (fpkms[i].status == NUMERIC_HI_DATA)
473 {
474 status_str = "HIDATA";
475 }
476 else
477 {
478 assert(false);
479 }
480
481 fprintf(fout, "\t%lg\t%lg\t%lg\t%s", fpkm, fpkm_conf_lo, fpkm_conf_hi, status_str);
482 }
483
484 fprintf(fout, "\n");
485 }
486 }
487
print_count_tracking(FILE * fout,const FPKMTrackingTable & tracking)488 void print_count_tracking(FILE* fout,
489 const FPKMTrackingTable& tracking)
490 {
491 fprintf(fout,"tracking_id");
492 FPKMTrackingTable::const_iterator first_itr = tracking.begin();
493 if (first_itr != tracking.end())
494 {
495 const FPKMTracking& track = first_itr->second;
496 const vector<FPKMContext>& fpkms = track.fpkm_series;
497 for (size_t i = 0; i < fpkms.size(); ++i)
498 {
499 fprintf(fout, "\t%s_count\t%s_count_variance\t%s_count_uncertainty_var\t%s_count_dispersion_var\t%s_status", sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str());
500 }
501 }
502 fprintf(fout, "\n");
503 for (FPKMTrackingTable::const_iterator itr = tracking.begin(); itr != tracking.end(); ++itr)
504 {
505 const string& description = itr->first;
506 const FPKMTracking& track = itr->second;
507 const vector<FPKMContext>& fpkms = track.fpkm_series;
508
509 AbundanceStatus status = NUMERIC_OK;
510 BOOST_FOREACH (const FPKMContext& c, fpkms)
511 {
512 if (c.status == NUMERIC_FAIL)
513 status = NUMERIC_FAIL;
514 }
515
516 fprintf(fout, "%s",
517 description.c_str());
518
519 for (size_t i = 0; i < fpkms.size(); ++i)
520 {
521 const char* status_str = "OK";
522
523 if (fpkms[i].status == NUMERIC_OK)
524 {
525 status_str = "OK";
526 }
527 else if (fpkms[i].status == NUMERIC_FAIL)
528 {
529 status_str = "FAIL";
530 }
531 else if (fpkms[i].status == NUMERIC_LOW_DATA)
532 {
533 status_str = "LOWDATA";
534 }
535 else if (fpkms[i].status == NUMERIC_HI_DATA)
536 {
537 status_str = "HIDATA";
538 }
539 else
540 {
541 assert(false);
542 }
543
544 double external_counts = fpkms[i].count_mean;
545 double external_count_var = fpkms[i].count_var;
546 double uncertainty_var = fpkms[i].count_uncertainty_var;
547 double dispersion_var = fpkms[i].count_dispersion_var;
548 fprintf(fout, "\t%lg\t%lg\t%lg\t%lg\t%s", external_counts, external_count_var, uncertainty_var, dispersion_var, status_str);
549 }
550
551 fprintf(fout, "\n");
552 }
553 }
554
print_run_info(FILE * fout)555 void print_run_info(FILE* fout)
556 {
557 fprintf(fout, "param\tvalue\n");
558 fprintf(fout, "cmd_line\t%s\n", cmd_str.c_str());
559 fprintf(fout, "version\t%s\n", PACKAGE_VERSION);
560 fprintf(fout, "SVN_revision\t%s\n",SVN_REVISION);
561 fprintf(fout, "boost_version\t%d\n", BOOST_VERSION);
562 }
563
564
565 #if ENABLE_THREADS
566 boost::mutex inspect_lock;
567
568 boost::mutex _recorder_lock;
569 boost::mutex locus_thread_pool_lock;
570 int locus_curr_threads = 0;
571 int locus_num_threads = 0;
572
decr_pool_count()573 void decr_pool_count()
574 {
575 locus_thread_pool_lock.lock();
576 locus_curr_threads--;
577 locus_thread_pool_lock.unlock();
578 }
579
580 #endif
581
582
583
inspect_map_worker(ReplicatedBundleFactory & fac,int & tmp_min_frag_len,int & tmp_max_frag_len,IdToLocusMap & id_to_locus_map)584 void inspect_map_worker(ReplicatedBundleFactory& fac,
585 int& tmp_min_frag_len,
586 int& tmp_max_frag_len,
587 IdToLocusMap& id_to_locus_map)
588 {
589 #if ENABLE_THREADS
590 boost::this_thread::at_thread_exit(decr_pool_count);
591 #endif
592
593 int min_f = std::numeric_limits<int>::max();
594 int max_f = 0;
595
596 fac.inspect_replicate_maps(min_f, max_f, id_to_locus_map);
597
598 #if ENABLE_THREADS
599 inspect_lock.lock();
600 #endif
601 tmp_min_frag_len = min(min_f, tmp_min_frag_len);
602 tmp_max_frag_len = max(max_f, tmp_max_frag_len);
603 #if ENABLE_THREADS
604 inspect_lock.unlock();
605 #endif
606 }
607
learn_bias_worker(boost::shared_ptr<BundleFactory> fac)608 void learn_bias_worker(boost::shared_ptr<BundleFactory> fac)
609 {
610 #if ENABLE_THREADS
611 boost::this_thread::at_thread_exit(decr_pool_count);
612 #endif
613 boost::shared_ptr<ReadGroupProperties> rg_props = fac->read_group_properties();
614 BiasLearner* bl = new BiasLearner(rg_props->frag_len_dist());
615 learn_bias(*fac, *bl, false);
616 rg_props->bias_learner(boost::shared_ptr<BiasLearner>(bl));
617 }
618
619 typedef map<int, vector<AbundanceGroup> > light_ab_group_tracking_table;
620
621 // Similiar to TestLauncher, except this class records tracking data when abundance groups report in
622 struct AbundanceRecorder
623 {
624 private:
AbundanceRecorderAbundanceRecorder625 AbundanceRecorder(AbundanceRecorder& rhs) {}
626
627 public:
AbundanceRecorderAbundanceRecorder628 AbundanceRecorder(int num_samples,
629 Tracking* tracking,
630 ProgressBar* p_bar)
631 :
632 _orig_workers(num_samples),
633 _tracking(tracking),
634 _p_bar(p_bar)
635 {
636 }
637
638 void operator()();
639
640 void register_locus(int locus_id);
641 void abundance_avail(int locus_id,
642 boost::shared_ptr<SampleAbundances> ab,
643 size_t factory_id);
644 void record_finished_loci();
645 void record_tracking_data(int locus_id, vector<boost::shared_ptr<SampleAbundances> >& abundances);
646 bool all_samples_reported_in(vector<boost::shared_ptr<SampleAbundances> >& abundances);
647 bool all_samples_reported_in(int locus_id);
648
clear_tracking_dataAbundanceRecorder649 void clear_tracking_data() { _tracking->clear(); }
650
651 typedef list<pair<int, vector<boost::shared_ptr<SampleAbundances> > > > recorder_sample_table;
652
get_sample_tableAbundanceRecorder653 const light_ab_group_tracking_table& get_sample_table() const { return _ab_group_tracking_table; }
654
655 private:
656
657 recorder_sample_table::iterator find_locus(int locus_id);
658
659 int _orig_workers;
660
661 recorder_sample_table _samples;
662
663 Tracking* _tracking;
664 ProgressBar* _p_bar;
665
666 light_ab_group_tracking_table _ab_group_tracking_table;
667 };
668
669
find_locus(int locus_id)670 AbundanceRecorder::recorder_sample_table::iterator AbundanceRecorder::find_locus(int locus_id)
671 {
672 recorder_sample_table::iterator itr = _samples.begin();
673 for(; itr != _samples.end(); ++itr)
674 {
675 if (itr->first == locus_id)
676 return itr;
677 }
678 return _samples.end();
679 }
680
register_locus(int locus_id)681 void AbundanceRecorder::register_locus(int locus_id)
682 {
683 #if ENABLE_THREADS
684 boost::mutex::scoped_lock lock(_recorder_lock);
685 #endif
686
687 recorder_sample_table::iterator itr = find_locus(locus_id);
688 if (itr == _samples.end())
689 {
690 pair<recorder_sample_table::iterator, bool> p;
691 vector<boost::shared_ptr<SampleAbundances> >abs(_orig_workers);
692 _samples.push_back(make_pair(locus_id, abs));
693 }
694 }
695
abundance_avail(int locus_id,boost::shared_ptr<SampleAbundances> ab,size_t factory_id)696 void AbundanceRecorder::abundance_avail(int locus_id,
697 boost::shared_ptr<SampleAbundances> ab,
698 size_t factory_id)
699 {
700 #if ENABLE_THREADS
701 boost::mutex::scoped_lock lock(_recorder_lock);
702 #endif
703 recorder_sample_table::iterator itr = find_locus(locus_id);
704 if (itr == _samples.end())
705 {
706 assert(false);
707 }
708 itr->second[factory_id] = ab;
709 //itr->second(factory_id] = ab;
710 }
711
712 // Note: this routine should be called under lock - it doesn't
713 // acquire the lock itself.
all_samples_reported_in(vector<boost::shared_ptr<SampleAbundances>> & abundances)714 bool AbundanceRecorder::all_samples_reported_in(vector<boost::shared_ptr<SampleAbundances> >& abundances)
715 {
716 BOOST_FOREACH (boost::shared_ptr<SampleAbundances> ab, abundances)
717 {
718 if (!ab)
719 {
720 return false;
721 }
722 }
723 return true;
724 }
725
726
727 // Note: this routine should be called under lock - it doesn't
728 // acquire the lock itself.
record_tracking_data(int locus_id,vector<boost::shared_ptr<SampleAbundances>> & abundances)729 void AbundanceRecorder::record_tracking_data(int locus_id, vector<boost::shared_ptr<SampleAbundances> >& abundances)
730 {
731 assert (abundances.size() == _orig_workers);
732
733 // Just verify that all the loci from each factory match up.
734 for (size_t i = 1; i < abundances.size(); ++i)
735 {
736 const SampleAbundances& curr = *(abundances[i]);
737 const SampleAbundances& prev = *(abundances[i-1]);
738
739 assert (curr.locus_tag == prev.locus_tag);
740
741 const AbundanceGroup& s1 = curr.transcripts;
742 const AbundanceGroup& s2 = prev.transcripts;
743
744 assert (s1.abundances().size() == s2.abundances().size());
745
746 for (size_t j = 0; j < s1.abundances().size(); ++j)
747 {
748 assert (s1.abundances()[j]->description() == s2.abundances()[j]->description());
749 }
750 }
751
752 vector<AbundanceGroup> lightweight_ab_groups;
753
754 // Add all the transcripts, CDS groups, TSS groups, and genes to their
755 // respective FPKM tracking table. Whether this is a time series or an
756 // all pairs comparison, we should be calculating and reporting FPKMs for
757 // all objects in all samples
758 for (size_t i = 0; i < abundances.size(); ++i)
759 {
760 const AbundanceGroup& ab_group = abundances[i]->transcripts;
761 /*
762 //fprintf(stderr, "[%d] count = %lg\n",i, ab_group.num_fragments());
763 BOOST_FOREACH (boost::shared_ptr<Abundance> ab, ab_group.abundances())
764 {
765 add_to_tracking_table(i, *ab, _tracking->isoform_fpkm_tracking);
766 //assert (_tracking->isoform_fpkm_tracking.num_fragments_by_replicate().empty() == false);
767 }
768
769 BOOST_FOREACH (AbundanceGroup& ab, abundances[i]->cds)
770 {
771 add_to_tracking_table(i, ab, _tracking->cds_fpkm_tracking);
772 }
773
774 BOOST_FOREACH (AbundanceGroup& ab, abundances[i]->primary_transcripts)
775 {
776 add_to_tracking_table(i, ab, _tracking->tss_group_fpkm_tracking);
777 }
778
779 BOOST_FOREACH (AbundanceGroup& ab, abundances[i]->genes)
780 {
781 add_to_tracking_table(i, ab, _tracking->gene_fpkm_tracking);
782 }
783 */
784
785 abundances[i]->transcripts.clear_non_serialized_data();
786 lightweight_ab_groups.push_back(abundances[i]->transcripts);
787 }
788
789 if (_ab_group_tracking_table.find(locus_id) != _ab_group_tracking_table.end())
790 {
791 fprintf (stderr, "Error: locus %d is already recorded!\n", locus_id);
792 }
793 _ab_group_tracking_table[locus_id] = lightweight_ab_groups;
794 }
795
record_finished_loci()796 void AbundanceRecorder::record_finished_loci()
797 {
798 #if ENABLE_THREADS
799 boost::mutex::scoped_lock lock(_recorder_lock);
800 #endif
801
802 recorder_sample_table::iterator itr = _samples.begin();
803 while(itr != _samples.end())
804 {
805 if (all_samples_reported_in(itr->second))
806 {
807 // In some abundance runs, we don't actually want to perform testing
808 // (eg initial quantification before bias correction).
809 // _tests and _tracking will be NULL in these cases.
810 if (_tracking != NULL)
811 {
812 if (_p_bar)
813 {
814 verbose_msg("Estimating expression in locus [%s]\n", itr->second.front()->locus_tag.c_str());
815 _p_bar->update(itr->second.front()->locus_tag.c_str(), 1);
816 }
817 }
818 record_tracking_data(itr->first, itr->second);
819
820 // Removes the samples that have already been tested and transferred to the tracking tables,
821 itr = _samples.erase(itr);
822 }
823 else
824 {
825
826 ++itr;
827 }
828 }
829 }
830
831
832 boost::shared_ptr<AbundanceRecorder> abundance_recorder;
833
sample_worker(const RefSequenceTable & rt,ReplicatedBundleFactory & sample_factory,boost::shared_ptr<SampleAbundances> abundance,size_t factory_id,boost::shared_ptr<AbundanceRecorder> recorder,bool calculate_variance)834 void sample_worker(const RefSequenceTable& rt,
835 ReplicatedBundleFactory& sample_factory,
836 boost::shared_ptr<SampleAbundances> abundance,
837 size_t factory_id,
838 boost::shared_ptr<AbundanceRecorder> recorder,
839 bool calculate_variance)
840 {
841 #if ENABLE_THREADS
842 boost::this_thread::at_thread_exit(decr_pool_count);
843 #endif
844
845 boost::shared_ptr<HitBundle> bundle(new HitBundle);
846 bool non_empty = sample_factory.next_bundle(*bundle, true);
847
848 char bundle_label_buf[2048];
849 sprintf(bundle_label_buf,
850 "%s:%d-%d",
851 rt.get_name(bundle->ref_id()),
852 bundle->left(),
853 bundle->right());
854 string locus_tag = bundle_label_buf;
855
856 abundance->cluster_mass = bundle->mass();
857
858 recorder->register_locus(bundle->id());
859
860 abundance->locus_tag = locus_tag;
861
862 // if (rt.get_name(bundle->ref_id()) == "chr13_random")
863 // {
864 // int a = 5;
865 // }
866 if (!non_empty || (bias_run && bundle->ref_scaffolds().size() != 1)) // Only learn on single isoforms
867 {
868 //#if !ENABLE_THREADS
869 // If Cuffdiff was built without threads, we need to manually invoke
870 // the testing functor, which will check to see if all the workers
871 // are done, and if so, perform the cross sample testing.
872 recorder->abundance_avail(bundle->id(), abundance, factory_id);
873 recorder->record_finished_loci();
874 //launcher();
875 //#endif
876 return;
877 }
878
879 bool perform_cds_analysis = false;
880 bool perform_tss_analysis = false;
881
882 BOOST_FOREACH(boost::shared_ptr<Scaffold> s, bundle->ref_scaffolds())
883 {
884 if (s->annotated_tss_id() != "")
885 {
886 perform_tss_analysis = final_est_run;
887 }
888 if (s->annotated_protein_id() != "")
889 {
890 perform_cds_analysis = final_est_run;
891 }
892 }
893
894 set<boost::shared_ptr<ReadGroupProperties const> > rg_props;
895 for (size_t i = 0; i < sample_factory.factories().size(); ++i)
896 {
897 boost::shared_ptr<BundleFactory> bf = sample_factory.factories()[i];
898 rg_props.insert(bf->read_group_properties());
899 }
900
901 sample_abundance_worker(boost::cref(locus_tag),
902 boost::cref(rg_props),
903 boost::ref(*abundance),
904 bundle,
905 perform_cds_analysis,
906 perform_tss_analysis,
907 calculate_variance);
908
909 ///////////////////////////////////////////////
910
911
912 BOOST_FOREACH(boost::shared_ptr<Scaffold> ref_scaff, bundle->ref_scaffolds())
913 {
914 ref_scaff->clear_hits();
915 }
916
917 recorder->abundance_avail(bundle->id(), abundance, factory_id);
918 recorder->record_finished_loci();
919 }
920
quantitate_next_locus(const RefSequenceTable & rt,vector<boost::shared_ptr<ReplicatedBundleFactory>> & bundle_factories,boost::shared_ptr<AbundanceRecorder> recorder)921 bool quantitate_next_locus(const RefSequenceTable& rt,
922 vector<boost::shared_ptr<ReplicatedBundleFactory> >& bundle_factories,
923 boost::shared_ptr<AbundanceRecorder> recorder)
924 {
925 for (size_t i = 0; i < bundle_factories.size(); ++i)
926 {
927 boost::shared_ptr<SampleAbundances> s_ab = boost::shared_ptr<SampleAbundances>(new SampleAbundances);
928
929 #if ENABLE_THREADS
930 while(1)
931 {
932 locus_thread_pool_lock.lock();
933 if (locus_curr_threads < locus_num_threads)
934 {
935 break;
936 }
937
938 locus_thread_pool_lock.unlock();
939
940 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
941
942 }
943
944 locus_curr_threads++;
945 locus_thread_pool_lock.unlock();
946
947 boost::thread quantitate(sample_worker,
948 boost::ref(rt),
949 boost::ref(*(bundle_factories[i])),
950 s_ab,
951 i,
952 recorder,
953 false);
954 #else
955 sample_worker(boost::ref(rt),
956 boost::ref(*(bundle_factories[i])),
957 s_ab,
958 i,
959 recorder,
960 false);
961 #endif
962 }
963 return true;
964 }
965
parse_norm_standards_file(FILE * norm_standards_file)966 void parse_norm_standards_file(FILE* norm_standards_file)
967 {
968 char pBuf[10 * 1024];
969 size_t non_blank_lines_read = 0;
970
971 boost::shared_ptr<map<string, LibNormStandards> > norm_standards(new map<string, LibNormStandards>);
972
973 while (fgets(pBuf, 10*1024, norm_standards_file))
974 {
975 if (strlen(pBuf) > 0)
976 {
977 char* nl = strchr(pBuf, '\n');
978 if (nl)
979 *nl = 0;
980
981 string pBufstr = pBuf;
982 string trimmed = boost::trim_copy(pBufstr);
983
984 if (trimmed.length() > 0 && trimmed[0] != '#')
985 {
986 non_blank_lines_read++;
987 vector<string> columns;
988 tokenize(trimmed, "\t", columns);
989
990 if (non_blank_lines_read == 1)
991 continue;
992
993 if (columns.size() < 1) //
994 {
995 continue;
996 }
997
998 string gene_id = columns[0];
999 LibNormStandards L;
1000 norm_standards->insert(make_pair(gene_id, L));
1001 }
1002 }
1003 }
1004 lib_norm_standards = norm_standards;
1005 }
1006
1007 boost::shared_ptr<AbundanceRecorder> abx_recorder;
1008
driver(const std::string & ref_gtf_filename,const std::string & mask_gtf_filename,FILE * norm_standards_file,vector<string> & sam_hit_filename_lists,Outfiles & outfiles)1009 void driver(const std::string& ref_gtf_filename, const std::string& mask_gtf_filename, FILE* norm_standards_file, vector<string>& sam_hit_filename_lists, Outfiles& outfiles)
1010 {
1011
1012 FILE* ref_gtf = NULL;
1013 if (ref_gtf_filename != "")
1014 {
1015 ref_gtf = fopen(ref_gtf_filename.c_str(), "r");
1016 if (!ref_gtf) // we actually already did this check, leave this code here in case we remove the upstream one
1017 {
1018 fprintf(stderr, "Error: cannot open reference GTF file %s for reading\n",
1019 ref_gtf_filename.c_str());
1020 exit(1);
1021 }
1022 }
1023
1024 FILE* mask_gtf = NULL;
1025 if (mask_gtf_filename != "")
1026 {
1027 mask_gtf = fopen(mask_gtf_filename.c_str(), "r");
1028 if (!mask_gtf) // we actually already did this check, leave this code here in case we remove the upstream one
1029 {
1030 fprintf(stderr, "Error: cannot open mask GTF file %s for reading\n",
1031 mask_gtf_filename.c_str());
1032 exit(1);
1033 }
1034 }
1035
1036 ReadTable it;
1037 RefSequenceTable rt(true, false);
1038
1039 vector<boost::shared_ptr<Scaffold> > ref_mRNAs;
1040
1041 vector<boost::shared_ptr<ReplicatedBundleFactory> > bundle_factories;
1042 vector<boost::shared_ptr<ReadGroupProperties> > all_read_groups;
1043 vector<boost::shared_ptr<HitFactory> > all_hit_factories;
1044
1045 for (size_t i = 0; i < sam_hit_filename_lists.size(); ++i)
1046 {
1047 vector<string> sam_hit_filenames;
1048 tokenize(sam_hit_filename_lists[i], ",", sam_hit_filenames);
1049
1050 vector<boost::shared_ptr<BundleFactory> > replicate_factories;
1051
1052 string condition_name = sample_labels[i];
1053
1054 for (size_t j = 0; j < sam_hit_filenames.size(); ++j)
1055 {
1056 boost::shared_ptr<HitFactory> hs(createSamHitFactory(sam_hit_filenames[j], it, rt));
1057
1058 all_hit_factories.push_back(hs);
1059
1060 boost::shared_ptr<BundleFactory> hf(new BundleFactory(hs, REF_DRIVEN));
1061 boost::shared_ptr<ReadGroupProperties> rg_props(new ReadGroupProperties);
1062
1063 if (global_read_properties)
1064 {
1065 *rg_props = *global_read_properties;
1066 }
1067 else
1068 {
1069 *rg_props = hs->read_group_properties();
1070 }
1071
1072 rg_props->condition_name(condition_name);
1073 rg_props->replicate_num(j);
1074 rg_props->file_path(sam_hit_filenames[j]);
1075
1076 all_read_groups.push_back(rg_props);
1077
1078 hf->read_group_properties(rg_props);
1079
1080 replicate_factories.push_back(hf);
1081 //replicate_factories.back()->set_ref_rnas(ref_mRNAs);
1082 }
1083
1084 bundle_factories.push_back(boost::shared_ptr<ReplicatedBundleFactory>(new ReplicatedBundleFactory(replicate_factories, condition_name)));
1085 }
1086
1087 boost::crc_32_type ref_gtf_crc_result;
1088 ::load_ref_rnas(ref_gtf, rt, ref_mRNAs, ref_gtf_crc_result, corr_bias, false);
1089
1090 // for (size_t i = 0; i < ref_mRNAs.size(); ++i)
1091 // {
1092 // boost::shared_ptr<Scaffold> s = ref_mRNAs[i];
1093 // if (s->annotated_gene_id() == "ENSG00000268467.1")
1094 // {
1095 // int a = 4;
1096 // }
1097 // }
1098 //
1099 if (ref_mRNAs.empty())
1100 return;
1101
1102 boost::crc_32_type mask_gtf_crc_result;
1103 vector<boost::shared_ptr<Scaffold> > mask_rnas;
1104 if (mask_gtf)
1105 {
1106 ::load_ref_rnas(mask_gtf, rt, mask_rnas, mask_gtf_crc_result, false, false);
1107 }
1108
1109 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
1110 {
1111 fac->set_ref_rnas(ref_mRNAs);
1112 if (mask_gtf)
1113 fac->set_mask_rnas(mask_rnas);
1114 }
1115
1116 if (norm_standards_file != NULL)
1117 {
1118 parse_norm_standards_file(norm_standards_file);
1119 }
1120
1121 for (size_t i = 0; i < all_read_groups.size(); ++i)
1122 {
1123 all_read_groups[i]->collect_checked_parameters();
1124 all_read_groups[i]->ref_gtf(ref_gtf_filename, ref_gtf_crc_result);
1125 all_read_groups[i]->mask_gtf(mask_gtf_filename, mask_gtf_crc_result);
1126 }
1127
1128 #if ENABLE_THREADS
1129 locus_num_threads = num_threads;
1130 #endif
1131
1132 dispersion_method = POISSON;
1133
1134 int tmp_min_frag_len = numeric_limits<int>::max();
1135 int tmp_max_frag_len = 0;
1136
1137 IdToLocusMap id_to_locus_map(boost::shared_ptr<map<string, set<string> > >(new map<string, set<string> >()));
1138
1139 ProgressBar p_bar("Inspecting maps and determining fragment length distributions.",0);
1140 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
1141 {
1142
1143 #if ENABLE_THREADS
1144 while(1)
1145 {
1146 locus_thread_pool_lock.lock();
1147 if (locus_curr_threads < locus_num_threads)
1148 {
1149 break;
1150 }
1151
1152 locus_thread_pool_lock.unlock();
1153
1154 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1155 }
1156
1157 locus_curr_threads++;
1158 locus_thread_pool_lock.unlock();
1159
1160 boost::thread inspect(inspect_map_worker,
1161 boost::ref(*fac),
1162 boost::ref(tmp_min_frag_len),
1163 boost::ref(tmp_max_frag_len),
1164 boost::ref(id_to_locus_map));
1165 #else
1166 inspect_map_worker(boost::ref(*fac),
1167 boost::ref(tmp_min_frag_len),
1168 boost::ref(tmp_max_frag_len),
1169 boost::ref(id_to_locus_map));
1170 #endif
1171 }
1172
1173 // wait for the workers to finish up before reporting everthing.
1174 #if ENABLE_THREADS
1175 while(1)
1176 {
1177 locus_thread_pool_lock.lock();
1178 if (locus_curr_threads == 0)
1179 {
1180 locus_thread_pool_lock.unlock();
1181 break;
1182 }
1183 locus_thread_pool_lock.unlock();
1184
1185 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1186 }
1187 #endif
1188
1189 normalize_counts(all_read_groups);
1190
1191 for (size_t i = 0; i < all_read_groups.size(); ++i)
1192 {
1193 boost::shared_ptr<ReadGroupProperties> rg = all_read_groups[i];
1194 fprintf(stderr, "> Map Properties:\n");
1195
1196 fprintf(stderr, ">\tNormalized Map Mass: %.2Lf\n", rg->normalized_map_mass());
1197 fprintf(stderr, ">\tRaw Map Mass: %.2Lf\n", rg->total_map_mass());
1198 if (corr_multi)
1199 fprintf(stderr,">\tNumber of Multi-Reads: %zu (with %zu total hits)\n", rg->multi_read_table()->num_multireads(), rg->multi_read_table()->num_multihits());
1200
1201 if (rg->frag_len_dist()->source() == LEARNED)
1202 {
1203 fprintf(stderr, ">\tFragment Length Distribution: Empirical (learned)\n");
1204 fprintf(stderr, ">\t Estimated Mean: %.2f\n", rg->frag_len_dist()->mean());
1205 fprintf(stderr, ">\t Estimated Std Dev: %.2f\n", rg->frag_len_dist()->std_dev());
1206 }
1207 else
1208 {
1209 if (rg->frag_len_dist()->source() == USER)
1210 fprintf(stderr, ">\tFragment Length Distribution: Truncated Gaussian (user-specified)\n");
1211 else //rg->frag_len_dist()->source == FLD::DEFAULT
1212 fprintf(stderr, ">\tFragment Length Distribution: Truncated Gaussian (default)\n");
1213 fprintf(stderr, ">\t Default Mean: %d\n", def_frag_len_mean);
1214 fprintf(stderr, ">\t Default Std Dev: %d\n", def_frag_len_std_dev);
1215 }
1216 }
1217
1218 long double total_norm_mass = 0.0;
1219 long double total_mass = 0.0;
1220 BOOST_FOREACH (boost::shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
1221 {
1222 total_norm_mass += rg_props->normalized_map_mass();
1223 total_mass += rg_props->total_map_mass();
1224 }
1225
1226 min_frag_len = tmp_min_frag_len;
1227 max_frag_len = tmp_max_frag_len;
1228
1229 final_est_run = false;
1230
1231 double num_bundles = (double)bundle_factories[0]->num_bundles();
1232
1233 p_bar = ProgressBar("Calculating preliminary abundance estimates", num_bundles);
1234
1235 Tracking tracking;
1236
1237 abundance_recorder = boost::shared_ptr<AbundanceRecorder>(new AbundanceRecorder(bundle_factories.size(), &tracking, &p_bar));
1238
1239 if (model_mle_error || corr_bias || corr_multi) // Only run initial estimation if correcting bias or multi-reads
1240 {
1241 while (1)
1242 {
1243 boost::shared_ptr<vector<boost::shared_ptr<SampleAbundances> > > abundances(new vector<boost::shared_ptr<SampleAbundances> >());
1244 quantitate_next_locus(rt, bundle_factories, abundance_recorder);
1245 bool more_loci_remain = false;
1246 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1247 {
1248 if (rep_fac->bundles_remain())
1249 {
1250 more_loci_remain = true;
1251 break;
1252 }
1253 }
1254
1255 if (!more_loci_remain)
1256 {
1257 // wait for the workers to finish up before breaking out.
1258 #if ENABLE_THREADS
1259 while(1)
1260 {
1261 locus_thread_pool_lock.lock();
1262 if (locus_curr_threads == 0)
1263 {
1264 locus_thread_pool_lock.unlock();
1265 break;
1266 }
1267
1268 locus_thread_pool_lock.unlock();
1269
1270 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1271
1272 }
1273 #endif
1274 break;
1275 }
1276 }
1277
1278 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1279 {
1280 rep_fac->reset();
1281 }
1282
1283 p_bar.complete();
1284 }
1285 if (corr_bias)
1286 {
1287 bias_run = true;
1288 p_bar = ProgressBar("Learning bias parameters.", 0);
1289 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1290 {
1291 BOOST_FOREACH (boost::shared_ptr<BundleFactory> fac, rep_fac->factories())
1292 {
1293 #if ENABLE_THREADS
1294 while(1)
1295 {
1296 locus_thread_pool_lock.lock();
1297 if (locus_curr_threads < locus_num_threads)
1298 {
1299 break;
1300 }
1301
1302 locus_thread_pool_lock.unlock();
1303
1304 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1305 }
1306 locus_curr_threads++;
1307 locus_thread_pool_lock.unlock();
1308
1309 boost::thread bias(learn_bias_worker, fac);
1310 #else
1311 learn_bias_worker(fac);
1312 #endif
1313 }
1314 }
1315
1316 // wait for the workers to finish up before reporting everthing.
1317 #if ENABLE_THREADS
1318 while(1)
1319 {
1320 locus_thread_pool_lock.lock();
1321 if (locus_curr_threads == 0)
1322 {
1323 locus_thread_pool_lock.unlock();
1324 break;
1325 }
1326
1327 locus_thread_pool_lock.unlock();
1328
1329 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1330 }
1331 #endif
1332 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1333 {
1334 rep_fac->reset();
1335 }
1336 bias_run = false;
1337 }
1338
1339 // Allow the multiread tables to do their thing...
1340 BOOST_FOREACH (boost::shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
1341 {
1342 rg_props->multi_read_table()->valid_mass(true);
1343 }
1344
1345
1346 abundance_recorder->clear_tracking_data();
1347
1348 abundance_recorder = boost::shared_ptr<AbundanceRecorder>(new AbundanceRecorder(bundle_factories.size(), &tracking, &p_bar));
1349
1350 final_est_run = true;
1351 p_bar = ProgressBar("Quantifying expression levels in locus.", num_bundles);
1352
1353 while (true)
1354 {
1355 //boost::shared_ptr<vector<boost::shared_ptr<SampleAbundances> > > abundances(new vector<boost::shared_ptr<SampleAbundances> >());
1356 quantitate_next_locus(rt, bundle_factories, abundance_recorder);
1357 bool more_loci_remain = false;
1358 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1359 {
1360 if (rep_fac->bundles_remain())
1361 {
1362 more_loci_remain = true;
1363 break;
1364 }
1365 }
1366 if (!more_loci_remain)
1367 {
1368 // wait for the workers to finish up before doing the cross-sample testing.
1369 #if ENABLE_THREADS
1370 while(1)
1371 {
1372 locus_thread_pool_lock.lock();
1373 if (locus_curr_threads == 0)
1374 {
1375 locus_thread_pool_lock.unlock();
1376 break;
1377 }
1378
1379 locus_thread_pool_lock.unlock();
1380
1381 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1382
1383 }
1384 #endif
1385 break;
1386 }
1387 }
1388
1389 p_bar.complete();
1390
1391
1392 string expression_cxb_filename = output_dir + "/abundances.cxb";
1393 std::ofstream ofs(expression_cxb_filename.c_str());
1394 boost::archive::binary_oarchive oa(ofs);
1395
1396 vector< pair<int, AbundanceGroup> > single_sample_tracking;
1397
1398 const light_ab_group_tracking_table& sample_table = abundance_recorder->get_sample_table();
1399 for (light_ab_group_tracking_table::const_iterator itr = sample_table.begin(); itr != sample_table.end(); ++itr)
1400 {
1401
1402 assert (itr->second.size() == 1);
1403 single_sample_tracking.push_back(make_pair(itr->first, itr->second[0]));
1404 }
1405
1406 std::sort(single_sample_tracking.begin(), single_sample_tracking.end(),
1407 boost::bind(&std::pair<int, AbundanceGroup>::first, _1) <
1408 boost::bind(&std::pair<int, AbundanceGroup>::first, _2));
1409
1410 size_t num_loci = single_sample_tracking.size();
1411 oa << num_loci;
1412
1413
1414
1415 for (int i = 0; i < single_sample_tracking.size(); ++i)
1416 {
1417 oa << single_sample_tracking[i];
1418 }
1419
1420 // // FPKM tracking
1421 //
1422 // FILE* fiso_fpkm_tracking = outfiles.isoform_fpkm_tracking_out;
1423 // fprintf(stderr, "Writing isoform-level FPKM tracking\n");
1424 // print_FPKM_tracking(fiso_fpkm_tracking,tracking.isoform_fpkm_tracking);
1425 //
1426 // FILE* ftss_fpkm_tracking = outfiles.tss_group_fpkm_tracking_out;
1427 // fprintf(stderr, "Writing TSS group-level FPKM tracking\n");
1428 // print_FPKM_tracking(ftss_fpkm_tracking,tracking.tss_group_fpkm_tracking);
1429 //
1430 // FILE* fgene_fpkm_tracking = outfiles.gene_fpkm_tracking_out;
1431 // fprintf(stderr, "Writing gene-level FPKM tracking\n");
1432 // print_FPKM_tracking(fgene_fpkm_tracking,tracking.gene_fpkm_tracking);
1433 //
1434 // FILE* fcds_fpkm_tracking = outfiles.cds_fpkm_tracking_out;
1435 // fprintf(stderr, "Writing CDS-level FPKM tracking\n");
1436 // print_FPKM_tracking(fcds_fpkm_tracking,tracking.cds_fpkm_tracking);
1437 //
1438 // // Count tracking
1439 //
1440 // FILE* fiso_count_tracking = outfiles.isoform_count_tracking_out;
1441 // fprintf(stderr, "Writing isoform-level count tracking\n");
1442 // print_count_tracking(fiso_count_tracking,tracking.isoform_fpkm_tracking);
1443 //
1444 // FILE* ftss_count_tracking = outfiles.tss_group_count_tracking_out;
1445 // fprintf(stderr, "Writing TSS group-level count tracking\n");
1446 // print_count_tracking(ftss_count_tracking,tracking.tss_group_fpkm_tracking);
1447 //
1448 // FILE* fgene_count_tracking = outfiles.gene_count_tracking_out;
1449 // fprintf(stderr, "Writing gene-level count tracking\n");
1450 // print_count_tracking(fgene_count_tracking,tracking.gene_fpkm_tracking);
1451 //
1452 // FILE* fcds_count_tracking = outfiles.cds_count_tracking_out;
1453 // fprintf(stderr, "Writing CDS-level count tracking\n");
1454 // print_count_tracking(fcds_count_tracking,tracking.cds_fpkm_tracking);
1455 //
1456 // // Run info
1457 // FILE* frun_info = outfiles.run_info_out;
1458 // fprintf(stderr, "Writing run info\n");
1459 // print_run_info(frun_info);
1460 }
1461
main(int argc,char ** argv)1462 int main(int argc, char** argv)
1463 {
1464 // boost::serialization::void_cast_register<TranscriptAbundance, Abundance>(
1465 // static_cast<TranscriptAbundance *>(NULL),
1466 // static_cast<Abundance *>(NULL)
1467 // );
1468
1469 for (int i = 0; i < argc; ++i)
1470 {
1471 cmd_str += string(argv[i]) + " ";
1472 }
1473
1474 init_library_table();
1475 init_dispersion_method_table();
1476 init_lib_norm_method_table();
1477
1478 min_isoform_fraction = 0;
1479
1480 int parse_ret = parse_options(argc,argv);
1481 if (parse_ret)
1482 return parse_ret;
1483
1484 if (!use_total_mass && !use_compat_mass)
1485 {
1486 use_total_mass = false;
1487 use_compat_mass = true;
1488 }
1489
1490 if(optind >= argc)
1491 {
1492 print_usage();
1493 return 1;
1494 }
1495
1496 if (!no_update_check)
1497 check_version(PACKAGE_VERSION);
1498
1499 string ref_gtf_filename = argv[optind++];
1500 vector<string> sam_hit_filenames;
1501
1502 if(optind < argc)
1503 {
1504 string sam_hits_file_name = argv[optind++];
1505 sam_hit_filenames.push_back(sam_hits_file_name);
1506 }
1507
1508 if (sample_labels.size() == 0)
1509 {
1510 for (size_t i = 1; i < sam_hit_filenames.size() + 1; ++i)
1511 {
1512 char buf[256];
1513 sprintf(buf, "q%lu", i);
1514 sample_labels.push_back(buf);
1515 }
1516 }
1517
1518 while (sam_hit_filenames.size() < 1)
1519 {
1520 fprintf(stderr, "Error: cuffquant requires exactly 1 SAM/BAM file\n");
1521 exit(1);
1522 }
1523
1524
1525 if (sam_hit_filenames.size() != sample_labels.size())
1526 {
1527 fprintf(stderr, "Error: number of labels must match number of conditions\n");
1528 exit(1);
1529 }
1530
1531 if (random_seed == -1)
1532 random_seed = boost::mt19937::default_seed;
1533
1534 // seed the random number generator - we'll need it for the importance
1535 // sampling during MAP estimation of the gammas
1536 srand48(random_seed);
1537
1538 FILE* ref_gtf = NULL;
1539 if (ref_gtf_filename != "")
1540 {
1541 ref_gtf = fopen(ref_gtf_filename.c_str(), "r");
1542 if (!ref_gtf)
1543 {
1544 fprintf(stderr, "Error: cannot open reference GTF file %s for reading\n",
1545 ref_gtf_filename.c_str());
1546 exit(1);
1547 }
1548 }
1549
1550 FILE* mask_gtf = NULL;
1551 if (mask_gtf_filename != "")
1552 {
1553 mask_gtf = fopen(mask_gtf_filename.c_str(), "r");
1554 if (!mask_gtf)
1555 {
1556 fprintf(stderr, "Error: cannot open mask GTF file %s for reading\n",
1557 mask_gtf_filename.c_str());
1558 exit(1);
1559 }
1560 }
1561
1562 FILE* norm_standards_file = NULL;
1563 if (norm_standards_filename != "")
1564 {
1565 norm_standards_file = fopen(norm_standards_filename.c_str(), "r");
1566 if (!norm_standards_file)
1567 {
1568 fprintf(stderr, "Error: cannot open contrast file %s for reading\n",
1569 norm_standards_filename.c_str());
1570 exit(1);
1571 }
1572 }
1573
1574
1575 // Note: we don't want the assembly filters interfering with calculations
1576 // here
1577
1578 pre_mrna_fraction = 0.0;
1579 olap_radius = 0;
1580
1581 Outfiles outfiles;
1582
1583 if (output_dir != "")
1584 {
1585 int retcode = mkpath(output_dir.c_str(), 0777);
1586 if (retcode == -1)
1587 {
1588 if (errno != EEXIST)
1589 {
1590 fprintf (stderr,
1591 "Error: cannot create directory %s\n",
1592 output_dir.c_str());
1593 exit(1);
1594 }
1595 }
1596 }
1597
1598 static const int filename_buf_size = 2048;
1599
1600 char out_file_prefix[filename_buf_size];
1601 sprintf(out_file_prefix, "%s/", output_dir.c_str());
1602
1603 driver(ref_gtf_filename, mask_gtf_filename, norm_standards_file, sam_hit_filenames, outfiles);
1604
1605 return 0;
1606 }
1607