1 /*
2 * cuffdiff.cpp
3 * cufflinks
4 *
5 * Created by Cole Trapnell on 10/21/09.
6 * Copyright 2009 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #include <stdlib.h>
11 #include <getopt.h>
12 #include <string>
13 #include <numeric>
14 #include <cfloat>
15 #include <iostream>
16
17 #include "common.h"
18 #include "hits.h"
19 #include "bundles.h"
20 #include "abundances.h"
21 #include "tokenize.h"
22 #include "biascorrection.h"
23 #include "update_check.h"
24
25 #include <boost/thread.hpp>
26 #include <boost/version.hpp>
27 #include <boost/graph/adjacency_list.hpp>
28 #include <boost/graph/graph_traits.hpp>
29 #include <boost/numeric/ublas/matrix.hpp>
30 #include <boost/numeric/ublas/matrix_proxy.hpp>
31 #include <boost/numeric/ublas/vector.hpp>
32 #include <boost/numeric/ublas/vector_proxy.hpp>
33 #include <boost/numeric/ublas/io.hpp>
34 #include <boost/algorithm/string.hpp>
35
36 #include <boost/random/mersenne_twister.hpp>
37
38 #include "differential.h"
39
40 extern "C" {
41 #include "locfit/local.h"
42 }
43
44 // Need at least this many reads in a locus to do any testing on it
45
46 vector<string> sample_labels;
47
48 double FDR = 0.05;
49 bool samples_are_time_series = false;
50 using namespace std;
51 using namespace boost;
52
53 // We leave out the short codes for options that don't take an argument
54 #if ENABLE_THREADS
55 const char *short_options = "m:p:s:c:I:j:L:M:o:b:TNqvuF:C:";
56 #else
57 const char *short_options = "m:s:c:I:j:L:M:o:b:TNqvuF:C:";
58 #endif
59
60 static struct option long_options[] = {
61 {"labels", required_argument, 0, 'L'},
62 {"seed", required_argument, 0, OPT_RANDOM_SEED},
63 {"norm-standards-file", required_argument, 0, OPT_NORM_STANDARDS_FILE},
64 {"use-sample-sheet", no_argument, 0, OPT_USE_SAMPLE_SHEET},
65 {"output-dir", required_argument, 0, 'o'},
66 {"verbose", no_argument, 0, 'v'},
67 {"quiet", no_argument, 0, 'q'},
68 #if ENABLE_THREADS
69 {"num-threads", required_argument, 0, 'p'},
70 #endif
71 {"library-type", required_argument, 0, OPT_LIBRARY_TYPE},
72 {"no-update-check", no_argument, 0, OPT_NO_UPDATE_CHECK},
73 {"compatible-hits-norm", no_argument, 0, OPT_USE_COMPAT_MASS},
74 {"total-hits-norm", no_argument, 0, OPT_USE_TOTAL_MASS},
75
76 // Some options for testing different stats policies
77 {"max-bundle-frags", required_argument, 0, OPT_MAX_FRAGS_PER_BUNDLE},
78 {"library-norm-method", required_argument, 0, OPT_LIB_NORM_METHOD},
79 {"output-format", required_argument, 0, OPT_OUTPUT_FORMAT},
80 {0, 0, 0, 0} // terminator
81 };
82
print_usage()83 void print_usage()
84 {
85 fprintf(stderr, "cuffnorm v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
86 fprintf(stderr, "-----------------------------\n");
87
88 //NOTE: SPACES ONLY, bozo
89 fprintf(stderr, "Usage: cuffnorm [options] <transcripts.gtf> <sample1_expr.cxb> <sample2_expr.cxb> [... sampleN_expr.cxb]\n");
90 fprintf(stderr, " Supply replicate CXB files as comma separated lists for each condition: sample1_rep1.cxb,sample1_rep2.cxb,...sample1_repM.cxb\n");
91 fprintf(stderr, "General Options:\n");
92 fprintf(stderr, " -o/--output-dir write all output files to this directory [ default: ./ ]\n");
93 fprintf(stderr, " -L/--labels comma-separated list of condition labels\n");
94 fprintf(stderr, " --norm-standards-file Housekeeping/spike genes to normalize libraries [ default: NULL ]\n"); // NOT YET DOCUMENTED, keep secret for now
95 #if ENABLE_THREADS
96 fprintf(stderr, " -p/--num-threads number of threads used during quantification [ default: 1 ]\n");
97 #endif
98 fprintf(stderr, " --library-type Library prep used for input reads [ default: below ]\n");
99 fprintf(stderr, " --library-norm-method Method used to normalize library sizes [ default: below ]\n");
100 fprintf(stderr, " --output-format Format for output tables [ default: below ]\n");
101
102 fprintf(stderr, "\nAdvanced Options:\n");
103 fprintf(stderr, " --compatible-hits-norm count hits compatible with reference RNAs only [ default: TRUE ]\n");
104 fprintf(stderr, " --total-hits-norm count all hits for normalization [ default: FALSE ]\n");
105 fprintf(stderr, " -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]\n");
106 fprintf(stderr, " -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]\n");
107 fprintf(stderr, " --seed value of random number generator seed [ default: 0 ]\n");
108 fprintf(stderr, " --no-update-check do not contact server to check for update availability[ default: FALSE ]\n");
109 print_library_table();
110 print_lib_norm_method_table();
111 print_output_format_table();
112 }
113
parse_options(int argc,char ** argv)114 int parse_options(int argc, char** argv)
115 {
116 int option_index = 0;
117 int next_option;
118 string sample_label_list;
119 string dispersion_method_str;
120 string lib_norm_method_str;
121 string output_format_str;
122
123 do {
124 next_option = getopt_long_only(argc, argv, short_options, long_options, &option_index);
125 if (next_option == -1) /* Done with options. */
126 break;
127 switch (next_option) {
128 case 0:
129 /* If this option set a flag, do nothing else now. */
130 if (long_options[option_index].flag != 0)
131 break;
132 break;
133
134 case 'p':
135 num_threads = (uint32_t)parseInt(1, "-p/--num-threads arg must be at least 1", print_usage);
136 break;
137 case 'F':
138 min_isoform_fraction = parseFloat(0, 1.0, "-F/--min-isoform-fraction must be between 0 and 1.0", print_usage);
139 break;
140 case 'L':
141 sample_label_list = optarg;
142 break;
143
144
145 case OPT_NORM_STANDARDS_FILE:
146 {
147 norm_standards_filename = optarg;
148 break;
149 }
150 case OPT_USE_SAMPLE_SHEET:
151 {
152 use_sample_sheet = true;
153 break;
154 }
155 case 'v':
156 {
157 if (cuff_quiet)
158 {
159 fprintf(stderr, "Warning: Can't be both verbose and quiet! Setting verbose only.\n");
160 }
161 cuff_quiet = false;
162 cuff_verbose = true;
163 break;
164 }
165 case 'q':
166 {
167 if (cuff_verbose)
168 {
169 fprintf(stderr, "Warning: Can't be both verbose and quiet! Setting quiet only.\n");
170 }
171 cuff_verbose = false;
172 cuff_quiet = true;
173 break;
174 }
175 case 'o':
176 {
177 output_dir = optarg;
178 break;
179 }
180
181
182 case OPT_LIBRARY_TYPE:
183 {
184 library_type = optarg;
185 break;
186 }
187
188 case OPT_NO_UPDATE_CHECK:
189 {
190 no_update_check = true;
191 break;
192 }
193 case OPT_RANDOM_SEED:
194 {
195 random_seed = parseInt(0, "--seed must be at least 0", print_usage);
196 break;
197 }
198 case OPT_USE_COMPAT_MASS:
199 {
200 use_compat_mass = true;
201 break;
202 }
203 case OPT_USE_TOTAL_MASS:
204 {
205 use_total_mass = true;
206 break;
207 }
208 case OPT_LIB_NORM_METHOD:
209 {
210 lib_norm_method_str = optarg;
211 break;
212 }
213 case OPT_OUTPUT_FORMAT:
214 {
215 output_format_str = optarg;
216 break;
217 }
218 default:
219 print_usage();
220 return 1;
221 }
222 } while(next_option != -1);
223
224 if (library_type != "")
225 {
226 map<string, ReadGroupProperties>::iterator lib_itr =
227 library_type_table.find(library_type);
228 if (lib_itr == library_type_table.end())
229 {
230 fprintf(stderr, "Error: Library type %s not supported\n", library_type.c_str());
231 exit(1);
232 }
233 else
234 {
235 if (library_type == "transfrags")
236 {
237 allow_junk_filtering = false;
238 }
239 global_read_properties = &lib_itr->second;
240 }
241 }
242 else
243 {
244
245 }
246
247 // Set the count dispersion method to use
248 if (dispersion_method_str == "")
249 {
250 dispersion_method_str = default_dispersion_method;
251 }
252
253 map<string, DispersionMethod>::iterator disp_itr =
254 dispersion_method_table.find(dispersion_method_str);
255 if (disp_itr == dispersion_method_table.end())
256 {
257 fprintf(stderr, "Error: Dispersion method %s not supported\n", dispersion_method_str.c_str());
258 exit(1);
259 }
260 else
261 {
262 dispersion_method = disp_itr->second;
263 }
264
265 // Set the library size normalization method to use
266 if (lib_norm_method_str == "")
267 {
268 lib_norm_method_str = default_lib_norm_method;
269 }
270
271 map<string, LibNormalizationMethod>::iterator lib_norm_itr =
272 lib_norm_method_table.find(lib_norm_method_str);
273 if (lib_norm_itr == lib_norm_method_table.end())
274 {
275 fprintf(stderr, "Error: Normalization method %s not supported\n", lib_norm_method_str.c_str());
276 exit(1);
277 }
278 else
279 {
280 lib_norm_method = lib_norm_itr->second;
281 }
282
283 // Set the count dispersion method to use
284 if (output_format_str == "")
285 {
286 output_format_str = default_output_format;
287 }
288
289 map<string, OutputFormat>::iterator output_itr =
290 output_format_table.find(output_format_str);
291 if (output_itr == output_format_table.end())
292 {
293 fprintf(stderr, "Error: Output format %s not supported\n", output_format_str.c_str());
294 exit(1);
295 }
296 else
297 {
298 output_format = output_itr->second;
299 }
300
301 if (use_total_mass && use_compat_mass)
302 {
303 fprintf (stderr, "Error: please supply only one of --compatibile-hits-norm and --total-hits-norm\n");
304 exit(1);
305 }
306
307 tokenize(sample_label_list, ",", sample_labels);
308
309 allow_junk_filtering = false;
310
311 return 0;
312 }
313
314 #if ENABLE_THREADS
315 boost::mutex inspect_lock;
316 #endif
317
inspect_map_worker(ReplicatedBundleFactory & fac,int & tmp_min_frag_len,int & tmp_max_frag_len,IdToLocusMap & id_to_locus_map)318 void inspect_map_worker(ReplicatedBundleFactory& fac,
319 int& tmp_min_frag_len,
320 int& tmp_max_frag_len,
321 IdToLocusMap& id_to_locus_map)
322 {
323 #if ENABLE_THREADS
324 boost::this_thread::at_thread_exit(decr_pool_count);
325 #endif
326
327 int min_f = std::numeric_limits<int>::max();
328 int max_f = 0;
329
330 fac.inspect_replicate_maps(min_f, max_f, id_to_locus_map);
331
332 #if ENABLE_THREADS
333 inspect_lock.lock();
334 #endif
335 tmp_min_frag_len = min(min_f, tmp_min_frag_len);
336 tmp_max_frag_len = max(max_f, tmp_max_frag_len);
337 #if ENABLE_THREADS
338 inspect_lock.unlock();
339 #endif
340 }
341
342 boost::shared_ptr<TrackingDataWriter> tracking_data_writer;
343
quantitate_next_locus(const RefSequenceTable & rt,vector<boost::shared_ptr<ReplicatedBundleFactory>> & bundle_factories,boost::shared_ptr<TestLauncher> launcher)344 bool quantitate_next_locus(const RefSequenceTable& rt,
345 vector<boost::shared_ptr<ReplicatedBundleFactory> >& bundle_factories,
346 boost::shared_ptr<TestLauncher> launcher)
347 {
348 for (size_t i = 0; i < bundle_factories.size(); ++i)
349 {
350 boost::shared_ptr<SampleAbundances> s_ab = boost::shared_ptr<SampleAbundances>(new SampleAbundances);
351
352 #if ENABLE_THREADS
353 while(1)
354 {
355 locus_thread_pool_lock.lock();
356 if (locus_curr_threads < locus_num_threads)
357 {
358 break;
359 }
360
361 locus_thread_pool_lock.unlock();
362
363 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
364
365 }
366
367 locus_curr_threads++;
368 locus_thread_pool_lock.unlock();
369
370 boost::shared_ptr<HitBundle> pBundle = boost::shared_ptr<HitBundle>(new HitBundle());
371 bool non_empty = bundle_factories[i]->next_bundle(*pBundle, true);
372
373 if (pBundle->compatible_mass() > 0)
374 {
375 boost::thread quantitate(sample_worker,
376 non_empty,
377 pBundle,
378 boost::ref(rt),
379 boost::ref(*(bundle_factories[i])),
380 s_ab,
381 i,
382 launcher,
383 false);
384 }
385 else
386 {
387 sample_worker(non_empty,
388 pBundle,
389 boost::ref(rt),
390 boost::ref(*(bundle_factories[i])),
391 s_ab,
392 i,
393 launcher,
394 false);
395 locus_thread_pool_lock.lock();
396 locus_curr_threads--;
397 locus_thread_pool_lock.unlock();
398 }
399 #else
400 HitBundle bundle;
401 bool non_empty = sample_factory.next_bundle(bundle);
402
403 sample_worker(non_emtpy,
404 pBundle,
405 boost::ref(rt),
406 boost::ref(*(bundle_factories[i])),
407 s_ab,
408 i,
409 launcher,
410 false);
411 #endif
412 }
413 return true;
414 }
415
parse_sample_sheet_file(FILE * sample_sheet_file,vector<string> & sample_labels,vector<string> & sam_hit_filename_lists)416 void parse_sample_sheet_file(FILE* sample_sheet_file,
417 vector<string>& sample_labels,
418 vector<string>& sam_hit_filename_lists)
419 {
420
421 char pBuf[10 * 1024];
422 size_t non_blank_lines_read = 0;
423
424 sample_labels.clear();
425
426 map<string, vector<string> > sample_groups;
427
428 while (fgets(pBuf, 10*1024, sample_sheet_file))
429 {
430 if (strlen(pBuf) > 0)
431 {
432 char* nl = strchr(pBuf, '\n');
433 if (nl)
434 *nl = 0;
435
436 string pBufstr = pBuf;
437 string trimmed = boost::trim_copy(pBufstr);
438
439 if (trimmed.length() > 0 && trimmed[0] != '#')
440 {
441 non_blank_lines_read++;
442 vector<string> columns;
443 tokenize(trimmed, "\t", columns);
444
445 if (non_blank_lines_read == 1)
446 continue;
447
448 if (columns.size() < 2)
449 {
450 if (columns.size() > 0)
451 fprintf(stderr, "Malformed record in sample sheet: \n > %s\n", pBuf);
452 else
453 continue;
454 }
455
456 string sam_file = columns[0];
457 string sample_group = columns[1];
458
459 pair<map<string, vector<string> >::iterator, bool> inserted = sample_groups.insert(make_pair(sample_group, vector<string>()));
460 inserted.first->second.push_back(sam_file);
461 }
462 }
463 }
464
465 for (map<string, vector<string> >::iterator itr = sample_groups.begin();
466 itr != sample_groups.end(); ++itr)
467 {
468 sample_labels.push_back(itr->first);
469 string sam_list = boost::join(itr->second, ",");
470 sam_hit_filename_lists.push_back(sam_list);
471 }
472 }
473
parse_norm_standards_file(FILE * norm_standards_file)474 void parse_norm_standards_file(FILE* norm_standards_file)
475 {
476 char pBuf[10 * 1024];
477 size_t non_blank_lines_read = 0;
478
479 boost::shared_ptr<map<string, LibNormStandards> > norm_standards(new map<string, LibNormStandards>);
480
481 while (fgets(pBuf, 10*1024, norm_standards_file))
482 {
483 if (strlen(pBuf) > 0)
484 {
485 char* nl = strchr(pBuf, '\n');
486 if (nl)
487 *nl = 0;
488
489 string pBufstr = pBuf;
490 string trimmed = boost::trim_copy(pBufstr);
491
492 if (trimmed.length() > 0 && trimmed[0] != '#')
493 {
494 non_blank_lines_read++;
495 vector<string> columns;
496 tokenize(trimmed, "\t", columns);
497
498 if (non_blank_lines_read == 1)
499 continue;
500
501 if (columns.size() < 1) //
502 {
503 continue;
504 }
505
506 string gene_id = columns[0];
507 LibNormStandards L;
508 norm_standards->insert(make_pair(gene_id, L));
509 }
510 }
511 }
512 lib_norm_standards = norm_standards;
513 }
514
515
print_variability_models(FILE * var_model_out,const vector<boost::shared_ptr<ReplicatedBundleFactory>> & factories)516 void print_variability_models(FILE* var_model_out, const vector<boost::shared_ptr<ReplicatedBundleFactory> >& factories)
517 {
518
519 fprintf(var_model_out, "condition\tlocus\tcompatible_count_mean\tcompatible_count_var\ttotal_count_mean\ttotal_count_var\tfitted_var\n");
520
521 for (size_t i = 0; i < factories.size(); ++i)
522 {
523 string factor_name = factories[i]->condition_name();
524 boost::shared_ptr<ReadGroupProperties> rg = factories[i]->factories()[0]->read_group_properties();
525 boost::shared_ptr<MassDispersionModel const> model = rg->mass_dispersion_model();
526 // const vector<double>& means = model->scaled_compatible_mass_means();
527 // const vector<double>& raw_vars = model->scaled_compatible_variances();
528
529 const vector<LocusCount>& common_scale_compatible_counts = rg->common_scale_compatible_counts();
530 for (size_t j = 0; j < common_scale_compatible_counts.size(); ++j)
531 {
532 string locus_desc = common_scale_compatible_counts[j].locus_desc;
533 pair<double, double> compat_mean_and_var = model->get_compatible_mean_and_var(locus_desc);
534 pair<double, double> total_mean_and_var = model->get_total_mean_and_var(locus_desc);
535 // double total_compat_count = 0;
536 // if (itr != locus_to_total_count_table.end())
537 // total_compat_count = itr->second.count;
538
539
540 fprintf(var_model_out, "%s\t%s\t%lg\t%lg\t%lg\t%lg\t%lg\n",
541 factor_name.c_str(),
542 locus_desc.c_str(),
543 compat_mean_and_var.first,
544 compat_mean_and_var.second,
545 total_mean_and_var.first,
546 total_mean_and_var.second,
547 model->scale_mass_variance(compat_mean_and_var.first));
548 }
549 }
550 fclose(var_model_out);
551
552 }
553
driver(FILE * ref_gtf,FILE * mask_gtf,FILE * contrast_file,FILE * norm_standards_file,vector<string> & sam_hit_filename_lists,Outfiles & outfiles)554 void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, FILE* norm_standards_file, vector<string>& sam_hit_filename_lists, Outfiles& outfiles)
555 {
556
557 ReadTable it;
558 RefSequenceTable rt(true, false);
559
560 vector<boost::shared_ptr<Scaffold> > ref_mRNAs;
561
562 vector<boost::shared_ptr<ReplicatedBundleFactory> > bundle_factories;
563 vector<boost::shared_ptr<ReadGroupProperties> > all_read_groups;
564 set<boost::shared_ptr<ReadGroupProperties> > serialized_read_groups;
565
566 for (size_t i = 0; i < sam_hit_filename_lists.size(); ++i)
567 {
568 vector<string> sam_hit_filenames;
569 tokenize(sam_hit_filename_lists[i], ",", sam_hit_filenames);
570
571 vector<boost::shared_ptr<BundleFactory> > replicate_factories;
572
573 string condition_name = sample_labels[i];
574
575 for (size_t j = 0; j < sam_hit_filenames.size(); ++j)
576 {
577 boost::shared_ptr<HitFactory> hs;
578 boost::shared_ptr<BundleFactory> hf;
579 try
580 {
581 hs = boost::shared_ptr<HitFactory>(new PrecomputedExpressionHitFactory(sam_hit_filenames[j], it, rt));
582 hf = boost::shared_ptr<BundleFactory>(new PrecomputedExpressionBundleFactory(static_pointer_cast<PrecomputedExpressionHitFactory>(hs)));
583 }
584
585 catch(boost::archive::archive_exception & e)
586 {
587 hs = boost::shared_ptr<HitFactory>(createSamHitFactory(sam_hit_filenames[j], it, rt));
588 hf = boost::shared_ptr<BundleFactory>(new BundleFactory(hs, REF_DRIVEN));
589 }
590
591
592 boost::shared_ptr<ReadGroupProperties> rg_props(new ReadGroupProperties);
593
594 if (global_read_properties)
595 {
596 *rg_props = *global_read_properties;
597 }
598 else
599 {
600 *rg_props = hs->read_group_properties();
601 }
602
603 rg_props->checked_parameters(hs->read_group_properties().checked_parameters());
604 rg_props->condition_name(condition_name);
605 rg_props->replicate_num(j);
606 rg_props->file_path(sam_hit_filenames[j]);
607
608 all_read_groups.push_back(rg_props);
609
610 hf->read_group_properties(rg_props);
611
612 replicate_factories.push_back(hf);
613 //replicate_factories.back()->set_ref_rnas(ref_mRNAs);
614 }
615
616 bundle_factories.push_back(boost::shared_ptr<ReplicatedBundleFactory>(new ReplicatedBundleFactory(replicate_factories, condition_name)));
617 }
618
619 boost::crc_32_type ref_gtf_crc_result;
620 ::load_ref_rnas(ref_gtf, rt, ref_mRNAs, ref_gtf_crc_result, corr_bias, false);
621 if (ref_mRNAs.empty())
622 return;
623
624 vector<boost::shared_ptr<Scaffold> > mask_rnas;
625 if (mask_gtf)
626 {
627 boost::crc_32_type mask_gtf_crc_result;
628 ::load_ref_rnas(mask_gtf, rt, mask_rnas, mask_gtf_crc_result, false, false);
629 }
630
631 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
632 {
633 // NOTE: we aren't deep copying here to save memory. We can pull this off
634 // because unless bias correction or multiread correction is enabled,
635 // we don't need to write into Scaffold objects.
636 fac->set_ref_rnas(ref_mRNAs, false);
637 if (mask_gtf)
638 fac->set_mask_rnas(mask_rnas);
639 }
640
641 if (norm_standards_file != NULL)
642 {
643 parse_norm_standards_file(norm_standards_file);
644 }
645
646 validate_cross_sample_parameters(all_read_groups);
647
648 vector<pair<size_t, size_t > > contrasts;
649
650 #if ENABLE_THREADS
651 locus_num_threads = num_threads;
652 #endif
653
654 int tmp_min_frag_len = numeric_limits<int>::max();
655 int tmp_max_frag_len = 0;
656
657 boost::shared_ptr<IdToLocusMap> id_to_locus_map(new IdToLocusMap(boost::shared_ptr<map<string, set<string> > >(new map<string, set<string> >())));
658
659 ProgressBar p_bar("Inspecting maps and determining fragment length distributions.",0);
660 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
661 {
662 #if ENABLE_THREADS
663 while(1)
664 {
665 locus_thread_pool_lock.lock();
666 if (locus_curr_threads < locus_num_threads)
667 {
668 break;
669 }
670
671 locus_thread_pool_lock.unlock();
672
673 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
674 }
675
676 locus_curr_threads++;
677 locus_thread_pool_lock.unlock();
678
679 boost::thread inspect(inspect_map_worker,
680 boost::ref(*fac),
681 boost::ref(tmp_min_frag_len),
682 boost::ref(tmp_max_frag_len),
683 boost::ref(*id_to_locus_map));
684 #else
685 inspect_map_worker(boost::ref(*fac),
686 boost::ref(tmp_min_frag_len),
687 boost::ref(tmp_max_frag_len),
688 boost::ref(*id_to_locus_map));
689 #endif
690 }
691
692 // wait for the workers to finish up before reporting everthing.
693 #if ENABLE_THREADS
694 while(1)
695 {
696 locus_thread_pool_lock.lock();
697 if (locus_curr_threads == 0)
698 {
699 locus_thread_pool_lock.unlock();
700 break;
701 }
702 locus_thread_pool_lock.unlock();
703
704 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
705 }
706 #endif
707
708 normalize_counts(all_read_groups);
709
710 long double total_norm_mass = 0.0;
711 long double total_mass = 0.0;
712 BOOST_FOREACH (boost::shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
713 {
714 total_norm_mass += rg_props->normalized_map_mass();
715 total_mass += rg_props->total_map_mass();
716 rg_props->clear_count_tables();
717 }
718
719 min_frag_len = tmp_min_frag_len;
720 max_frag_len = tmp_max_frag_len;
721
722 final_est_run = false;
723
724 double num_bundles = (double)bundle_factories[0]->num_bundles();
725
726 p_bar = ProgressBar("Calculating preliminary abundance estimates", num_bundles);
727
728 Tracking tracking;
729
730 final_est_run = true;
731 p_bar = ProgressBar("Normalizing expression levels for locus", num_bundles);
732
733 tracking_data_writer = boost::shared_ptr<TrackingDataWriter>(new TrackingDataWriter(bundle_factories.size(), &outfiles, &tracking, all_read_groups, sample_labels, &p_bar, id_to_locus_map));
734
735 while (true)
736 {
737 //boost::shared_ptr<vector<boost::shared_ptr<SampleAbundances> > > abundances(new vector<boost::shared_ptr<SampleAbundances> >());
738 quantitate_next_locus(rt, bundle_factories, tracking_data_writer);
739 bool more_loci_remain = false;
740 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
741 {
742 if (rep_fac->bundles_remain())
743 {
744 more_loci_remain = true;
745 break;
746 }
747 }
748 if (!more_loci_remain)
749 {
750 // wait for the workers to finish up before doing the cross-sample testing.
751 #if ENABLE_THREADS
752 while(1)
753 {
754 locus_thread_pool_lock.lock();
755 if (locus_curr_threads == 0)
756 {
757 locus_thread_pool_lock.unlock();
758 break;
759 }
760
761 locus_thread_pool_lock.unlock();
762
763 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
764
765 }
766 #endif
767 break;
768 }
769 }
770
771 p_bar.complete();
772
773 }
774
open_outfiles_for_writing_cuffdiff_format(Outfiles & outfiles)775 void open_outfiles_for_writing_cuffdiff_format(Outfiles& outfiles)
776 {
777
778 static const int filename_buf_size = 2048;
779
780 char isoform_fpkm_tracking_name[filename_buf_size];
781 sprintf(isoform_fpkm_tracking_name, "%s/isoforms.fpkm_tracking", output_dir.c_str());
782 FILE* isoform_fpkm_out = fopen(isoform_fpkm_tracking_name, "w");
783 if (!isoform_fpkm_out)
784 {
785 fprintf(stderr, "Error: cannot open isoform-level FPKM tracking file %s for writing\n",
786 isoform_fpkm_tracking_name);
787 exit(1);
788 }
789 outfiles.isoform_fpkm_tracking_out = isoform_fpkm_out;
790
791 char tss_group_fpkm_tracking_name[filename_buf_size];
792 sprintf(tss_group_fpkm_tracking_name, "%s/tss_groups.fpkm_tracking", output_dir.c_str());
793 FILE* tss_group_fpkm_out = fopen(tss_group_fpkm_tracking_name, "w");
794 if (!tss_group_fpkm_out)
795 {
796 fprintf(stderr, "Error: cannot open TSS group-level FPKM tracking file %s for writing\n",
797 tss_group_fpkm_tracking_name);
798 exit(1);
799 }
800 outfiles.tss_group_fpkm_tracking_out = tss_group_fpkm_out;
801
802 char cds_fpkm_tracking_name[filename_buf_size];
803 sprintf(cds_fpkm_tracking_name, "%s/cds.fpkm_tracking", output_dir.c_str());
804 FILE* cds_fpkm_out = fopen(cds_fpkm_tracking_name, "w");
805 if (!cds_fpkm_out)
806 {
807 fprintf(stderr, "Error: cannot open CDS level FPKM tracking file %s for writing\n",
808 cds_fpkm_tracking_name);
809 exit(1);
810 }
811 outfiles.cds_fpkm_tracking_out = cds_fpkm_out;
812
813 char gene_fpkm_tracking_name[filename_buf_size];
814 sprintf(gene_fpkm_tracking_name, "%s/genes.fpkm_tracking", output_dir.c_str());
815 FILE* gene_fpkm_out = fopen(gene_fpkm_tracking_name, "w");
816 if (!gene_fpkm_out)
817 {
818 fprintf(stderr, "Error: cannot open gene-level FPKM tracking file %s for writing\n",
819 gene_fpkm_tracking_name);
820 exit(1);
821 }
822 outfiles.gene_fpkm_tracking_out = gene_fpkm_out;
823
824 char isoform_count_tracking_name[filename_buf_size];
825 sprintf(isoform_count_tracking_name, "%s/isoforms.count_tracking", output_dir.c_str());
826 FILE* isoform_count_out = fopen(isoform_count_tracking_name, "w");
827 if (!isoform_count_out)
828 {
829 fprintf(stderr, "Error: cannot open isoform-level count tracking file %s for writing\n",
830 isoform_count_tracking_name);
831 exit(1);
832 }
833 outfiles.isoform_count_tracking_out = isoform_count_out;
834
835 char tss_group_count_tracking_name[filename_buf_size];
836 sprintf(tss_group_count_tracking_name, "%s/tss_groups.count_tracking", output_dir.c_str());
837 FILE* tss_group_count_out = fopen(tss_group_count_tracking_name, "w");
838 if (!tss_group_count_out)
839 {
840 fprintf(stderr, "Error: cannot open TSS group-level count tracking file %s for writing\n",
841 tss_group_count_tracking_name);
842 exit(1);
843 }
844 outfiles.tss_group_count_tracking_out = tss_group_count_out;
845
846 char cds_count_tracking_name[filename_buf_size];
847 sprintf(cds_count_tracking_name, "%s/cds.count_tracking", output_dir.c_str());
848 FILE* cds_count_out = fopen(cds_count_tracking_name, "w");
849 if (!cds_count_out)
850 {
851 fprintf(stderr, "Error: cannot open CDS level count tracking file %s for writing\n",
852 cds_count_tracking_name);
853 exit(1);
854 }
855 outfiles.cds_count_tracking_out = cds_count_out;
856
857 char gene_count_tracking_name[filename_buf_size];
858 sprintf(gene_count_tracking_name, "%s/genes.count_tracking", output_dir.c_str());
859 FILE* gene_count_out = fopen(gene_count_tracking_name, "w");
860 if (!gene_count_out)
861 {
862 fprintf(stderr, "Error: cannot open gene-level count tracking file %s for writing\n",
863 gene_count_tracking_name);
864 exit(1);
865 }
866 outfiles.gene_count_tracking_out = gene_count_out;
867
868 char isoform_rep_tracking_name[filename_buf_size];
869 sprintf(isoform_rep_tracking_name, "%s/isoforms.read_group_tracking", output_dir.c_str());
870 FILE* isoform_rep_out = fopen(isoform_rep_tracking_name, "w");
871 if (!isoform_rep_out)
872 {
873 fprintf(stderr, "Error: cannot open isoform-level read group tracking file %s for writing\n",
874 isoform_rep_tracking_name);
875 exit(1);
876 }
877 outfiles.isoform_rep_tracking_out = isoform_rep_out;
878
879 char tss_group_rep_tracking_name[filename_buf_size];
880 sprintf(tss_group_rep_tracking_name, "%s/tss_groups.read_group_tracking", output_dir.c_str());
881 FILE* tss_group_rep_out = fopen(tss_group_rep_tracking_name, "w");
882 if (!tss_group_rep_out)
883 {
884 fprintf(stderr, "Error: cannot open TSS group-level read group tracking file %s for writing\n",
885 tss_group_rep_tracking_name);
886 exit(1);
887 }
888 outfiles.tss_group_rep_tracking_out = tss_group_rep_out;
889
890 char cds_rep_tracking_name[filename_buf_size];
891 sprintf(cds_rep_tracking_name, "%s/cds.read_group_tracking", output_dir.c_str());
892 FILE* cds_rep_out = fopen(cds_rep_tracking_name, "w");
893 if (!cds_rep_out)
894 {
895 fprintf(stderr, "Error: cannot open CDS level read group tracking file %s for writing\n",
896 cds_rep_tracking_name);
897 exit(1);
898 }
899 outfiles.cds_rep_tracking_out = cds_rep_out;
900
901 char gene_rep_tracking_name[filename_buf_size];
902 sprintf(gene_rep_tracking_name, "%s/genes.read_group_tracking", output_dir.c_str());
903 FILE* gene_rep_out = fopen(gene_rep_tracking_name, "w");
904 if (!gene_rep_out)
905 {
906 fprintf(stderr, "Error: cannot open gene-level read group tracking file %s for writing\n",
907 gene_rep_tracking_name);
908 exit(1);
909 }
910 outfiles.gene_rep_tracking_out = gene_rep_out;
911
912 char read_group_info_name[filename_buf_size];
913 sprintf(read_group_info_name, "%s/read_groups.info", output_dir.c_str());
914 FILE* read_group_out = fopen(read_group_info_name, "w");
915 if (!read_group_out)
916 {
917 fprintf(stderr, "Error: cannot open read group info file %s for writing\n",
918 read_group_info_name);
919 exit(1);
920 }
921 outfiles.read_group_info_out = read_group_out;
922
923 char run_info_name[filename_buf_size];
924 sprintf(run_info_name, "%s/run.info", output_dir.c_str());
925 FILE* run_info_out = fopen(run_info_name, "w");
926 if (!run_info_out)
927 {
928 fprintf(stderr, "Error: cannot open run info file %s for writing\n",
929 run_info_name);
930 exit(1);
931 }
932 outfiles.run_info_out = run_info_out;
933
934 }
935
open_outfiles_for_writing_simple_table_format(Outfiles & outfiles)936 void open_outfiles_for_writing_simple_table_format(Outfiles& outfiles)
937 {
938 static const int filename_buf_size = 2048;
939
940 char isoform_fpkm_tracking_name[filename_buf_size];
941 sprintf(isoform_fpkm_tracking_name, "%s/isoforms.fpkm_table", output_dir.c_str());
942 FILE* isoform_fpkm_out = fopen(isoform_fpkm_tracking_name, "w");
943 if (!isoform_fpkm_out)
944 {
945 fprintf(stderr, "Error: cannot open isoform-level FPKM table %s for writing\n",
946 isoform_fpkm_tracking_name);
947 exit(1);
948 }
949 outfiles.isoform_fpkm_tracking_out = isoform_fpkm_out;
950
951 char tss_group_fpkm_tracking_name[filename_buf_size];
952 sprintf(tss_group_fpkm_tracking_name, "%s/tss_groups.fpkm_table", output_dir.c_str());
953 FILE* tss_group_fpkm_out = fopen(tss_group_fpkm_tracking_name, "w");
954 if (!tss_group_fpkm_out)
955 {
956 fprintf(stderr, "Error: cannot open TSS group-level FPKM table %s for writing\n",
957 tss_group_fpkm_tracking_name);
958 exit(1);
959 }
960 outfiles.tss_group_fpkm_tracking_out = tss_group_fpkm_out;
961
962 char cds_fpkm_tracking_name[filename_buf_size];
963 sprintf(cds_fpkm_tracking_name, "%s/cds.fpkm_table", output_dir.c_str());
964 FILE* cds_fpkm_out = fopen(cds_fpkm_tracking_name, "w");
965 if (!cds_fpkm_out)
966 {
967 fprintf(stderr, "Error: cannot open CDS level FPKM table %s for writing\n",
968 cds_fpkm_tracking_name);
969 exit(1);
970 }
971 outfiles.cds_fpkm_tracking_out = cds_fpkm_out;
972
973 char gene_fpkm_tracking_name[filename_buf_size];
974 sprintf(gene_fpkm_tracking_name, "%s/genes.fpkm_table", output_dir.c_str());
975 FILE* gene_fpkm_out = fopen(gene_fpkm_tracking_name, "w");
976 if (!gene_fpkm_out)
977 {
978 fprintf(stderr, "Error: cannot open gene-level FPKM table %s for writing\n",
979 gene_fpkm_tracking_name);
980 exit(1);
981 }
982 outfiles.gene_fpkm_tracking_out = gene_fpkm_out;
983
984 char isoform_count_tracking_name[filename_buf_size];
985 sprintf(isoform_count_tracking_name, "%s/isoforms.count_table", output_dir.c_str());
986 FILE* isoform_count_out = fopen(isoform_count_tracking_name, "w");
987 if (!isoform_count_out)
988 {
989 fprintf(stderr, "Error: cannot open isoform-level count table %s for writing\n",
990 isoform_count_tracking_name);
991 exit(1);
992 }
993 outfiles.isoform_count_tracking_out = isoform_count_out;
994
995 char tss_group_count_tracking_name[filename_buf_size];
996 sprintf(tss_group_count_tracking_name, "%s/tss_groups.count_table", output_dir.c_str());
997 FILE* tss_group_count_out = fopen(tss_group_count_tracking_name, "w");
998 if (!tss_group_count_out)
999 {
1000 fprintf(stderr, "Error: cannot open TSS group-level count table %s for writing\n",
1001 tss_group_count_tracking_name);
1002 exit(1);
1003 }
1004 outfiles.tss_group_count_tracking_out = tss_group_count_out;
1005
1006 char cds_count_tracking_name[filename_buf_size];
1007 sprintf(cds_count_tracking_name, "%s/cds.count_table", output_dir.c_str());
1008 FILE* cds_count_out = fopen(cds_count_tracking_name, "w");
1009 if (!cds_count_out)
1010 {
1011 fprintf(stderr, "Error: cannot open CDS level count table %s for writing\n",
1012 cds_count_tracking_name);
1013 exit(1);
1014 }
1015 outfiles.cds_count_tracking_out = cds_count_out;
1016
1017 char gene_count_tracking_name[filename_buf_size];
1018 sprintf(gene_count_tracking_name, "%s/genes.count_table", output_dir.c_str());
1019 FILE* gene_count_out = fopen(gene_count_tracking_name, "w");
1020 if (!gene_count_out)
1021 {
1022 fprintf(stderr, "Error: cannot open gene-level count table %s for writing\n",
1023 gene_count_tracking_name);
1024 exit(1);
1025 }
1026 outfiles.gene_count_tracking_out = gene_count_out;
1027
1028 char isoform_attr_name[filename_buf_size];
1029 sprintf(isoform_attr_name, "%s/isoforms.attr_table", output_dir.c_str());
1030 FILE* isoform_attr_out = fopen(isoform_attr_name, "w");
1031 if (!isoform_attr_out)
1032 {
1033 fprintf(stderr, "Error: cannot open isoform-level attribute table %s for writing\n",
1034 isoform_attr_name);
1035 exit(1);
1036 }
1037 outfiles.isoform_attr_out = isoform_attr_out;
1038
1039 char tss_group_attr_name[filename_buf_size];
1040 sprintf(tss_group_attr_name, "%s/tss_groups.attr_table", output_dir.c_str());
1041 FILE* tss_group_attr_out = fopen(tss_group_attr_name, "w");
1042 if (!tss_group_attr_out)
1043 {
1044 fprintf(stderr, "Error: cannot open TSS group-level attribute table %s for writing\n",
1045 tss_group_attr_name);
1046 exit(1);
1047 }
1048 outfiles.tss_group_attr_out = tss_group_attr_out;
1049
1050 char cds_attr_name[filename_buf_size];
1051 sprintf(cds_attr_name, "%s/cds.attr_table", output_dir.c_str());
1052 FILE* cds_attr_out = fopen(cds_attr_name, "w");
1053 if (!cds_attr_out)
1054 {
1055 fprintf(stderr, "Error: cannot open CDS level attribute table %s for writing\n",
1056 cds_attr_name);
1057 exit(1);
1058 }
1059 outfiles.cds_attr_out = cds_attr_out;
1060
1061 char gene_attr_name[filename_buf_size];
1062 sprintf(gene_attr_name, "%s/genes.attr_table", output_dir.c_str());
1063 FILE* gene_attr_out = fopen(gene_attr_name, "w");
1064 if (!gene_attr_out)
1065 {
1066 fprintf(stderr, "Error: cannot open gene-level attribute table %s for writing\n",
1067 gene_attr_name);
1068 exit(1);
1069 }
1070 outfiles.gene_attr_out = gene_attr_out;
1071
1072 char read_group_info_name[filename_buf_size];
1073 sprintf(read_group_info_name, "%s/samples.table", output_dir.c_str());
1074 FILE* read_group_out = fopen(read_group_info_name, "w");
1075 if (!read_group_out)
1076 {
1077 fprintf(stderr, "Error: cannot open read group info file %s for writing\n",
1078 read_group_info_name);
1079 exit(1);
1080 }
1081 outfiles.read_group_info_out = read_group_out;
1082
1083 char run_info_name[filename_buf_size];
1084 sprintf(run_info_name, "%s/run.info", output_dir.c_str());
1085 FILE* run_info_out = fopen(run_info_name, "w");
1086 if (!run_info_out)
1087 {
1088 fprintf(stderr, "Error: cannot open run info file %s for writing\n",
1089 run_info_name);
1090 exit(1);
1091 }
1092 outfiles.run_info_out = run_info_out;
1093
1094 }
1095
open_outfiles_for_writing(Outfiles & outfiles)1096 void open_outfiles_for_writing(Outfiles& outfiles)
1097 {
1098 if (output_format == CUFFDIFF_OUTPUT_FMT)
1099 {
1100 open_outfiles_for_writing_cuffdiff_format(outfiles);
1101 }
1102 else if (output_format == SIMPLE_TABLE_OUTPUT_FMT)
1103 {
1104 open_outfiles_for_writing_simple_table_format(outfiles);
1105 }
1106 else{
1107 fprintf(stderr, "Error: unrecognized output format!\n");
1108 exit(1);
1109 }
1110
1111 }
1112
main(int argc,char ** argv)1113 int main(int argc, char** argv)
1114 {
1115 for (int i = 0; i < argc; ++i)
1116 {
1117 cmd_str += string(argv[i]) + " ";
1118 }
1119
1120 init_library_table();
1121 init_dispersion_method_table();
1122 init_lib_norm_method_table();
1123 init_output_format_table();
1124
1125 min_isoform_fraction = 1e-5;
1126
1127 int parse_ret = parse_options(argc,argv);
1128 if (parse_ret)
1129 return parse_ret;
1130
1131 if (!use_total_mass && !use_compat_mass)
1132 {
1133 use_total_mass = false;
1134 use_compat_mass = true;
1135 }
1136
1137 if(optind >= argc)
1138 {
1139 print_usage();
1140 return 1;
1141 }
1142
1143 if (!no_update_check)
1144 check_version(PACKAGE_VERSION);
1145
1146 string ref_gtf_filename = argv[optind++];
1147 vector<string> sam_hit_filenames;
1148
1149 if (use_sample_sheet)
1150 {
1151 if (optind < argc)
1152 {
1153
1154 string sample_sheet_filename = argv[optind++];
1155 FILE* sample_sheet_file = NULL;
1156 if (sample_sheet_filename != "")
1157 {
1158 sample_sheet_file = fopen(sample_sheet_filename.c_str(), "r");
1159 if (!sample_sheet_file)
1160 {
1161 fprintf(stderr, "Error: cannot open sample sheet file %s for reading\n",
1162 sample_sheet_filename.c_str());
1163 exit(1);
1164 }
1165 }
1166 parse_sample_sheet_file(sample_sheet_file, sample_labels, sam_hit_filenames);
1167 }
1168 else
1169 {
1170 fprintf(stderr, "Error: option --use-sample-sheet requires a single sample sheet filename instead of a list of SAM/BAM files\n");
1171 }
1172 }
1173 else
1174 {
1175 while(optind < argc)
1176 {
1177 string sam_hits_file_name = argv[optind++];
1178 sam_hit_filenames.push_back(sam_hits_file_name);
1179 }
1180
1181 if (sample_labels.size() == 0)
1182 {
1183 for (size_t i = 1; i < sam_hit_filenames.size() + 1; ++i)
1184 {
1185 char buf[256];
1186 sprintf(buf, "q%lu", i);
1187 sample_labels.push_back(buf);
1188 }
1189 }
1190 }
1191
1192 while (sam_hit_filenames.size() < 2)
1193 {
1194 fprintf(stderr, "Error: cuffdiff requires at least 2 SAM files\n");
1195 exit(1);
1196 }
1197
1198
1199 if (sam_hit_filenames.size() != sample_labels.size())
1200 {
1201 fprintf(stderr, "Error: number of labels must match number of conditions\n");
1202 exit(1);
1203 }
1204
1205 if (random_seed == -1)
1206 random_seed = boost::mt19937::default_seed;
1207
1208 // seed the random number generator - we'll need it for the importance
1209 // sampling during MAP estimation of the gammas
1210 srand48(random_seed);
1211
1212 FILE* ref_gtf = NULL;
1213 if (ref_gtf_filename != "")
1214 {
1215 ref_gtf = fopen(ref_gtf_filename.c_str(), "r");
1216 if (!ref_gtf)
1217 {
1218 fprintf(stderr, "Error: cannot open reference GTF file %s for reading\n",
1219 ref_gtf_filename.c_str());
1220 exit(1);
1221 }
1222 }
1223
1224 FILE* mask_gtf = NULL;
1225 if (mask_gtf_filename != "")
1226 {
1227 mask_gtf = fopen(mask_gtf_filename.c_str(), "r");
1228 if (!mask_gtf)
1229 {
1230 fprintf(stderr, "Error: cannot open mask GTF file %s for reading\n",
1231 mask_gtf_filename.c_str());
1232 exit(1);
1233 }
1234 }
1235
1236
1237 FILE* contrast_file = NULL;
1238 if (contrast_filename != "")
1239 {
1240 contrast_file = fopen(contrast_filename.c_str(), "r");
1241 if (!contrast_file)
1242 {
1243 fprintf(stderr, "Error: cannot open contrast file %s for reading\n",
1244 contrast_filename.c_str());
1245 exit(1);
1246 }
1247 }
1248
1249 FILE* norm_standards_file = NULL;
1250 if (norm_standards_filename != "")
1251 {
1252 norm_standards_file = fopen(norm_standards_filename.c_str(), "r");
1253 if (!norm_standards_file)
1254 {
1255 fprintf(stderr, "Error: cannot open contrast file %s for reading\n",
1256 norm_standards_filename.c_str());
1257 exit(1);
1258 }
1259 }
1260
1261
1262 // Note: we don't want the assembly filters interfering with calculations
1263 // here
1264
1265 pre_mrna_fraction = 0.0;
1266 olap_radius = 0;
1267
1268 Outfiles outfiles;
1269
1270 if (output_dir != "")
1271 {
1272 int retcode = mkpath(output_dir.c_str(), 0777);
1273 if (retcode == -1)
1274 {
1275 if (errno != EEXIST)
1276 {
1277 fprintf (stderr,
1278 "Error: cannot create directory %s\n",
1279 output_dir.c_str());
1280 exit(1);
1281 }
1282 }
1283 }
1284
1285 open_outfiles_for_writing(outfiles);
1286
1287 driver(ref_gtf, mask_gtf, contrast_file, norm_standards_file, sam_hit_filenames, outfiles);
1288
1289 #if 0
1290 if (emit_count_tables)
1291 {
1292 dump_locus_variance_info(output_dir + string("/locus_var.txt"));
1293 }
1294 #endif
1295
1296 return 0;
1297 }
1298