1 /*
2  *  cuffdiff.cpp
3  *  cufflinks
4  *
5  *  Created by Cole Trapnell on 10/21/09.
6  *  Copyright 2009 Cole Trapnell. All rights reserved.
7  *
8  */
9 
10 #include <stdlib.h>
11 #include <getopt.h>
12 #include <string>
13 #include <numeric>
14 #include <cfloat>
15 #include <iostream>
16 
17 #include "common.h"
18 #include "hits.h"
19 #include "bundles.h"
20 #include "abundances.h"
21 #include "tokenize.h"
22 #include "biascorrection.h"
23 #include "update_check.h"
24 
25 #include <boost/thread.hpp>
26 #include <boost/version.hpp>
27 #include <boost/graph/adjacency_list.hpp>
28 #include <boost/graph/graph_traits.hpp>
29 #include <boost/numeric/ublas/matrix.hpp>
30 #include <boost/numeric/ublas/matrix_proxy.hpp>
31 #include <boost/numeric/ublas/vector.hpp>
32 #include <boost/numeric/ublas/vector_proxy.hpp>
33 #include <boost/numeric/ublas/io.hpp>
34 #include <boost/algorithm/string.hpp>
35 
36 #include "differential.h"
37 
38 extern "C" {
39 #include "locfit/local.h"
40 }
41 
42 // Need at least this many reads in a locus to do any testing on it
43 
44 vector<string> sample_labels;
45 
46 double FDR = 0.05;
47 bool samples_are_time_series = false;
48 using namespace std;
49 using namespace boost;
50 
51 // We leave out the short codes for options that don't take an argument
52 #if ENABLE_THREADS
53 const char *short_options = "m:p:s:c:I:j:L:M:o:b:TNqvuF:C:";
54 #else
55 const char *short_options = "m:s:c:I:j:L:M:o:b:TNqvuF:C:";
56 #endif
57 
58 
59 
60 static struct option long_options[] = {
61 {"frag-len-mean",			required_argument,       0,          'm'},
62 {"frag-len-std-dev",        required_argument,       0,          's'},
63 {"transcript-score-thresh", required_argument,       0,          't'},
64 {"pre-mrna-fraction",		required_argument,		 0,			 'j'},
65 {"max-intron-length",		required_argument,		 0,			 'I'},
66 {"labels",					required_argument,		 0,			 'L'},
67 {"min-alignment-count",     required_argument,		 0,			 'c'},
68 {"FDR",					    required_argument,		 0,			 OPT_FDR},
69 {"seed",                    required_argument,		 0,			 OPT_RANDOM_SEED},
70 {"mask-file",               required_argument,		 0,			 'M'},
71 {"contrast-file",           required_argument,		 0,			 'C'},
72 {"norm-standards-file",     required_argument,		 0,			 OPT_NORM_STANDARDS_FILE},
73 {"use-sample-sheet",        no_argument,             0,			 OPT_USE_SAMPLE_SHEET},
74 {"output-dir",			    required_argument,		 0,			 'o'},
75 {"verbose",			    	no_argument,			 0,			 'v'},
76 {"quiet",			    	no_argument,			 0,			 'q'},
77 {"frag-bias-correct",       required_argument,		 0,			 'b'},
78 {"multi-read-correct",      no_argument,			 0,			 'u'},
79 {"time-series",             no_argument,             0,			 'T'},
80 {"upper-quartile-norm",     no_argument,	 		 0,	         'N'},
81 {"geometric-norm",          no_argument,	 		 0,	         OPT_GEOMETRIC_NORM},
82 {"raw-mapped-norm",         no_argument,	 		 0,	         OPT_RAW_MAPPED_NORM},
83 {"min-isoform-fraction",    required_argument,       0,          'F'},
84 #if ENABLE_THREADS
85 {"num-threads",				required_argument,       0,          'p'},
86 #endif
87 {"library-type",		    required_argument,		 0,			 OPT_LIBRARY_TYPE},
88 {"seed",                    required_argument,		 0,			 OPT_RANDOM_SEED},
89 {"no-collapse-cond-prob",   no_argument,             0,			 OPT_COLLAPSE_COND_PROB},
90 {"num-importance-samples",  required_argument,		 0,			 OPT_NUM_IMP_SAMPLES},
91 {"max-mle-iterations",		required_argument,		 0,			 OPT_MLE_MAX_ITER},
92 {"min-mle-accuracy",		required_argument,		 0,			 OPT_MLE_MIN_ACC},
93 {"poisson-dispersion",		no_argument,             0,		     OPT_POISSON_DISPERSION},
94 {"bias-mode",               required_argument,		 0,			 OPT_BIAS_MODE},
95 {"no-update-check",         no_argument,             0,          OPT_NO_UPDATE_CHECK},
96 {"emit-count-tables",       no_argument,             0,          OPT_EMIT_COUNT_TABLES},
97 {"compatible-hits-norm",    no_argument,	 		 0,	         OPT_USE_COMPAT_MASS},
98 {"total-hits-norm",         no_argument,	 		 0,	         OPT_USE_TOTAL_MASS},
99 //{"analytic-diff",           no_argument,	 		 0,	         OPT_ANALYTIC_DIFF},
100 {"no-diff",                 no_argument,	 		 0,	         OPT_NO_DIFF},
101 {"num-frag-count-draws",	required_argument,		 0,			 OPT_NUM_FRAG_COUNT_DRAWS},
102 {"num-frag-assign-draws",	required_argument,		 0,			 OPT_NUM_FRAG_ASSIGN_DRAWS},
103 
104 // Some options for testing different stats policies
105 {"max-bundle-frags",        required_argument,       0,          OPT_MAX_FRAGS_PER_BUNDLE},
106 {"read-skip-fraction",      required_argument,	     0,          OPT_READ_SKIP_FRACTION},
107 {"no-read-pairs",           no_argument,	 		 0,          OPT_NO_READ_PAIRS},
108 {"trim-read-length",        required_argument,	     0,          OPT_TRIM_READ_LENGTH},
109 {"cov-delta",               required_argument,	     0,          OPT_MAX_DELTA_GAP},
110 {"locus-count-dispersion",  no_argument,             0,          OPT_LOCUS_COUNT_DISPERSION},
111 {"max-frag-multihits",      required_argument,       0,          OPT_FRAG_MAX_MULTIHITS},
112 {"min-outlier-p",           required_argument,       0,          OPT_MIN_OUTLIER_P},
113 {"min-reps-for-js-test",      required_argument,     0,          OPT_MIN_REPS_FOR_JS_TEST},
114 {"no-effective-length-correction",  no_argument,     0,          OPT_NO_EFFECTIVE_LENGTH_CORRECTION},
115 {"no-length-correction",    no_argument,             0,          OPT_NO_LENGTH_CORRECTION},
116 {"no-js-tests",             no_argument,             0,          OPT_NO_JS_TESTS},
117 {"dispersion-method",       required_argument,       0,          OPT_DISPERSION_METHOD},
118 {"library-norm-method",     required_argument,       0,          OPT_LIB_NORM_METHOD},
119 {"no-scv-correction",       no_argument,             0,          OPT_NO_SCV_CORRECTION},
120 {0, 0, 0, 0} // terminator
121 };
122 
print_usage()123 void print_usage()
124 {
125 	fprintf(stderr, "cuffdiff v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
126 	fprintf(stderr, "-----------------------------\n");
127 
128 	//NOTE: SPACES ONLY, bozo
129     fprintf(stderr, "Usage:   cuffdiff [options] <transcripts.gtf> <sample1_hits.sam> <sample2_hits.sam> [... sampleN_hits.sam]\n");
130 	fprintf(stderr, "   Supply replicate SAMs as comma separated lists for each condition: sample1_rep1.sam,sample1_rep2.sam,...sample1_repM.sam\n");
131     fprintf(stderr, "General Options:\n");
132     fprintf(stderr, "  -o/--output-dir              write all output files to this directory              [ default:     ./ ]\n");
133     fprintf(stderr, "  -L/--labels                  comma-separated list of condition labels\n");
134 	fprintf(stderr, "  --FDR                        False discovery rate used in testing                  [ default:   0.05 ]\n");
135 	fprintf(stderr, "  -M/--mask-file               ignore all alignment within transcripts in this file  [ default:   NULL ]\n");
136     fprintf(stderr, "  -C/--contrast-file           Perform the constrasts specified in this file         [ default:   NULL ]\n"); // NOT YET DOCUMENTED, keep secret for now
137     //fprintf(stderr, "  --norm-standards-file        Housekeeping/spike genes to normalize libraries       [ default:   NULL ]\n"); // NOT YET DOCUMENTED, keep secret for now
138     fprintf(stderr, "  -b/--frag-bias-correct       use bias correction - reference fasta required        [ default:   NULL ]\n");
139     fprintf(stderr, "  -u/--multi-read-correct      use 'rescue method' for multi-reads                   [ default:  FALSE ]\n");
140 #if ENABLE_THREADS
141 	fprintf(stderr, "  -p/--num-threads             number of threads used during quantification          [ default:      1 ]\n");
142 #endif
143     fprintf(stderr, "  --no-diff                    Don't generate differential analysis files            [ default:  FALSE ]\n");
144     fprintf(stderr, "  --no-js-tests                Don't perform isoform switching tests                 [ default:  FALSE ]\n");
145 	fprintf(stderr, "  -T/--time-series             treat samples as a time-series                        [ default:  FALSE ]\n");
146     fprintf(stderr, "  --library-type               Library prep used for input reads                     [ default:  below ]\n");
147     fprintf(stderr, "  --dispersion-method          Method used to estimate dispersion models             [ default:  below ]\n");
148     fprintf(stderr, "  --library-norm-method        Method used to normalize library sizes                [ default:  below ]\n");
149 
150     fprintf(stderr, "\nAdvanced Options:\n");
151     fprintf(stderr, "  -m/--frag-len-mean           average fragment length (unpaired reads only)         [ default:    200 ]\n");
152     fprintf(stderr, "  -s/--frag-len-std-dev        fragment length std deviation (unpaired reads only)   [ default:     80 ]\n");
153     fprintf(stderr, "  -c/--min-alignment-count     minimum number of alignments in a locus for testing   [ default:   10 ]\n");
154     fprintf(stderr, "  --max-mle-iterations         maximum iterations allowed for MLE calculation        [ default:   5000 ]\n");
155     fprintf(stderr, "  --compatible-hits-norm       count hits compatible with reference RNAs only        [ default:   TRUE ]\n");
156     fprintf(stderr, "  --total-hits-norm            count all hits for normalization                      [ default:  FALSE ]\n");
157     fprintf(stderr, "  -v/--verbose                 log-friendly verbose processing (no progress bar)     [ default:  FALSE ]\n");
158 	fprintf(stderr, "  -q/--quiet                   log-friendly quiet processing (no progress bar)       [ default:  FALSE ]\n");
159     fprintf(stderr, "  --seed                       value of random number generator seed                 [ default:      0 ]\n");
160     fprintf(stderr, "  --no-update-check            do not contact server to check for update availability[ default:  FALSE ]\n");
161     fprintf(stderr, "  --emit-count-tables          print count tables used to fit overdispersion         [    DEPRECATED   ]\n");
162     fprintf(stderr, "  --max-bundle-frags           maximum fragments allowed in a bundle before skipping [ default: 500000 ]\n");
163     fprintf(stderr, "  --num-frag-count-draws       Number of fragment generation samples                 [ default:    100 ]\n");
164     fprintf(stderr, "  --num-frag-assign-draws      Number of fragment assignment samples per generation  [ default:     50 ]\n");
165     fprintf(stderr, "  --max-frag-multihits         Maximum number of alignments allowed per fragment     [ default: unlim  ]\n");
166     fprintf(stderr, "  --min-outlier-p              Min replicate p value to admit for testing            [    DEPRECATED   ]\n");
167     fprintf(stderr, "  --min-reps-for-js-test       Replicates needed for relative isoform shift testing  [ default:      3 ]\n");
168     fprintf(stderr, "  --no-effective-length-correction   No effective length correction                  [ default:  FALSE ]\n");
169     fprintf(stderr, "  --no-length-correction       No length correction                                  [ default:  FALSE ]\n");
170     fprintf(stderr, "  -N/--upper-quartile-norm     Deprecated, use --library-norm-method                 [    DEPRECATED   ]\n");
171     fprintf(stderr, "  --geometric-norm             Deprecated, use --library-norm-method                 [    DEPRECATED   ]\n");
172     fprintf(stderr, "  --raw-mapped-norm            Deprecated, use --library-norm-method                 [    DEPRECATED   ]\n");
173     fprintf(stderr, "  --poisson-dispersion         Deprecated, use --dispersion-method                   [    DEPRECATED   ]\n");
174     fprintf(stderr, "\nDebugging use only:\n");
175     fprintf(stderr, "  --read-skip-fraction         Skip a random subset of reads this size               [ default:    0.0 ]\n");
176     fprintf(stderr, "  --no-read-pairs              Break all read pairs                                  [ default:  FALSE ]\n");
177     fprintf(stderr, "  --trim-read-length           Trim reads to be this long (keep 5' end)              [ default:   none ]\n");
178     fprintf(stderr, "  --no-scv-correction          Disable SCV correction                                [ default:  FALSE ]\n");
179     print_library_table();
180     print_dispersion_method_table();
181     print_lib_norm_method_table();
182 }
183 
parse_options(int argc,char ** argv)184 int parse_options(int argc, char** argv)
185 {
186     int option_index = 0;
187     int next_option;
188     string sample_label_list;
189     string dispersion_method_str;
190     string lib_norm_method_str;
191     do {
192         next_option = getopt_long_only(argc, argv, short_options, long_options, &option_index);
193         if (next_option == -1)     /* Done with options. */
194             break;
195         switch (next_option) {
196             case 0:
197                 /* If this option set a flag, do nothing else now. */
198                 if (long_options[option_index].flag != 0)
199                     break;
200                 break;
201 
202 			case 'm':
203 				user_provided_fld = true;
204 				def_frag_len_mean = (uint32_t)parseInt(0, "-m/--frag-len-mean arg must be at least 0", print_usage);
205 				break;
206 			case 'c':
207 				min_read_count = (uint32_t)parseInt(0, "-c/--min-alignment-count arg must be at least 0", print_usage);
208 				break;
209 			case 's':
210 				user_provided_fld = true;
211 				def_frag_len_std_dev = (uint32_t)parseInt(0, "-s/--frag-len-std-dev arg must be at least 0", print_usage);
212 				break;
213 			case 'p':
214 				num_threads = (uint32_t)parseInt(1, "-p/--num-threads arg must be at least 1", print_usage);
215 				break;
216             case 'F':
217 				min_isoform_fraction = parseFloat(0, 1.0, "-F/--min-isoform-fraction must be between 0 and 1.0", print_usage);
218 				break;
219             case 'L':
220 				sample_label_list = optarg;
221 				break;
222 			case OPT_FDR:
223 				FDR = (double)parseFloat(0.00, 1.00, "--FDR arg must be between 0 and 1", print_usage);
224 				break;
225 			case OPT_NUM_IMP_SAMPLES:
226 				num_importance_samples = parseInt(1, "--num-importance-samples must be at least 1", print_usage);
227 				break;
228 			case OPT_MLE_MAX_ITER:
229 				max_mle_iterations = parseInt(1, "--max-mle-iterations must be at least 1", print_usage);
230 				break;
231 			case OPT_BIAS_MODE:
232 				if (!strcmp(optarg, "site"))
233 					bias_mode = SITE;
234 				else if (!strcmp(optarg, "pos"))
235 					bias_mode = POS;
236 				else if (!strcmp(optarg, "pos_vlmm"))
237 					bias_mode = POS_VLMM;
238 				else if (!strcmp(optarg, "vlmm"))
239 					bias_mode = VLMM;
240                 else if (!strcmp(optarg, "pos_site"))
241 					bias_mode = POS_SITE;
242 				else
243 				{
244 					fprintf(stderr, "Unknown bias mode.\n");
245 					exit(1);
246 				}
247 				break;
248 			case 'M':
249 			{
250 				mask_gtf_filename = optarg;
251 				break;
252 			}
253             case 'C':
254 			{
255 				contrast_filename = optarg;
256 				break;
257 			}
258 			case OPT_NORM_STANDARDS_FILE:
259 			{
260 				norm_standards_filename = optarg;
261 				break;
262 			}
263             case OPT_USE_SAMPLE_SHEET:
264 			{
265                 use_sample_sheet = true;
266 				break;
267 			}
268             case 'v':
269 			{
270 				if (cuff_quiet)
271 				{
272 					fprintf(stderr, "Warning: Can't be both verbose and quiet!  Setting verbose only.\n");
273 				}
274 				cuff_quiet = false;
275 				cuff_verbose = true;
276 				break;
277 			}
278 			case 'q':
279 			{
280 				if (cuff_verbose)
281 				{
282 					fprintf(stderr, "Warning: Can't be both verbose and quiet!  Setting quiet only.\n");
283 				}
284 				cuff_verbose = false;
285 				cuff_quiet = true;
286 				break;
287 			}
288             case 'o':
289 			{
290 				output_dir = optarg;
291 				break;
292 			}
293 			case 'b':
294 			{
295 				fasta_dir = optarg;
296 				corr_bias = true;
297 				break;
298             }
299 
300             case 'T':
301 			{
302                 samples_are_time_series = true;
303 				break;
304             }
305             case 'N':
306             {
307             	lib_norm_method_str = "quartile";
308             	break;
309             }
310             case 'u':
311             {
312                 corr_multi = true;
313                 break;
314             }
315             case OPT_LIBRARY_TYPE:
316 			{
317 				library_type = optarg;
318 				break;
319 			}
320             case OPT_POISSON_DISPERSION:
321 			{
322 				fprintf (stderr, "Warning: --poisson-dispersion is deprecated, use --dispersion-method poisson instead.\n");
323 				break;
324 			}
325             case OPT_NO_UPDATE_CHECK:
326             {
327                 no_update_check = true;
328                 break;
329             }
330             case OPT_RANDOM_SEED:
331             {
332                 random_seed = parseInt(0, "--seed must be at least 0", print_usage);
333                 break;
334             }
335             case OPT_EMIT_COUNT_TABLES:
336             {
337                 emit_count_tables = true;
338                 break;
339             }
340             case OPT_COLLAPSE_COND_PROB:
341             {
342                 cond_prob_collapse = false;
343                 break;
344             }
345             case OPT_USE_COMPAT_MASS:
346             {
347                 use_compat_mass = true;
348                 break;
349             }
350             case OPT_USE_TOTAL_MASS:
351             {
352                 use_total_mass = true;
353                 break;
354             }
355             case OPT_USE_FISHER_COVARIANCE:
356             {
357                 use_fisher_covariance = true;
358                 break;
359             }
360             case OPT_USE_EMPIRICAL_COVARIANCE:
361             {
362                 use_fisher_covariance = false;
363                 break;
364             }
365             case OPT_SPLIT_MASS:
366             {
367                 split_variance = false;
368                 break;
369             }
370             case OPT_SPLIT_VARIANCE:
371             {
372                 split_variance = true;
373                 break;
374             }
375 
376             case OPT_MAX_FRAGS_PER_BUNDLE:
377             {
378                 max_frags_per_bundle = parseInt(0, "--max-bundle-frags must be at least 0", print_usage);
379                 break;
380             }
381             case OPT_READ_SKIP_FRACTION:
382             {
383                 read_skip_fraction = parseFloat(0, 1.0, "--read-skip-fraction must be between 0 and 1.0", print_usage);
384                 break;
385             }
386             case OPT_NO_READ_PAIRS:
387             {
388                 no_read_pairs = true;
389                 break;
390             }
391             case OPT_TRIM_READ_LENGTH:
392             {
393                 trim_read_length = parseInt(0, "--trim-read-length must be at least 1", print_usage);
394                 break;
395             }
396 
397             case OPT_NO_DIFF:
398             {
399                 no_differential = true;
400                 break;
401             }
402             case OPT_GEOMETRIC_NORM:
403             {
404                 lib_norm_method_str = "geometric";
405                 break;
406             }
407             case OPT_RAW_MAPPED_NORM:
408             {
409                 lib_norm_method_str = "classic-fpkm";
410                 break;
411             }
412             case OPT_NUM_FRAG_COUNT_DRAWS:
413             {
414                 num_frag_count_draws = parseInt(1, "--num-frag-count-draws must be at least 1", print_usage);
415                 break;
416             }
417             case OPT_NUM_FRAG_ASSIGN_DRAWS:
418             {
419                 num_frag_assignments = parseInt(1, "--num-frag-assign-draws must be at least 1", print_usage);
420                 break;
421             }
422             case OPT_FRAG_MAX_MULTIHITS:
423             {
424                 max_frag_multihits = parseInt(1, "--max-frag-multihits must be at least 1", print_usage);
425                 break;
426             }
427             case OPT_MIN_OUTLIER_P:
428             {
429                 min_outlier_p = parseFloat(0, 1.0, "--min-outlier-p must be between 0 and 1.0", print_usage);
430                 break;
431             }
432             case OPT_MIN_REPS_FOR_JS_TEST:
433             {
434                 min_reps_for_js_test = parseInt(1, "--min-reps-for-js-test must be at least 1", print_usage);
435                 break;
436             }
437             case OPT_NO_EFFECTIVE_LENGTH_CORRECTION:
438             {
439                 no_effective_length_correction = true;
440                 break;
441             }
442             case OPT_NO_LENGTH_CORRECTION:
443             {
444                 no_length_correction = true;
445                 break;
446             }
447             case OPT_NO_JS_TESTS:
448             {
449                 no_js_tests = true;
450                 break;
451             }
452             case OPT_DISPERSION_METHOD:
453 			{
454 				dispersion_method_str = optarg;
455 				break;
456 			}
457             case OPT_LIB_NORM_METHOD:
458 			{
459 				lib_norm_method_str = optarg;
460 				break;
461 			}
462             case OPT_NO_SCV_CORRECTION:
463             {
464                 no_scv_correction = true;
465                 break;
466             }
467 			default:
468 				print_usage();
469 				return 1;
470         }
471     } while(next_option != -1);
472 
473 	if (library_type != "")
474     {
475         map<string, ReadGroupProperties>::iterator lib_itr =
476 		library_type_table.find(library_type);
477         if (lib_itr == library_type_table.end())
478         {
479             fprintf(stderr, "Error: Library type %s not supported\n", library_type.c_str());
480             exit(1);
481         }
482         else
483         {
484             if (library_type == "transfrags")
485             {
486                 allow_junk_filtering = false;
487             }
488             global_read_properties = &lib_itr->second;
489         }
490     }
491     else
492     {
493 
494     }
495 
496     // Set the count dispersion method to use
497     if (dispersion_method_str == "")
498     {
499         dispersion_method_str = default_dispersion_method;
500     }
501 
502     map<string, DispersionMethod>::iterator disp_itr =
503     dispersion_method_table.find(dispersion_method_str);
504     if (disp_itr == dispersion_method_table.end())
505     {
506         fprintf(stderr, "Error: Dispersion method %s not supported\n", dispersion_method_str.c_str());
507         exit(1);
508     }
509     else
510     {
511         dispersion_method = disp_itr->second;
512     }
513 
514     // Set the library size normalization method to use
515     if (lib_norm_method_str == "")
516     {
517         lib_norm_method_str = default_lib_norm_method;
518     }
519 
520     map<string, LibNormalizationMethod>::iterator lib_norm_itr =
521     lib_norm_method_table.find(lib_norm_method_str);
522     if (lib_norm_itr == lib_norm_method_table.end())
523     {
524         fprintf(stderr, "Error: Dispersion method %s not supported\n", lib_norm_method_str.c_str());
525         exit(1);
526     }
527     else
528     {
529         lib_norm_method = lib_norm_itr->second;
530     }
531 
532 
533 
534     if (use_total_mass && use_compat_mass)
535     {
536         fprintf (stderr, "Error: please supply only one of --compatibile-hits-norm and --total-hits-norm\n");
537         exit(1);
538     }
539 
540     tokenize(sample_label_list, ",", sample_labels);
541 
542 	allow_junk_filtering = false;
543 
544 	return 0;
545 }
546 
print_tests(FILE * fout,const char * sample_1_label,const char * sample_2_label,const SampleDiffs & de_tests)547 void print_tests(FILE* fout,
548                  const char* sample_1_label,
549                  const char* sample_2_label,
550 				 const SampleDiffs& de_tests)
551 {
552 	for (SampleDiffs::const_iterator itr = de_tests.begin();
553 		 itr != de_tests.end();
554 		 ++itr)
555 	{
556 		const SampleDifference& test = itr->second;
557 
558         string all_gene_ids = cat_strings(test.meta_data->gene_ids);
559 		if (all_gene_ids == "")
560 			all_gene_ids = "-";
561 
562 		string all_gene_names = cat_strings(test.meta_data->gene_names);
563 		if (all_gene_names == "")
564 			all_gene_names = "-";
565 
566 		string all_protein_ids = cat_strings(test.meta_data->protein_ids);
567 		if (all_protein_ids == "")
568 			all_protein_ids = "-";
569 
570 		fprintf(fout, "%s\t%s\t%s\t%s\t%s\t%s",
571                 itr->first.c_str(),
572                 all_gene_ids.c_str(),
573                 all_gene_names.c_str(),
574                 test.meta_data->locus_desc.c_str(),
575                 sample_1_label,
576                 sample_2_label);
577 
578         double t = test.test_stat;
579         double r1 = test.value_1;
580         double r2 = test.value_2;
581         double d = test.differential;
582         double p = test.p_value;
583         double q = test.corrected_p;
584         const char* sig;
585         if (test.significant && test.test_status == OK)
586             sig = "yes";
587         else
588             sig = "no";
589 
590         const char* status = "OK";
591         if (test.test_status == OK)
592             status = "OK";
593         else if (test.test_status == LOWDATA)
594             status = "LOWDATA";
595         else if (test.test_status == HIDATA)
596             status = "HIDATA";
597         else if (test.test_status == NOTEST)
598             status = "NOTEST";
599         else
600             status = "FAIL";
601 
602         fprintf(fout, "\t%s\t%lg\t%lg\t%lg\t%lg\t%lg\t%lg\t%s", status, r1, r2, d, t, p, q, sig);
603         fprintf(fout, "\n");
604 	}
605 }
606 
print_FPKM_tracking(FILE * fout,const FPKMTrackingTable & tracking)607 void print_FPKM_tracking(FILE* fout,
608 						 const FPKMTrackingTable& tracking)
609 {
610 	fprintf(fout,"tracking_id\tclass_code\tnearest_ref_id\tgene_id\tgene_short_name\ttss_id\tlocus\tlength\tcoverage");
611 	FPKMTrackingTable::const_iterator first_itr = tracking.begin();
612 	if (first_itr != tracking.end())
613 	{
614 		const FPKMTracking& track = first_itr->second;
615 		const vector<FPKMContext>& fpkms = track.fpkm_series;
616 		for (size_t i = 0; i < fpkms.size(); ++i)
617 		{
618 			fprintf(fout, "\t%s_FPKM\t%s_conf_lo\t%s_conf_hi\t%s_status", sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str());
619 		}
620 	}
621 	fprintf(fout, "\n");
622 	for (FPKMTrackingTable::const_iterator itr = tracking.begin(); itr != tracking.end(); ++itr)
623 	{
624 		const string& description = itr->first;
625 		const FPKMTracking& track = itr->second;
626 		const vector<FPKMContext>& fpkms = track.fpkm_series;
627 
628         AbundanceStatus status = NUMERIC_OK;
629         BOOST_FOREACH (const FPKMContext& c, fpkms)
630         {
631             if (c.status == NUMERIC_FAIL)
632                 status = NUMERIC_FAIL;
633         }
634 
635         string all_gene_ids = cat_strings(track.gene_ids);
636 		if (all_gene_ids == "")
637 			all_gene_ids = "-";
638 
639 		string all_gene_names = cat_strings(track.gene_names);
640 		if (all_gene_names == "")
641 			all_gene_names = "-";
642 
643 		string all_tss_ids = cat_strings(track.tss_ids);
644 		if (all_tss_ids == "")
645 			all_tss_ids = "-";
646 
647         char length_buff[33] = "-";
648         if (track.length)
649             sprintf(length_buff, "%d", track.length);
650 
651         fprintf(fout, "%s\t%c\t%s\t%s\t%s\t%s\t%s\t%s\t%s",
652                 description.c_str(),
653                 track.classcode ? track.classcode : '-',
654                 track.ref_match.c_str(),
655                 all_gene_ids.c_str(),
656                 all_gene_names.c_str(),
657                 all_tss_ids.c_str(),
658                 track.locus_tag.c_str(),
659                 length_buff,
660                 "-");
661 
662 		for (size_t i = 0; i < fpkms.size(); ++i)
663 		{
664 			double fpkm = fpkms[i].FPKM;
665 			//double std_dev = sqrt(fpkms[i].FPKM_variance);
666 			double fpkm_conf_hi = fpkms[i].FPKM_conf_hi;
667 			double fpkm_conf_lo = fpkms[i].FPKM_conf_lo;
668             const char* status_str = "OK";
669 
670             if (fpkms[i].status == NUMERIC_OK)
671             {
672                 status_str = "OK";
673             }
674             else if (fpkms[i].status == NUMERIC_FAIL)
675             {
676                 status_str = "FAIL";
677             }
678             else if (fpkms[i].status == NUMERIC_LOW_DATA)
679             {
680                 status_str = "LOWDATA";
681             }
682             else if (fpkms[i].status == NUMERIC_HI_DATA)
683             {
684                 status_str = "HIDATA";
685             }
686             else
687             {
688                 assert(false);
689             }
690 
691 			fprintf(fout, "\t%lg\t%lg\t%lg\t%s", fpkm, fpkm_conf_lo, fpkm_conf_hi, status_str);
692 		}
693 
694 		fprintf(fout, "\n");
695 	}
696 }
697 
print_count_tracking(FILE * fout,const FPKMTrackingTable & tracking)698 void print_count_tracking(FILE* fout,
699 						  const FPKMTrackingTable& tracking)
700 {
701 	fprintf(fout,"tracking_id");
702 	FPKMTrackingTable::const_iterator first_itr = tracking.begin();
703 	if (first_itr != tracking.end())
704 	{
705 		const FPKMTracking& track = first_itr->second;
706 		const vector<FPKMContext>& fpkms = track.fpkm_series;
707 		for (size_t i = 0; i < fpkms.size(); ++i)
708 		{
709 			fprintf(fout, "\t%s_count\t%s_count_variance\t%s_count_uncertainty_var\t%s_count_dispersion_var\t%s_status", sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str());
710 		}
711 	}
712 	fprintf(fout, "\n");
713 	for (FPKMTrackingTable::const_iterator itr = tracking.begin(); itr != tracking.end(); ++itr)
714 	{
715 		const string& description = itr->first;
716 		const FPKMTracking& track = itr->second;
717 		const vector<FPKMContext>& fpkms = track.fpkm_series;
718 
719         AbundanceStatus status = NUMERIC_OK;
720         BOOST_FOREACH (const FPKMContext& c, fpkms)
721         {
722             if (c.status == NUMERIC_FAIL)
723                 status = NUMERIC_FAIL;
724         }
725 
726         fprintf(fout, "%s",
727                 description.c_str());
728 
729 		for (size_t i = 0; i < fpkms.size(); ++i)
730 		{
731             const char* status_str = "OK";
732 
733             if (fpkms[i].status == NUMERIC_OK)
734             {
735                 status_str = "OK";
736             }
737             else if (fpkms[i].status == NUMERIC_FAIL)
738             {
739                 status_str = "FAIL";
740             }
741             else if (fpkms[i].status == NUMERIC_LOW_DATA)
742             {
743                 status_str = "LOWDATA";
744             }
745             else if (fpkms[i].status == NUMERIC_HI_DATA)
746             {
747                 status_str = "HIDATA";
748             }
749             else
750             {
751                 assert(false);
752             }
753 
754             double external_counts = fpkms[i].count_mean;
755             double external_count_var = fpkms[i].count_var;
756             double uncertainty_var = fpkms[i].count_uncertainty_var;
757             double dispersion_var = fpkms[i].count_dispersion_var;
758 			fprintf(fout, "\t%lg\t%lg\t%lg\t%lg\t%s", external_counts, external_count_var, uncertainty_var, dispersion_var, status_str);
759 		}
760 
761 		fprintf(fout, "\n");
762 	}
763 }
764 
print_read_group_tracking(FILE * fout,const FPKMTrackingTable & tracking)765 void print_read_group_tracking(FILE* fout,
766                                const FPKMTrackingTable& tracking)
767 {
768 	fprintf(fout,"tracking_id\tcondition\treplicate\traw_frags\tinternal_scaled_frags\texternal_scaled_frags\tFPKM\teffective_length\tstatus");
769 
770 	fprintf(fout, "\n");
771 	for (FPKMTrackingTable::const_iterator itr = tracking.begin(); itr != tracking.end(); ++itr)
772 	{
773 		const string& description = itr->first;
774 		const FPKMTracking& track = itr->second;
775 		const vector<FPKMContext>& fpkms = track.fpkm_series;
776 
777 		for (size_t i = 0; i < fpkms.size(); ++i)
778 		{
779             for (size_t j = 0; j != fpkms[i].tracking_info_per_rep.size();
780                  ++j)
781             {
782                 double FPKM = fpkms[i].tracking_info_per_rep[j].fpkm;
783                 double internal_count = fpkms[i].tracking_info_per_rep[j].count;
784                 double external_count = internal_count / fpkms[i].tracking_info_per_rep[j].rg_props->external_scale_factor();
785                 double raw_count = internal_count * fpkms[i].tracking_info_per_rep[j].rg_props->internal_scale_factor();
786                 const  string& condition_name = fpkms[i].tracking_info_per_rep[j].rg_props->condition_name();
787                 AbundanceStatus status = fpkms[i].tracking_info_per_rep[j].status;
788 
789                 int rep_num = fpkms[i].tracking_info_per_rep[j].rg_props->replicate_num();
790 
791                 const char* status_str = "OK";
792 
793                 if (status == NUMERIC_OK)
794                 {
795                     status_str = "OK";
796                 }
797                 else if (status == NUMERIC_FAIL)
798                 {
799                     status_str = "FAIL";
800                 }
801                 else if (status == NUMERIC_LOW_DATA)
802                 {
803                     status_str = "LOWDATA";
804                 }
805                 else if (status == NUMERIC_HI_DATA)
806                 {
807                     status_str = "HIDATA";
808                 }
809                 else
810                 {
811                     assert(false);
812                 }
813 
814                 fprintf(fout, "%s\t%s\t%d\t%lg\t%lg\t%lg\t%lg\t%s\t%s\n",
815                         description.c_str(),
816                         condition_name.c_str(),
817                         rep_num,
818                         raw_count,
819                         internal_count,
820                         external_count,
821                         FPKM,
822                         "-",
823                         status_str);
824             }
825 		}
826 	}
827 }
828 
print_read_group_info(FILE * fout,const vector<boost::shared_ptr<ReadGroupProperties>> & all_read_groups)829 void print_read_group_info(FILE* fout,
830                            const vector<boost::shared_ptr<ReadGroupProperties> >& all_read_groups)
831 {
832     fprintf(fout, "file\tcondition\treplicate_num\ttotal_mass\tnorm_mass\tinternal_scale\texternal_scale\n");
833     for (size_t i = 0; i < all_read_groups.size(); ++i)
834     {
835         boost::shared_ptr<ReadGroupProperties const> rg_props = all_read_groups[i];
836         fprintf(fout, "%s\t%s\t%d\t%Lg\t%Lg\t%lg\t%lg\n",
837                 rg_props->file_path().c_str(),
838                 rg_props->condition_name().c_str(),
839                 rg_props->replicate_num(),
840                 rg_props->total_map_mass(),
841                 rg_props->normalized_map_mass(),
842                 rg_props->internal_scale_factor(),
843                 rg_props->external_scale_factor());
844 
845     }
846 }
847 
print_run_info(FILE * fout)848 void print_run_info(FILE* fout)
849 {
850     fprintf(fout, "param\tvalue\n");
851     fprintf(fout, "cmd_line\t%s\n", cmd_str.c_str());
852     fprintf(fout, "version\t%s\n", PACKAGE_VERSION);
853     fprintf(fout, "SVN_revision\t%s\n",SVN_REVISION);
854     fprintf(fout, "boost_version\t%d\n", BOOST_VERSION);
855 }
856 
p_value_lt(const SampleDifference * lhs,const SampleDifference * rhs)857 bool p_value_lt(const SampleDifference* lhs, const SampleDifference* rhs)
858 {
859 	return lhs->p_value < rhs->p_value;
860 }
861 
862 // Benjamani-Hochberg procedure
fdr_significance(double fdr,vector<SampleDifference * > & tests)863 int fdr_significance(double fdr,
864                      vector<SampleDifference*>& tests)
865 {
866 	sort(tests.begin(), tests.end(), p_value_lt);
867 	vector<SampleDifference*> passing;
868 
869 	for (int k = 0; k < (int)tests.size(); ++k)
870 	{
871 		if (tests[k]->test_status == OK)
872 		{
873 			passing.push_back(tests[k]);
874 		}
875 		else
876 		{
877 			tests[k]->significant = false;
878 		}
879 	}
880 	int significant = 0;
881 	float pmin=1;
882 	int n = (int) passing.size();
883     //use the same procedure as p.adjust(...,"BH") in R
884 	for (int k = n-1; k >= 0; k--)
885 	{
886 		double corrected_p = (double) passing[k]->p_value * ((double) n/(double) (k+1));
887         //make sure that no entry with lower p-value will get higher q-value than any entry with higher p-value
888 		if (corrected_p < pmin)
889 		{
890 			pmin = corrected_p;
891 		}
892 		else
893 		{
894 			corrected_p = pmin;
895 		}
896         // make sure that the q-value is always <= 1
897 		passing[k]->corrected_p = (corrected_p < 1 ? corrected_p : 1);
898 		passing[k]->significant = (corrected_p <= fdr);
899         significant += passing[k]->significant;
900 	}
901 
902 	return passing.size();
903 }
904 
905 
extract_sample_diffs(SampleDiffs & diff_map,vector<SampleDifference * > & diffs)906 void extract_sample_diffs(SampleDiffs& diff_map,
907 						  vector<SampleDifference*>& diffs)
908 {
909 	for (SampleDiffs::iterator itr = diff_map.begin();
910 		 itr != diff_map.end();
911 		 ++itr)
912 	{
913 		diffs.push_back(&(itr->second));
914 	}
915 }
916 
917 #if ENABLE_THREADS
918 boost::mutex inspect_lock;
919 #endif
920 
inspect_map_worker(ReplicatedBundleFactory & fac,int & tmp_min_frag_len,int & tmp_max_frag_len,IdToLocusMap & id_to_locus_map)921 void inspect_map_worker(ReplicatedBundleFactory& fac,
922                         int& tmp_min_frag_len,
923                         int& tmp_max_frag_len,
924                         IdToLocusMap& id_to_locus_map)
925 {
926 #if ENABLE_THREADS
927 	boost::this_thread::at_thread_exit(decr_pool_count);
928 #endif
929 
930     int min_f = std::numeric_limits<int>::max();
931     int max_f = 0;
932 
933     fac.inspect_replicate_maps(min_f, max_f, id_to_locus_map);
934 
935 #if ENABLE_THREADS
936     inspect_lock.lock();
937 #endif
938     tmp_min_frag_len = min(min_f, tmp_min_frag_len);
939     tmp_max_frag_len = max(max_f, tmp_max_frag_len);
940 #if ENABLE_THREADS
941     inspect_lock.unlock();
942 #endif
943 }
944 
learn_bias_worker(boost::shared_ptr<BundleFactory> fac)945 void learn_bias_worker(boost::shared_ptr<BundleFactory> fac)
946 {
947 #if ENABLE_THREADS
948 	boost::this_thread::at_thread_exit(decr_pool_count);
949 #endif
950 	boost::shared_ptr<ReadGroupProperties> rg_props = fac->read_group_properties();
951 	BiasLearner* bl = new BiasLearner(rg_props->frag_len_dist());
952 	learn_bias(*fac, *bl, false);
953 	rg_props->bias_learner(boost::shared_ptr<BiasLearner>(bl));
954 }
955 
956 
957 boost::shared_ptr<TestLauncher> test_launcher;
958 
quantitate_next_locus(const RefSequenceTable & rt,vector<boost::shared_ptr<ReplicatedBundleFactory>> & bundle_factories,boost::shared_ptr<TestLauncher> launcher)959 bool quantitate_next_locus(const RefSequenceTable& rt,
960                            vector<boost::shared_ptr<ReplicatedBundleFactory> >& bundle_factories,
961                            boost::shared_ptr<TestLauncher> launcher)
962 {
963     for (size_t i = 0; i < bundle_factories.size(); ++i)
964     {
965         boost::shared_ptr<SampleAbundances> s_ab = boost::shared_ptr<SampleAbundances>(new SampleAbundances);
966 
967 #if ENABLE_THREADS
968         while(1)
969         {
970             locus_thread_pool_lock.lock();
971             if (locus_curr_threads < locus_num_threads)
972             {
973                 break;
974             }
975 
976             locus_thread_pool_lock.unlock();
977 
978             boost::this_thread::sleep(boost::posix_time::milliseconds(5));
979 
980         }
981 
982         locus_curr_threads++;
983         locus_thread_pool_lock.unlock();
984 
985         boost::shared_ptr<HitBundle> pBundle = boost::shared_ptr<HitBundle>(new HitBundle());
986         bool non_empty = bundle_factories[i]->next_bundle(*pBundle, true);
987 
988         if (pBundle->compatible_mass() > 0)
989         {
990             boost::thread quantitate(sample_worker,
991                               non_empty,
992                               pBundle,
993                               boost::ref(rt),
994                               boost::ref(*(bundle_factories[i])),
995                               s_ab,
996                               i,
997                               launcher,
998                               true);
999         }
1000         else
1001         {
1002             sample_worker(non_empty,
1003                           pBundle,
1004                           boost::ref(rt),
1005                           boost::ref(*(bundle_factories[i])),
1006                           s_ab,
1007                           i,
1008                           launcher,
1009                           true);
1010             locus_thread_pool_lock.lock();
1011             locus_curr_threads--;
1012             locus_thread_pool_lock.unlock();
1013         }
1014 #else
1015         HitBundle bundle;
1016         bool non_empty = sample_factory.next_bundle(bundle);
1017 
1018         sample_worker(non_emtpy,
1019                       pBundle,
1020                       boost::ref(rt),
1021                       boost::ref(*(bundle_factories[i])),
1022                       s_ab,
1023                       i,
1024                       launcher,
1025                       true);
1026 #endif
1027     }
1028     return true;
1029 }
1030 
parse_contrast_file(FILE * contrast_file,const vector<boost::shared_ptr<ReplicatedBundleFactory>> & factories,vector<pair<size_t,size_t>> & contrasts)1031 void parse_contrast_file(FILE* contrast_file,
1032                          const vector<boost::shared_ptr<ReplicatedBundleFactory> >& factories,
1033                          vector<pair<size_t, size_t > >& contrasts)
1034 {
1035 
1036     char pBuf[10 * 1024];
1037     size_t non_blank_lines_read = 0;
1038 
1039     map<string, pair<string, string> > contrast_table;
1040     map<string, size_t > factor_name_to_factory_idx;
1041 
1042     for (size_t j = 0; j < factories.size(); ++j)
1043     {
1044         string factor_name = factories[j]->condition_name();
1045         if (factor_name_to_factory_idx.find(factor_name) != factor_name_to_factory_idx.end())
1046         {
1047             fprintf(stderr, "Error in contrast file: condition names must be unique! (%s is duplicated)", factor_name.c_str());
1048             exit(1);
1049         }
1050         factor_name_to_factory_idx[factor_name] = j;
1051     }
1052 
1053     while (fgets(pBuf, 10*1024, contrast_file))
1054     {
1055         if (strlen(pBuf) > 0)
1056         {
1057             char* nl = strchr(pBuf, '\n');
1058             if (nl)
1059                 *nl = 0;
1060 
1061             string pBufstr = pBuf;
1062             string trimmed = boost::trim_copy(pBufstr);
1063 
1064             if (trimmed.length() > 0 && trimmed[0] != '#')
1065             {
1066                 non_blank_lines_read++;
1067                 vector<string> columns;
1068                 tokenize(trimmed, "\t", columns);
1069 
1070                 if (non_blank_lines_read == 1)
1071                     continue;
1072 
1073                 if (columns.size() < 2)
1074                 {
1075                     if (columns.size() > 0)
1076                         fprintf(stderr, "Malformed record in contrast file: \n   >  %s\n", pBuf);
1077                     else
1078                         continue;
1079                 }
1080 
1081                 string factor_1 = columns[0];
1082                 string factor_2 = columns[1];
1083 
1084                 if (columns.size() >= 3)
1085                 {
1086                     string contrast_name = columns[2];
1087                     contrast_table.insert(make_pair(contrast_name, make_pair(factor_1, factor_2)));
1088                 }
1089                 else
1090                 {
1091                     char contrast_name[1024];
1092                     sprintf(contrast_name, "contrast_%lu", contrast_table.size());
1093                     contrast_table.insert(make_pair(contrast_name, make_pair(factor_1, factor_2)));
1094                 }
1095             }
1096         }
1097     }
1098 
1099     for (map<string, pair<string, string> >::const_iterator itr = contrast_table.begin();
1100          itr != contrast_table.end(); ++itr)
1101     {
1102         string factor_1 = itr->second.first;
1103         map<string, size_t >::iterator f1_itr = factor_name_to_factory_idx.find(factor_1);
1104         if (f1_itr == factor_name_to_factory_idx.end())
1105         {
1106             fprintf (stderr, "Error: condition %s not found among samples\n", factor_1.c_str());
1107             exit(1);
1108         }
1109         size_t f1_idx = f1_itr->second;
1110 
1111         string factor_2 = itr->second.second;
1112         map<string, size_t >::iterator f2_itr = factor_name_to_factory_idx.find(factor_2);
1113         if (f2_itr == factor_name_to_factory_idx.end())
1114         {
1115             fprintf (stderr, "Error: condition %s not found among samples\n", factor_2.c_str());
1116             exit(1);
1117         }
1118         size_t f2_idx = f2_itr->second;
1119         contrasts.push_back(make_pair(f2_idx, f1_idx));
1120     }
1121  }
1122 
parse_sample_sheet_file(FILE * sample_sheet_file,vector<string> & sample_labels,vector<string> & sam_hit_filename_lists)1123 void parse_sample_sheet_file(FILE* sample_sheet_file,
1124                              vector<string>& sample_labels,
1125                              vector<string>& sam_hit_filename_lists)
1126 {
1127 
1128     char pBuf[10 * 1024];
1129     size_t non_blank_lines_read = 0;
1130 
1131     sample_labels.clear();
1132 
1133     map<string, vector<string> > sample_groups;
1134 
1135     while (fgets(pBuf, 10*1024, sample_sheet_file))
1136     {
1137         if (strlen(pBuf) > 0)
1138         {
1139             char* nl = strchr(pBuf, '\n');
1140             if (nl)
1141                 *nl = 0;
1142 
1143             string pBufstr = pBuf;
1144             string trimmed = boost::trim_copy(pBufstr);
1145 
1146             if (trimmed.length() > 0 && trimmed[0] != '#')
1147             {
1148                 non_blank_lines_read++;
1149                 vector<string> columns;
1150                 tokenize(trimmed, "\t", columns);
1151 
1152                 if (non_blank_lines_read == 1)
1153                     continue;
1154 
1155                 if (columns.size() < 2)
1156                 {
1157                     if (columns.size() > 0)
1158                         fprintf(stderr, "Malformed record in sample sheet: \n   >  %s\n", pBuf);
1159                     else
1160                         continue;
1161                 }
1162 
1163                 string sam_file = columns[0];
1164                 string sample_group = columns[1];
1165 
1166                 pair<map<string, vector<string> >::iterator, bool> inserted = sample_groups.insert(make_pair(sample_group, vector<string>()));
1167                 inserted.first->second.push_back(sam_file);
1168             }
1169         }
1170     }
1171 
1172     for (map<string, vector<string> >::iterator itr = sample_groups.begin();
1173          itr != sample_groups.end(); ++itr)
1174     {
1175         sample_labels.push_back(itr->first);
1176         string sam_list = boost::join(itr->second, ",");
1177         sam_hit_filename_lists.push_back(sam_list);
1178     }
1179 }
1180 
1181 
init_default_contrasts(const vector<boost::shared_ptr<ReplicatedBundleFactory>> & factories,bool samples_are_time_series,vector<pair<size_t,size_t>> & contrasts)1182 void init_default_contrasts(const vector<boost::shared_ptr<ReplicatedBundleFactory> >& factories,
1183                             bool samples_are_time_series,
1184                             vector<pair<size_t, size_t > >& contrasts)
1185 {
1186 
1187     for (size_t i = 1; i < factories.size(); ++i)
1188     {
1189         //bool multi_transcript_locus = samples[i]->transcripts.abundances().size() > 1;
1190 
1191         int sample_to_start_test_against = 0;
1192 
1193         if (samples_are_time_series)
1194             sample_to_start_test_against = i - 1;
1195 
1196         for (size_t j = sample_to_start_test_against; j < i; ++j)
1197         {
1198             contrasts.push_back(make_pair(i,j));
1199         }
1200     }
1201 }
1202 
parse_norm_standards_file(FILE * norm_standards_file)1203 void parse_norm_standards_file(FILE* norm_standards_file)
1204 {
1205     char pBuf[10 * 1024];
1206     size_t non_blank_lines_read = 0;
1207 
1208     boost::shared_ptr<map<string, LibNormStandards> > norm_standards(new map<string, LibNormStandards>);
1209 
1210     while (fgets(pBuf, 10*1024, norm_standards_file))
1211     {
1212         if (strlen(pBuf) > 0)
1213         {
1214             char* nl = strchr(pBuf, '\n');
1215             if (nl)
1216                 *nl = 0;
1217 
1218             string pBufstr = pBuf;
1219             string trimmed = boost::trim_copy(pBufstr);
1220 
1221             if (trimmed.length() > 0 && trimmed[0] != '#')
1222             {
1223                 non_blank_lines_read++;
1224                 vector<string> columns;
1225                 tokenize(trimmed, "\t", columns);
1226 
1227                 if (non_blank_lines_read == 1)
1228                     continue;
1229 
1230                 if (columns.size() < 1) //
1231                 {
1232                     continue;
1233                 }
1234 
1235                 string gene_id = columns[0];
1236                 LibNormStandards L;
1237                 norm_standards->insert(make_pair(gene_id, L));
1238             }
1239         }
1240     }
1241     lib_norm_standards = norm_standards;
1242 }
1243 
1244 
print_variability_models(FILE * var_model_out,const vector<boost::shared_ptr<ReplicatedBundleFactory>> & factories)1245 void print_variability_models(FILE* var_model_out, const vector<boost::shared_ptr<ReplicatedBundleFactory> >& factories)
1246 {
1247 
1248     fprintf(var_model_out, "condition\tlocus\tcompatible_count_mean\tcompatible_count_var\ttotal_count_mean\ttotal_count_var\tfitted_var\n");
1249 
1250     for (size_t i = 0; i < factories.size(); ++i)
1251     {
1252         string factor_name = factories[i]->condition_name();
1253         boost::shared_ptr<ReadGroupProperties> rg = factories[i]->factories()[0]->read_group_properties();
1254         boost::shared_ptr<MassDispersionModel const> model = rg->mass_dispersion_model();
1255 //        const vector<double>& means = model->scaled_compatible_mass_means();
1256 //        const vector<double>& raw_vars  = model->scaled_compatible_variances();
1257 
1258         const vector<LocusCount>& common_scale_compatible_counts = rg->common_scale_compatible_counts();
1259         for (size_t j = 0; j < common_scale_compatible_counts.size(); ++j)
1260         {
1261             string locus_desc = common_scale_compatible_counts[j].locus_desc;
1262             pair<double, double> compat_mean_and_var = model->get_compatible_mean_and_var(locus_desc);
1263             pair<double, double> total_mean_and_var = model->get_total_mean_and_var(locus_desc);
1264 //            double total_compat_count = 0;
1265 //            if (itr != locus_to_total_count_table.end())
1266 //                total_compat_count = itr->second.count;
1267 
1268 
1269             fprintf(var_model_out, "%s\t%s\t%lg\t%lg\t%lg\t%lg\t%lg\n",
1270                     factor_name.c_str(),
1271                     locus_desc.c_str(),
1272                     compat_mean_and_var.first,
1273                     compat_mean_and_var.second,
1274                     total_mean_and_var.first,
1275                     total_mean_and_var.second,
1276                     model->scale_mass_variance(compat_mean_and_var.first));
1277         }
1278     }
1279     fclose(var_model_out);
1280 
1281 }
1282 
1283 struct DispModelAverageContext
1284 {
1285     double compatible_count_mean;
1286     double total_count_mean;
1287     double compatible_count_var;
1288     double total_count_var;
1289 
1290     double fitted_var;
1291     double weight;
1292 };
1293 
fit_dispersions(vector<boost::shared_ptr<ReplicatedBundleFactory>> & bundle_factories)1294 void fit_dispersions(vector<boost::shared_ptr<ReplicatedBundleFactory> >& bundle_factories)
1295 {
1296     if (dispersion_method == PER_CONDITION)
1297     {
1298         for (size_t i = 0; i < bundle_factories.size(); ++i)
1299         {
1300             bundle_factories[i]->fit_dispersion_model();
1301         }
1302     }
1303     else if (dispersion_method == BLIND)
1304     {
1305         size_t num_samples = 0;
1306         for (size_t cond_idx = 0; cond_idx < bundle_factories.size(); ++cond_idx)
1307         {
1308             const vector<boost::shared_ptr<BundleFactory> >& factories = bundle_factories[cond_idx]->factories();
1309             for (size_t fac_idx = 0; fac_idx < factories.size(); ++fac_idx)
1310             {
1311                 num_samples++;
1312             }
1313         }
1314         vector<double> scale_factors;
1315 
1316         vector<LocusCountList> sample_compatible_count_table;
1317         vector<LocusCountList> sample_total_count_table;
1318         size_t curr_fac = 0;
1319         for (size_t cond_idx = 0; cond_idx < bundle_factories.size(); ++cond_idx)
1320         {
1321             vector<boost::shared_ptr<BundleFactory> > factories = bundle_factories[cond_idx]->factories();
1322             for (size_t fac_idx = 0; fac_idx < factories.size(); ++fac_idx)
1323             {
1324                 boost::shared_ptr<BundleFactory> fac = factories[fac_idx];
1325 
1326                 boost::shared_ptr<ReadGroupProperties> rg_props = fac->read_group_properties();
1327                 const vector<LocusCount>& compatible_count_table = rg_props->common_scale_compatible_counts();
1328                 const vector<LocusCount>& total_count_table = rg_props->common_scale_total_counts();
1329 
1330                 for (size_t i = 0; i < compatible_count_table.size(); ++i)
1331                 {
1332                     const LocusCount& c = compatible_count_table[i];
1333                     double common_scale_compatible_count = c.count;
1334                     double common_scale_total_count = total_count_table[i].count;
1335 
1336                     if (i >= sample_compatible_count_table.size())
1337                     {
1338                         LocusCountList locus_count(c.locus_desc, num_samples, c.num_transcripts, c.gene_ids, c.gene_short_names);
1339                         sample_compatible_count_table.push_back(locus_count);
1340                         sample_compatible_count_table.back().counts[0] = common_scale_compatible_count;
1341                         sample_total_count_table.push_back(locus_count);
1342                         sample_total_count_table.back().counts[0] = common_scale_total_count;
1343                     }
1344                     else
1345                     {
1346                         if (sample_compatible_count_table[i].locus_desc != c.locus_desc)
1347                         {
1348                             fprintf (stderr, "Error: bundle boundaries don't match across replicates!\n");
1349                             exit(1);
1350                         }
1351                         sample_compatible_count_table[i].counts[curr_fac + fac_idx] = common_scale_compatible_count;
1352                         sample_total_count_table[i].counts[curr_fac + fac_idx] = common_scale_total_count;
1353                     }
1354                 }
1355                 scale_factors.push_back(rg_props->internal_scale_factor());
1356 
1357             }
1358 
1359             curr_fac += factories.size();
1360         }
1361 
1362         boost::shared_ptr<MassDispersionModel> disperser = fit_dispersion_model("blind", scale_factors, sample_compatible_count_table);
1363 
1364         vector<pair<double, double> > compatible_means_and_vars;
1365         calculate_count_means_and_vars(sample_compatible_count_table,
1366                                        compatible_means_and_vars);
1367 
1368         for (size_t i = 0; i < sample_compatible_count_table.size(); ++i)
1369         {
1370             const LocusCountList& p = sample_compatible_count_table[i];
1371             double mean = compatible_means_and_vars[i].first;
1372             double var = compatible_means_and_vars[i].second;
1373             disperser->set_compatible_mean_and_var(p.locus_desc, make_pair(mean, var));
1374         }
1375 
1376         vector<pair<double, double> > total_means_and_vars;
1377         calculate_count_means_and_vars(sample_total_count_table,
1378                                        total_means_and_vars);
1379 
1380         for (size_t i = 0; i < sample_total_count_table.size(); ++i)
1381         {
1382             const LocusCountList& p = sample_compatible_count_table[i];
1383             double mean = total_means_and_vars[i].first;
1384             double var = total_means_and_vars[i].second;
1385             disperser->set_total_mean_and_var(p.locus_desc, make_pair(mean, var));
1386         }
1387 
1388         for (size_t cond_idx = 0; cond_idx < bundle_factories.size(); ++cond_idx)
1389         {
1390             bundle_factories[cond_idx]->mass_dispersion_model(disperser);
1391         }
1392     }
1393     else if (dispersion_method == POOLED)
1394     {
1395         for (size_t i = 0; i < bundle_factories.size(); ++i)
1396         {
1397             bundle_factories[i]->fit_dispersion_model();
1398         }
1399         // now need to replace them with the average
1400 
1401         boost::shared_ptr<MassDispersionModel> pooled_model;
1402         // Let's compute the pooled average of the dispersion models
1403         if (dispersion_method != BLIND)
1404         {
1405             vector<boost::shared_ptr<MassDispersionModel const> > disp_models;
1406             double total_replicates = 0.0;
1407             vector<double> disp_model_weight;
1408             BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
1409             {
1410                 total_replicates += fac->num_replicates();
1411             }
1412             BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
1413             {
1414                 if (fac->num_replicates() > 1)
1415                 {
1416                     disp_models.push_back(fac->mass_dispersion_model());
1417                     disp_model_weight.push_back((double)fac->num_replicates() / total_replicates);
1418                 }
1419             }
1420 
1421             double max_mass = 0.0;
1422 
1423             BOOST_FOREACH(boost::shared_ptr<MassDispersionModel const> disp, disp_models)
1424             {
1425                 if (disp->scaled_compatible_mass_means().empty() == false && max_mass < disp->scaled_compatible_mass_means().back())
1426                 {
1427                     max_mass = disp->scaled_compatible_mass_means().back();
1428                 }
1429             }
1430 
1431             map<std::string, vector<DispModelAverageContext> > disp_info_by_locus;
1432             for (size_t disp_idx = 0; disp_idx < disp_models.size(); ++disp_idx)
1433             {
1434                 boost::shared_ptr<MassDispersionModel const> disp = disp_models[disp_idx];
1435 
1436                 const std::map<std::string, std::pair<double, double> >& total_mv_by_locus = disp->total_mv_by_locus();
1437                 const std::map<std::string, std::pair<double, double> >& compatible_mv_by_locus = disp->compatible_mv_by_locus();
1438 
1439                 for (map<std::string, std::pair<double, double> >::const_iterator itr = compatible_mv_by_locus.begin();
1440                      itr != compatible_mv_by_locus.end(); ++itr)
1441                 {
1442 
1443                     std::map<std::string, std::pair<double, double> >::const_iterator total_itr = total_mv_by_locus.find(itr->first);
1444                     if (total_itr == total_mv_by_locus.end())
1445                         continue;
1446 
1447                     pair<map<std::string, vector<DispModelAverageContext> >::iterator, bool> ins_pair =
1448                     disp_info_by_locus.insert(make_pair(itr->first, vector<DispModelAverageContext>()));
1449 
1450 
1451                     DispModelAverageContext ctx;
1452                     ctx.compatible_count_mean = itr->second.first;
1453                     ctx.compatible_count_var = itr->second.second;
1454                     ctx.total_count_mean = total_itr->second.first;
1455                     ctx.total_count_var = total_itr->second.second;
1456                     if (use_compat_mass)
1457                         ctx.fitted_var = disp->scale_mass_variance(ctx.compatible_count_mean);
1458                     else
1459                         ctx.fitted_var = disp->scale_mass_variance(ctx.total_count_mean);
1460 
1461                     ctx.weight = disp_model_weight[disp_idx];
1462 
1463                     ins_pair.first->second.push_back(ctx);
1464                 }
1465             }
1466 
1467             map<string, DispModelAverageContext> pooled_info_by_locus;
1468 
1469             for (map<std::string, vector<DispModelAverageContext> >::const_iterator itr = disp_info_by_locus.begin();
1470                  itr != disp_info_by_locus.end();
1471                  ++itr)
1472             {
1473                 DispModelAverageContext avg_ctx;
1474                 avg_ctx.compatible_count_mean = 0;
1475                 avg_ctx.compatible_count_var = 0;
1476                 avg_ctx.total_count_mean = 0;
1477                 avg_ctx.total_count_var = 0;
1478                 avg_ctx.fitted_var = 0;
1479 
1480                 double total_weight = 0.0;
1481                 for (size_t i = 0; i < itr->second.size(); ++i)
1482                 {
1483                     total_weight += itr->second[i].weight;
1484                 }
1485 
1486                 for (size_t i = 0; i < itr->second.size(); ++i)
1487                 {
1488                     avg_ctx.compatible_count_mean += (itr->second[i].weight / total_weight) * itr->second[i].compatible_count_mean;
1489                     avg_ctx.compatible_count_var += (itr->second[i].weight / total_weight) * itr->second[i].compatible_count_var;
1490                     avg_ctx.total_count_mean += (itr->second[i].weight / total_weight) * itr->second[i].total_count_mean;
1491                     avg_ctx.total_count_var += (itr->second[i].weight / total_weight) * itr->second[i].total_count_var;
1492                 }
1493                 pooled_info_by_locus[itr->first] = avg_ctx;
1494             }
1495 
1496             vector<double> compatible_mass;
1497             vector<double> compatible_variances;
1498             vector<double> est_fitted_var;
1499             double epsilon = 0.2;
1500             for (double frag_idx = 0.0; frag_idx < max_mass; frag_idx += epsilon)
1501             {
1502                 compatible_mass.push_back(frag_idx);
1503                 double var_est = 0.0;
1504                 for(size_t i = 0; i < disp_models.size(); ++i)
1505                 {
1506                     boost::shared_ptr<MassDispersionModel const> disp = disp_models[i];
1507                     double weight = disp_model_weight[i];
1508                     var_est += disp->scale_mass_variance(frag_idx) * weight;
1509                 }
1510                 compatible_variances.push_back(var_est);
1511                 est_fitted_var.push_back(var_est);
1512             }
1513 
1514             pooled_model = boost::shared_ptr<MassDispersionModel>(new MassDispersionModel("pooled", compatible_mass, compatible_variances, est_fitted_var));
1515 
1516             for (map<std::string, DispModelAverageContext>::iterator itr = pooled_info_by_locus.begin();
1517                  itr != pooled_info_by_locus.end();
1518                  ++itr)
1519             {
1520                 const string& locus = itr->first;
1521                 pair<double, double> cmv = make_pair(itr->second.compatible_count_mean, itr->second.compatible_count_var);
1522                 pooled_model->set_compatible_mean_and_var(locus, cmv);
1523 
1524                 pair<double, double> tmv = make_pair(itr->second.total_count_mean, itr->second.total_count_var);
1525                 pooled_model->set_total_mean_and_var(locus, tmv);
1526             }
1527         }
1528 
1529         if (dispersion_method == POOLED)
1530         {
1531             BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
1532             {
1533                 fac->mass_dispersion_model(pooled_model);
1534             }
1535         }
1536 
1537 
1538     }
1539     else if (dispersion_method == POISSON)
1540     {
1541         boost::shared_ptr<MassDispersionModel> disperser = boost::shared_ptr<MassDispersionModel>(new PoissonDispersionModel(""));
1542         for (size_t i = 0; i < bundle_factories.size(); ++i)
1543         {
1544             bundle_factories[i]->mass_dispersion_model(disperser);
1545         }
1546 
1547     }
1548     else
1549     {
1550         fprintf (stderr, "Error: unknown dispersion method requested\n");
1551     }
1552 }
1553 
driver(FILE * ref_gtf,FILE * mask_gtf,FILE * contrast_file,FILE * norm_standards_file,vector<string> & sam_hit_filename_lists,Outfiles & outfiles)1554 void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, FILE* norm_standards_file, vector<string>& sam_hit_filename_lists, Outfiles& outfiles)
1555 {
1556 
1557 	ReadTable it;
1558 	RefSequenceTable rt(true, false);
1559 
1560 	vector<boost::shared_ptr<Scaffold> > ref_mRNAs;
1561 
1562 	vector<boost::shared_ptr<ReplicatedBundleFactory> > bundle_factories;
1563     vector<boost::shared_ptr<ReadGroupProperties> > all_read_groups;
1564 
1565 	for (size_t i = 0; i < sam_hit_filename_lists.size(); ++i)
1566 	{
1567         vector<string> sam_hit_filenames;
1568         tokenize(sam_hit_filename_lists[i], ",", sam_hit_filenames);
1569 
1570         vector<boost::shared_ptr<BundleFactory> > replicate_factories;
1571 
1572         string condition_name = sample_labels[i];
1573 
1574         for (size_t j = 0; j < sam_hit_filenames.size(); ++j)
1575         {
1576             boost::shared_ptr<HitFactory> hs;
1577             boost::shared_ptr<BundleFactory> hf;
1578             try
1579             {
1580                 hs = boost::shared_ptr<HitFactory>(new PrecomputedExpressionHitFactory(sam_hit_filenames[j], it, rt));
1581                 hf = boost::shared_ptr<BundleFactory>(new PrecomputedExpressionBundleFactory(static_pointer_cast<PrecomputedExpressionHitFactory>(hs)));
1582             }
1583 
1584             catch(boost::archive::archive_exception & e)
1585             {
1586 				hs = boost::shared_ptr<HitFactory>(createSamHitFactory(sam_hit_filenames[j], it, rt));
1587                 hf = boost::shared_ptr<BundleFactory>(new BundleFactory(hs, REF_DRIVEN));
1588             }
1589 
1590 
1591             boost::shared_ptr<ReadGroupProperties> rg_props(new ReadGroupProperties);
1592 
1593             if (global_read_properties)
1594             {
1595                 *rg_props = *global_read_properties;
1596             }
1597             else
1598             {
1599                 *rg_props = hs->read_group_properties();
1600             }
1601 
1602             rg_props->checked_parameters(hs->read_group_properties().checked_parameters());
1603             rg_props->condition_name(condition_name);
1604             rg_props->replicate_num(j);
1605             rg_props->file_path(sam_hit_filenames[j]);
1606 
1607             all_read_groups.push_back(rg_props);
1608 
1609             hf->read_group_properties(rg_props);
1610 
1611             replicate_factories.push_back(hf);
1612             //replicate_factories.back()->set_ref_rnas(ref_mRNAs);
1613         }
1614 
1615         bundle_factories.push_back(boost::shared_ptr<ReplicatedBundleFactory>(new ReplicatedBundleFactory(replicate_factories, condition_name)));
1616 	}
1617 
1618     boost::crc_32_type ref_gtf_crc_result;
1619     ::load_ref_rnas(ref_gtf, rt, ref_mRNAs, ref_gtf_crc_result, corr_bias, false);
1620     if (ref_mRNAs.empty())
1621         return;
1622 
1623     vector<boost::shared_ptr<Scaffold> > mask_rnas;
1624     if (mask_gtf)
1625     {
1626         boost::crc_32_type mask_gtf_crc_result;
1627         ::load_ref_rnas(mask_gtf, rt, mask_rnas, mask_gtf_crc_result, false, false);
1628     }
1629 
1630     BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
1631     {
1632         fac->set_ref_rnas(ref_mRNAs);
1633         if (mask_gtf)
1634             fac->set_mask_rnas(mask_rnas);
1635     }
1636 
1637     vector<pair<size_t, size_t > > contrasts;
1638     if (contrast_file != NULL)
1639     {
1640         parse_contrast_file(contrast_file, bundle_factories, contrasts);
1641     }
1642     else
1643     {
1644         init_default_contrasts(bundle_factories, samples_are_time_series, contrasts);
1645     }
1646 
1647     if (norm_standards_file != NULL)
1648     {
1649         parse_norm_standards_file(norm_standards_file);
1650     }
1651 
1652     validate_cross_sample_parameters(all_read_groups);
1653 
1654 #if ENABLE_THREADS
1655     locus_num_threads = num_threads;
1656 #endif
1657 
1658     // Validate the dispersion method the user's chosen.
1659     int most_reps = -1;
1660     int most_reps_idx = 0;
1661 
1662     bool single_replicate_fac = false;
1663 
1664     for (size_t i = 0; i < bundle_factories.size(); ++i)
1665     {
1666         ReplicatedBundleFactory& fac = *(bundle_factories[i]);
1667         if (fac.num_replicates() > most_reps)
1668         {
1669             most_reps = fac.num_replicates();
1670             most_reps_idx = i;
1671         }
1672         if (most_reps == 1)
1673         {
1674             single_replicate_fac = true;
1675             if (dispersion_method == PER_CONDITION)
1676             {
1677                 fprintf(stderr, "Error: Dispersion method 'per-condition' requires that all conditions have at least 2 replicates.  Please use either 'pooled' or 'blind'\n");
1678                 exit(1);
1679             }
1680         }
1681     }
1682 
1683     if (most_reps == 1 && (dispersion_method != BLIND || dispersion_method != POISSON))
1684     {
1685         fprintf(stderr, "Warning: No conditions are replicated, switching to 'blind' dispersion method\n");
1686         dispersion_method = BLIND;
1687     }
1688 
1689 
1690     //bool pool_all_samples = ((most_reps <= 1 && dispersion_method == NOT_SET) || dispersion_method == BLIND);
1691 
1692 
1693 	int tmp_min_frag_len = numeric_limits<int>::max();
1694 	int tmp_max_frag_len = 0;
1695 
1696 	ProgressBar p_bar("Inspecting maps and determining fragment length distributions.",0);
1697 
1698     IdToLocusMap id_to_locus_map(boost::shared_ptr<map<string, set<string> > >(new map<string, set<string> >()));
1699 	BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
1700     {
1701 #if ENABLE_THREADS
1702         while(1)
1703         {
1704             locus_thread_pool_lock.lock();
1705             if (locus_curr_threads < locus_num_threads)
1706             {
1707                 break;
1708             }
1709 
1710             locus_thread_pool_lock.unlock();
1711 
1712             boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1713         }
1714 
1715         locus_curr_threads++;
1716         locus_thread_pool_lock.unlock();
1717 
1718         boost::thread inspect(inspect_map_worker,
1719                        boost::ref(*fac),
1720                        boost::ref(tmp_min_frag_len),
1721                        boost::ref(tmp_max_frag_len),
1722                        boost::ref(id_to_locus_map));
1723 #else
1724         inspect_map_worker(boost::ref(*fac),
1725                            boost::ref(tmp_min_frag_len),
1726                            boost::ref(tmp_max_frag_len),
1727                            id_to_locus_map);
1728 #endif
1729     }
1730 
1731     // wait for the workers to finish up before reporting everthing.
1732 #if ENABLE_THREADS
1733     while(1)
1734     {
1735         locus_thread_pool_lock.lock();
1736         if (locus_curr_threads == 0)
1737         {
1738             locus_thread_pool_lock.unlock();
1739             break;
1740         }
1741         locus_thread_pool_lock.unlock();
1742 
1743         boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1744     }
1745 #endif
1746 
1747     normalize_counts(all_read_groups);
1748     fit_dispersions(bundle_factories);
1749 
1750     print_variability_models(outfiles.var_model_out, bundle_factories);
1751 
1752     for (size_t i = 0; i < all_read_groups.size(); ++i)
1753     {
1754         boost::shared_ptr<ReadGroupProperties> rg = all_read_groups[i];
1755         fprintf(stderr, "> Map Properties:\n");
1756 
1757         fprintf(stderr, ">\tNormalized Map Mass: %.2Lf\n", rg->normalized_map_mass());
1758         fprintf(stderr, ">\tRaw Map Mass: %.2Lf\n", rg->total_map_mass());
1759         if (corr_multi)
1760             fprintf(stderr,">\tNumber of Multi-Reads: %zu (with %zu total hits)\n", rg->multi_read_table()->num_multireads(), rg->multi_read_table()->num_multihits());
1761 
1762         if (rg->frag_len_dist()->source() == LEARNED)
1763         {
1764             fprintf(stderr, ">\tFragment Length Distribution: Empirical (learned)\n");
1765             fprintf(stderr, ">\t              Estimated Mean: %.2f\n", rg->frag_len_dist()->mean());
1766             fprintf(stderr, ">\t           Estimated Std Dev: %.2f\n", rg->frag_len_dist()->std_dev());
1767         }
1768         else
1769         {
1770             if (rg->frag_len_dist()->source() == USER)
1771                 fprintf(stderr, ">\tFragment Length Distribution: Truncated Gaussian (user-specified)\n");
1772             else //rg->frag_len_dist()->source == FLD::DEFAULT
1773                 fprintf(stderr, ">\tFragment Length Distribution: Truncated Gaussian (default)\n");
1774             fprintf(stderr, ">\t              Default Mean: %d\n", def_frag_len_mean);
1775             fprintf(stderr, ">\t           Default Std Dev: %d\n", def_frag_len_std_dev);
1776         }
1777     }
1778 
1779     long double total_norm_mass = 0.0;
1780     long double total_mass = 0.0;
1781     BOOST_FOREACH (boost::shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
1782     {
1783         total_norm_mass += rg_props->normalized_map_mass();
1784         total_mass += rg_props->total_map_mass();
1785     }
1786 
1787 	min_frag_len = tmp_min_frag_len;
1788     max_frag_len = tmp_max_frag_len;
1789 
1790 	final_est_run = false;
1791 
1792 	double num_bundles = (double)bundle_factories[0]->num_bundles();
1793 
1794 //    if (corr_bias && corr_multi)
1795 //        p_bar = ProgressBar("Calculating initial abundance estimates for bias and multi-read correction.", num_bundles);
1796 //    else if (corr_bias)
1797 //        p_bar = ProgressBar("Calculating initial abundance estimates for bias correction.", num_bundles);
1798 //    else if (corr_multi)
1799 //        p_bar = ProgressBar("Calculating initial abundance estimates for multi-read correction.", num_bundles);
1800 //    else
1801 
1802     p_bar = ProgressBar("Calculating preliminary abundance estimates", num_bundles);
1803 
1804     Tracking tracking;
1805 
1806     test_launcher = boost::shared_ptr<TestLauncher>(new TestLauncher(bundle_factories.size(), contrasts, NULL, &tracking, &p_bar, true));
1807 
1808 	if (model_mle_error || corr_bias || corr_multi) // Only run initial estimation if correcting bias or multi-reads
1809 	{
1810 		while (1)
1811 		{
1812 			boost::shared_ptr<vector<boost::shared_ptr<SampleAbundances> > > abundances(new vector<boost::shared_ptr<SampleAbundances> >());
1813 			quantitate_next_locus(rt, bundle_factories, test_launcher);
1814 			bool more_loci_remain = false;
1815             BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1816             {
1817                 if (rep_fac->bundles_remain())
1818                 {
1819                     more_loci_remain = true;
1820                     break;
1821                 }
1822             }
1823 
1824 			if (!more_loci_remain)
1825             {
1826                 // wait for the workers to finish up before breaking out.
1827 #if ENABLE_THREADS
1828                 while(1)
1829                 {
1830                     locus_thread_pool_lock.lock();
1831                     if (locus_curr_threads == 0)
1832                     {
1833                         locus_thread_pool_lock.unlock();
1834                         break;
1835                     }
1836 
1837                     locus_thread_pool_lock.unlock();
1838 
1839                     boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1840 
1841                 }
1842 #endif
1843 				break;
1844             }
1845 		}
1846 
1847         BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1848 		{
1849 			rep_fac->reset();
1850         }
1851 
1852 		p_bar.complete();
1853 	}
1854     if (corr_bias)
1855     {
1856         bias_run = true;
1857         p_bar = ProgressBar("Learning bias parameters.", 0);
1858 		BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1859 		{
1860 			BOOST_FOREACH (boost::shared_ptr<BundleFactory> fac, rep_fac->factories())
1861 			{
1862 #if ENABLE_THREADS
1863 				while(1)
1864 				{
1865 					locus_thread_pool_lock.lock();
1866 					if (locus_curr_threads < locus_num_threads)
1867 					{
1868 						break;
1869 					}
1870 
1871 					locus_thread_pool_lock.unlock();
1872 
1873 					boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1874 				}
1875 				locus_curr_threads++;
1876 				locus_thread_pool_lock.unlock();
1877 
1878 				boost::thread bias(learn_bias_worker, fac);
1879 #else
1880 				learn_bias_worker(fac);
1881 #endif
1882 			}
1883     	}
1884 
1885     // wait for the workers to finish up before reporting everthing.
1886 #if ENABLE_THREADS
1887 		while(1)
1888 		{
1889 			locus_thread_pool_lock.lock();
1890 			if (locus_curr_threads == 0)
1891 			{
1892 				locus_thread_pool_lock.unlock();
1893 				break;
1894 			}
1895 
1896 			locus_thread_pool_lock.unlock();
1897 
1898 			boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1899 		}
1900 #endif
1901         BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1902 		{
1903 			rep_fac->reset();
1904         }
1905         bias_run = false;
1906 	}
1907 
1908     fprintf(outfiles.bias_out, "condition_name\treplicate_num\tparam\tpos_i\tpos_j\tvalue\n");
1909     BOOST_FOREACH (boost::shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
1910     {
1911         if (rg_props->bias_learner())
1912             rg_props->bias_learner()->output(outfiles.bias_out, rg_props->condition_name(), rg_props->replicate_num());
1913     }
1914 
1915 
1916     // Allow the multiread tables to do their thing...
1917     BOOST_FOREACH (boost::shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
1918     {
1919         rg_props->multi_read_table()->valid_mass(true);
1920     }
1921 
1922     test_launcher->clear_tracking_data();
1923 
1924 	Tests tests;
1925 
1926     int N = (int)sam_hit_filename_lists.size();
1927 
1928     tests.isoform_de_tests = vector<vector<SampleDiffs> >(N);
1929     tests.tss_group_de_tests = vector<vector<SampleDiffs> >(N);
1930     tests.gene_de_tests = vector<vector<SampleDiffs> >(N);
1931     tests.cds_de_tests = vector<vector<SampleDiffs> >(N);
1932     tests.diff_splicing_tests = vector<vector<SampleDiffs> >(N);
1933     tests.diff_promoter_tests = vector<vector<SampleDiffs> >(N);
1934     tests.diff_cds_tests = vector<vector<SampleDiffs> >(N);
1935 
1936 	for (int i = 0; i < N; ++i)
1937     {
1938         tests.isoform_de_tests[i] = vector<SampleDiffs>(N);
1939         tests.tss_group_de_tests[i] = vector<SampleDiffs>(N);
1940         tests.gene_de_tests[i] = vector<SampleDiffs>(N);
1941         tests.cds_de_tests[i] = vector<SampleDiffs>(N);
1942         tests.diff_splicing_tests[i] = vector<SampleDiffs>(N);
1943         tests.diff_promoter_tests[i] = vector<SampleDiffs>(N);
1944         tests.diff_cds_tests[i] = vector<SampleDiffs>(N);
1945     }
1946 
1947 	final_est_run = true;
1948 	p_bar = ProgressBar("Testing for differential expression and regulation in locus.", num_bundles);
1949 
1950     test_launcher = boost::shared_ptr<TestLauncher>(new TestLauncher(bundle_factories.size(), contrasts, &tests, &tracking, &p_bar));
1951 
1952 	while (true)
1953 	{
1954         //boost::shared_ptr<vector<boost::shared_ptr<SampleAbundances> > > abundances(new vector<boost::shared_ptr<SampleAbundances> >());
1955         quantitate_next_locus(rt, bundle_factories, test_launcher);
1956         bool more_loci_remain = false;
1957         BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1958         {
1959             if (rep_fac->bundles_remain())
1960             {
1961                 more_loci_remain = true;
1962                 break;
1963             }
1964         }
1965         if (!more_loci_remain)
1966         {
1967             // wait for the workers to finish up before doing the cross-sample testing.
1968 #if ENABLE_THREADS
1969             while(1)
1970             {
1971                 locus_thread_pool_lock.lock();
1972                 if (locus_curr_threads == 0)
1973                 {
1974                     locus_thread_pool_lock.unlock();
1975                     break;
1976                 }
1977 
1978                 locus_thread_pool_lock.unlock();
1979 
1980                 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1981 
1982             }
1983 #endif
1984             break;
1985         }
1986     }
1987 
1988 	p_bar.complete();
1989 
1990 	//double FDR = 0.05;
1991 	int total_iso_de_tests = 0;
1992 
1993 	vector<SampleDifference*> isoform_exp_diffs;
1994     fprintf(outfiles.isoform_de_outfile, "test_id\tgene_id\tgene\tlocus\tsample_1\tsample_2\tstatus\tvalue_1\tvalue_2\tlog2(fold_change)\ttest_stat\tp_value\tq_value\tsignificant\n");
1995 
1996 
1997     for (size_t i = 0; i < tests.isoform_de_tests.size(); ++i)
1998     {
1999         for (size_t j = 0; j < i; ++j)
2000         {
2001             total_iso_de_tests += tests.isoform_de_tests[i][j].size();
2002             extract_sample_diffs(tests.isoform_de_tests[i][j], isoform_exp_diffs);
2003         }
2004     }
2005 
2006     int iso_exp_tests = fdr_significance(FDR, isoform_exp_diffs);
2007     fprintf(stderr, "Performed %d isoform-level transcription difference tests\n", iso_exp_tests);
2008 
2009     for (size_t i = 0; i < tests.isoform_de_tests.size(); ++i)
2010     {
2011         for (size_t j = 0; j < i; ++j)
2012         {
2013             print_tests(outfiles.isoform_de_outfile, sample_labels[j].c_str(), sample_labels[i].c_str(), tests.isoform_de_tests[i][j]);
2014         }
2015     }
2016 
2017 
2018 	int total_group_de_tests = 0;
2019 	vector<SampleDifference*> tss_group_exp_diffs;
2020     fprintf(outfiles.group_de_outfile, "test_id\tgene_id\tgene\tlocus\tsample_1\tsample_2\tstatus\tvalue_1\tvalue_2\tlog2(fold_change)\ttest_stat\tp_value\tq_value\tsignificant\n");
2021 
2022     for (size_t i = 0; i < tests.tss_group_de_tests.size(); ++i)
2023     {
2024         for (size_t j = 0; j < i; ++j)
2025         {
2026             extract_sample_diffs(tests.tss_group_de_tests[i][j], tss_group_exp_diffs);
2027             total_group_de_tests += tests.tss_group_de_tests[i][j].size();
2028         }
2029     }
2030 
2031     int tss_group_exp_tests = fdr_significance(FDR, tss_group_exp_diffs);
2032     fprintf(stderr, "Performed %d tss-level transcription difference tests\n", tss_group_exp_tests);
2033 
2034     for (size_t i = 0; i < tests.tss_group_de_tests.size(); ++i)
2035     {
2036         for (size_t j = 0; j < i; ++j)
2037         {
2038             print_tests(outfiles.group_de_outfile, sample_labels[j].c_str(), sample_labels[i].c_str(), tests.tss_group_de_tests[i][j]);
2039         }
2040     }
2041 
2042 
2043 	int total_gene_de_tests = 0;
2044 	vector<SampleDifference*> gene_exp_diffs;
2045     fprintf(outfiles.gene_de_outfile, "test_id\tgene_id\tgene\tlocus\tsample_1\tsample_2\tstatus\tvalue_1\tvalue_2\tlog2(fold_change)\ttest_stat\tp_value\tq_value\tsignificant\n");
2046 
2047     for (size_t i = 0; i < tests.gene_de_tests.size(); ++i)
2048     {
2049         for (size_t j = 0; j < tests.gene_de_tests.size(); ++j)
2050         {
2051             total_gene_de_tests += tests.gene_de_tests[i][j].size();
2052             extract_sample_diffs(tests.gene_de_tests[i][j], gene_exp_diffs);
2053         }
2054     }
2055 
2056     //fprintf(stderr, "***There are %lu difference records in gene_exp_diffs\n", gene_exp_diffs.size());
2057     int gene_exp_tests = fdr_significance(FDR, gene_exp_diffs);
2058     fprintf(stderr, "Performed %d gene-level transcription difference tests\n", gene_exp_tests);
2059 
2060     for (size_t i = 0; i < tests.gene_de_tests.size(); ++i)
2061     {
2062         for (size_t j = 0; j < tests.gene_de_tests.size(); ++j)
2063         {
2064             print_tests(outfiles.gene_de_outfile, sample_labels[j].c_str(), sample_labels[i].c_str(), tests.gene_de_tests[i][j]);
2065         }
2066     }
2067 
2068 
2069 	int total_cds_de_tests = 0;
2070 	vector<SampleDifference*> cds_exp_diffs;
2071     fprintf(outfiles.cds_de_outfile, "test_id\tgene_id\tgene\tlocus\tsample_1\tsample_2\tstatus\tvalue_1\tvalue_2\tlog2(fold_change)\ttest_stat\tp_value\tq_value\tsignificant\n");
2072 
2073 
2074     for (size_t i = 0; i < tests.cds_de_tests.size(); ++i)
2075     {
2076         for (size_t j = 0; j < tests.cds_de_tests.size(); ++j)
2077         {
2078             total_cds_de_tests += tests.cds_de_tests[i][j].size();
2079             extract_sample_diffs(tests.cds_de_tests[i][j], cds_exp_diffs);
2080         }
2081     }
2082 
2083 
2084     int cds_exp_tests = fdr_significance(FDR, cds_exp_diffs);
2085     fprintf(stderr, "Performed %d CDS-level transcription difference tests\n", cds_exp_tests);
2086 
2087     for (size_t i = 0; i < tests.cds_de_tests.size(); ++i)
2088     {
2089         for (size_t j = 0; j < tests.cds_de_tests.size(); ++j)
2090         {
2091             print_tests(outfiles.cds_de_outfile, sample_labels[j].c_str(), sample_labels[i].c_str(), tests.cds_de_tests[i][j]);
2092         }
2093     }
2094 
2095 
2096 	int total_diff_splice_tests = 0;
2097 	vector<SampleDifference*> splicing_diffs;
2098     fprintf(outfiles.diff_splicing_outfile, "test_id\tgene_id\tgene\tlocus\tsample_1\tsample_2\tstatus\tvalue_1\tvalue_2\tsqrt(JS)\ttest_stat\tp_value\tq_value\tsignificant\n");
2099 
2100     for (size_t i = 0; i < tests.diff_splicing_tests.size(); ++i)
2101     {
2102         for (size_t j = 0; j < tests.diff_splicing_tests.size(); ++j)
2103         {
2104             total_diff_splice_tests += tests.diff_splicing_tests[i][j].size();
2105             extract_sample_diffs(tests.diff_splicing_tests[i][j], splicing_diffs);
2106         }
2107     }
2108 
2109     int splicing_tests = fdr_significance(FDR, splicing_diffs);
2110     fprintf(stderr, "Performed %d splicing tests\n", splicing_tests);
2111 
2112     for (size_t i = 0; i < tests.diff_splicing_tests.size(); ++i)
2113     {
2114         for (size_t j = 0; j < tests.diff_splicing_tests.size(); ++j)
2115         {
2116             const SampleDiffs& diffs = tests.diff_splicing_tests[i][j];
2117             print_tests(outfiles.diff_splicing_outfile, sample_labels[j].c_str(), sample_labels[i].c_str(), diffs);
2118         }
2119     }
2120 
2121 
2122 	int total_diff_promoter_tests = 0;
2123 	vector<SampleDifference*> promoter_diffs;
2124     fprintf(outfiles.diff_promoter_outfile, "test_id\tgene_id\tgene\tlocus\tsample_1\tsample_2\tstatus\tvalue_1\tvalue_2\tsqrt(JS)\ttest_stat\tp_value\tq_value\tsignificant\n");
2125 
2126     for (size_t i = 1; i < tests.diff_splicing_tests.size(); ++i)
2127     {
2128         for (size_t j = 0; j < tests.diff_splicing_tests.size(); ++j)
2129         {
2130             total_diff_promoter_tests += tests.diff_promoter_tests[i][j].size();
2131             extract_sample_diffs(tests.diff_promoter_tests[i][j], promoter_diffs);
2132         }
2133     }
2134 
2135 
2136     int promoter_tests = fdr_significance(FDR, promoter_diffs);
2137     fprintf(stderr, "Performed %d promoter preference tests\n", promoter_tests);
2138 
2139     for (size_t i = 1; i < tests.diff_promoter_tests.size(); ++i)
2140     {
2141         for (size_t j = 0; j < tests.diff_promoter_tests.size(); ++j)
2142         {
2143             print_tests(outfiles.diff_promoter_outfile, sample_labels[j].c_str(), sample_labels[i].c_str(), tests.diff_promoter_tests[i][j]);
2144         }
2145     }
2146 
2147 	int total_diff_cds_tests = 0;
2148 	vector<SampleDifference*> cds_use_diffs;
2149     fprintf(outfiles.diff_cds_outfile, "test_id\tgene_id\tgene\tlocus\tsample_1\tsample_2\tstatus\tvalue_1\tvalue_2\tsqrt(JS)\ttest_stat\tp_value\tq_value\tsignificant\n");
2150 
2151     for (size_t i = 1; i < tests.diff_cds_tests.size(); ++i)
2152     {
2153         for (size_t j = 0; j < tests.diff_cds_tests.size(); ++j)
2154         {
2155             extract_sample_diffs(tests.diff_cds_tests[i][j], cds_use_diffs);
2156             total_diff_cds_tests += tests.diff_cds_tests[i][j].size();
2157         }
2158     }
2159 
2160 
2161     int cds_use_tests = fdr_significance(FDR, cds_use_diffs);
2162     fprintf(stderr, "Performing %d relative CDS output tests\n", cds_use_tests);
2163 
2164     for (size_t i = 1; i < tests.diff_cds_tests.size(); ++i)
2165     {
2166         for (size_t j = 0; j < tests.diff_cds_tests.size(); ++j)
2167         {
2168             print_tests(outfiles.diff_cds_outfile, sample_labels[j].c_str(), sample_labels[i].c_str(), tests.diff_cds_tests[i][j]);
2169         }
2170     }
2171 
2172 
2173     // FPKM tracking
2174 
2175 	FILE* fiso_fpkm_tracking =  outfiles.isoform_fpkm_tracking_out;
2176 	fprintf(stderr, "Writing isoform-level FPKM tracking\n");
2177 	print_FPKM_tracking(fiso_fpkm_tracking,tracking.isoform_fpkm_tracking);
2178 
2179 	FILE* ftss_fpkm_tracking =  outfiles.tss_group_fpkm_tracking_out;
2180 	fprintf(stderr, "Writing TSS group-level FPKM tracking\n");
2181 	print_FPKM_tracking(ftss_fpkm_tracking,tracking.tss_group_fpkm_tracking);
2182 
2183 	FILE* fgene_fpkm_tracking =  outfiles.gene_fpkm_tracking_out;
2184 	fprintf(stderr, "Writing gene-level FPKM tracking\n");
2185 	print_FPKM_tracking(fgene_fpkm_tracking,tracking.gene_fpkm_tracking);
2186 
2187 	FILE* fcds_fpkm_tracking =  outfiles.cds_fpkm_tracking_out;
2188 	fprintf(stderr, "Writing CDS-level FPKM tracking\n");
2189 	print_FPKM_tracking(fcds_fpkm_tracking,tracking.cds_fpkm_tracking);
2190 
2191     // Count tracking
2192 
2193     FILE* fiso_count_tracking =  outfiles.isoform_count_tracking_out;
2194 	fprintf(stderr, "Writing isoform-level count tracking\n");
2195 	print_count_tracking(fiso_count_tracking,tracking.isoform_fpkm_tracking);
2196 
2197 	FILE* ftss_count_tracking =  outfiles.tss_group_count_tracking_out;
2198 	fprintf(stderr, "Writing TSS group-level count tracking\n");
2199 	print_count_tracking(ftss_count_tracking,tracking.tss_group_fpkm_tracking);
2200 
2201 	FILE* fgene_count_tracking =  outfiles.gene_count_tracking_out;
2202 	fprintf(stderr, "Writing gene-level count tracking\n");
2203 	print_count_tracking(fgene_count_tracking,tracking.gene_fpkm_tracking);
2204 
2205 	FILE* fcds_count_tracking =  outfiles.cds_count_tracking_out;
2206 	fprintf(stderr, "Writing CDS-level count tracking\n");
2207 	print_count_tracking(fcds_count_tracking,tracking.cds_fpkm_tracking);
2208 
2209     // Read group tracking
2210 
2211     FILE* fiso_rep_tracking =  outfiles.isoform_rep_tracking_out;
2212 	fprintf(stderr, "Writing isoform-level read group tracking\n");
2213 	print_read_group_tracking(fiso_rep_tracking,tracking.isoform_fpkm_tracking);
2214 
2215 	FILE* ftss_rep_tracking =  outfiles.tss_group_rep_tracking_out;
2216 	fprintf(stderr, "Writing TSS group-level read group tracking\n");
2217 	print_read_group_tracking(ftss_rep_tracking,tracking.tss_group_fpkm_tracking);
2218 
2219 	FILE* fgene_rep_tracking =  outfiles.gene_rep_tracking_out;
2220 	fprintf(stderr, "Writing gene-level read group tracking\n");
2221 	print_read_group_tracking(fgene_rep_tracking,tracking.gene_fpkm_tracking);
2222 
2223 	FILE* fcds_rep_tracking =  outfiles.cds_rep_tracking_out;
2224 	fprintf(stderr, "Writing CDS-level read group tracking\n");
2225 	print_read_group_tracking(fcds_rep_tracking,tracking.cds_fpkm_tracking);
2226 
2227     FILE* fread_group_info =  outfiles.read_group_info_out;
2228 	fprintf(stderr, "Writing read group info\n");
2229 	print_read_group_info(fread_group_info,all_read_groups);
2230 
2231     FILE* frun_info =  outfiles.run_info_out;
2232 	fprintf(stderr, "Writing run info\n");
2233 	print_run_info(frun_info);
2234 }
2235 
main(int argc,char ** argv)2236 int main(int argc, char** argv)
2237 {
2238     for (int i = 0; i < argc; ++i)
2239     {
2240         cmd_str += string(argv[i]) + " ";
2241     }
2242 
2243     init_library_table();
2244     init_dispersion_method_table();
2245     init_lib_norm_method_table();
2246 
2247     min_isoform_fraction = 1e-5;
2248 
2249 	int parse_ret = parse_options(argc,argv);
2250     if (parse_ret)
2251         return parse_ret;
2252 
2253     if (!use_total_mass && !use_compat_mass)
2254     {
2255         use_total_mass = false;
2256         use_compat_mass = true;
2257     }
2258 
2259 	if(optind >= argc)
2260     {
2261         print_usage();
2262         return 1;
2263     }
2264 
2265     if (!no_update_check)
2266         check_version(PACKAGE_VERSION);
2267 
2268     string ref_gtf_filename = argv[optind++];
2269     vector<string> sam_hit_filenames;
2270 
2271     if (use_sample_sheet)
2272     {
2273         if  (optind < argc)
2274         {
2275 
2276             string sample_sheet_filename = argv[optind++];
2277             FILE* sample_sheet_file = NULL;
2278             if (sample_sheet_filename != "")
2279             {
2280                 sample_sheet_file = fopen(sample_sheet_filename.c_str(), "r");
2281                 if (!sample_sheet_file)
2282                 {
2283                     fprintf(stderr, "Error: cannot open sample sheet file %s for reading\n",
2284                             sample_sheet_filename.c_str());
2285                     exit(1);
2286                 }
2287             }
2288             parse_sample_sheet_file(sample_sheet_file, sample_labels, sam_hit_filenames);
2289         }
2290         else
2291         {
2292             fprintf(stderr, "Error: option --use-sample-sheet requires a single sample sheet filename instead of a list of SAM/BAM files\n");
2293         }
2294     }
2295     else
2296     {
2297         while(optind < argc)
2298         {
2299             string sam_hits_file_name = argv[optind++];
2300             sam_hit_filenames.push_back(sam_hits_file_name);
2301         }
2302 
2303         if (sample_labels.size() == 0)
2304         {
2305             for (size_t i = 1; i < sam_hit_filenames.size() + 1; ++i)
2306             {
2307                 char buf[256];
2308                 sprintf(buf, "q%lu", i);
2309                 sample_labels.push_back(buf);
2310             }
2311         }
2312     }
2313 
2314 	while (sam_hit_filenames.size() < 2)
2315     {
2316         fprintf(stderr, "Error: cuffdiff requires at least 2 SAM files\n");
2317         exit(1);
2318     }
2319 
2320 
2321     if (sam_hit_filenames.size() != sample_labels.size())
2322     {
2323         fprintf(stderr, "Error: number of labels must match number of conditions\n");
2324         exit(1);
2325     }
2326 
2327     if (random_seed == -1)
2328         random_seed = boost::mt19937::default_seed;
2329 
2330 	// seed the random number generator - we'll need it for the importance
2331 	// sampling during MAP estimation of the gammas
2332 	srand48(random_seed);
2333 
2334 	FILE* ref_gtf = NULL;
2335 	if (ref_gtf_filename != "")
2336 	{
2337 		ref_gtf = fopen(ref_gtf_filename.c_str(), "r");
2338 		if (!ref_gtf)
2339 		{
2340 			fprintf(stderr, "Error: cannot open reference GTF file %s for reading\n",
2341 					ref_gtf_filename.c_str());
2342 			exit(1);
2343 		}
2344 	}
2345 
2346 	FILE* mask_gtf = NULL;
2347 	if (mask_gtf_filename != "")
2348 	{
2349 		mask_gtf = fopen(mask_gtf_filename.c_str(), "r");
2350 		if (!mask_gtf)
2351 		{
2352 			fprintf(stderr, "Error: cannot open mask GTF file %s for reading\n",
2353 					mask_gtf_filename.c_str());
2354 			exit(1);
2355 		}
2356 	}
2357 
2358 
2359     FILE* contrast_file = NULL;
2360 	if (contrast_filename != "")
2361 	{
2362 		contrast_file = fopen(contrast_filename.c_str(), "r");
2363 		if (!contrast_file)
2364 		{
2365 			fprintf(stderr, "Error: cannot open contrast file %s for reading\n",
2366 					contrast_filename.c_str());
2367 			exit(1);
2368 		}
2369 	}
2370 
2371     FILE* norm_standards_file = NULL;
2372 	if (norm_standards_filename != "")
2373 	{
2374 		norm_standards_file = fopen(norm_standards_filename.c_str(), "r");
2375 		if (!norm_standards_file)
2376 		{
2377 			fprintf(stderr, "Error: cannot open contrast file %s for reading\n",
2378 					norm_standards_filename.c_str());
2379 			exit(1);
2380 		}
2381 	}
2382 
2383 
2384 	// Note: we don't want the assembly filters interfering with calculations
2385 	// here
2386 
2387 	pre_mrna_fraction = 0.0;
2388     olap_radius = 0;
2389 
2390 	Outfiles outfiles;
2391 
2392     if (output_dir != "")
2393     {
2394         int retcode = mkpath(output_dir.c_str(), 0777);
2395         if (retcode == -1)
2396         {
2397             if (errno != EEXIST)
2398             {
2399                 fprintf (stderr,
2400                          "Error: cannot create directory %s\n",
2401                          output_dir.c_str());
2402                 exit(1);
2403             }
2404         }
2405     }
2406 
2407     static const int filename_buf_size = 2048;
2408 
2409     char out_file_prefix[filename_buf_size];
2410     sprintf(out_file_prefix, "%s/", output_dir.c_str());
2411     char iso_out_file_name[filename_buf_size];
2412     sprintf(iso_out_file_name, "%sisoform_exp.diff", out_file_prefix);
2413     FILE* iso_out = fopen(iso_out_file_name, "w");
2414     if (!iso_out)
2415     {
2416         fprintf(stderr, "Error: cannot open differential isoform transcription file %s for writing\n",
2417                 iso_out_file_name);
2418         exit(1);
2419     }
2420 
2421     char group_out_file_name[filename_buf_size];
2422     sprintf(group_out_file_name, "%stss_group_exp.diff", out_file_prefix);
2423     FILE* group_out = fopen(group_out_file_name, "w");
2424     if (!group_out)
2425     {
2426         fprintf(stderr, "Error: cannot open differential TSS group transcription file %s for writing\n",
2427                 group_out_file_name);
2428         exit(1);
2429     }
2430 
2431     char gene_out_file_name[filename_buf_size];
2432     sprintf(gene_out_file_name, "%sgene_exp.diff", out_file_prefix);
2433     FILE* gene_out = fopen(gene_out_file_name, "w");
2434     if (!group_out)
2435     {
2436         fprintf(stderr, "Error: cannot open gene expression file %s for writing\n",
2437                 gene_out_file_name);
2438         exit(1);
2439     }
2440 
2441     char cds_out_file_name[filename_buf_size];
2442     sprintf(cds_out_file_name, "%scds_exp.diff", out_file_prefix);
2443     FILE* cds_out = fopen(cds_out_file_name, "w");
2444     if (!cds_out)
2445     {
2446         fprintf(stderr, "Error: cannot open cds expression file %s for writing\n",
2447                 cds_out_file_name);
2448         exit(1);
2449     }
2450 
2451     char diff_splicing_out_file_name[filename_buf_size];
2452     sprintf(diff_splicing_out_file_name, "%ssplicing.diff", out_file_prefix);
2453     FILE* diff_splicing_out = fopen(diff_splicing_out_file_name, "w");
2454     if (!diff_splicing_out)
2455     {
2456         fprintf(stderr, "Error: cannot open differential splicing file %s for writing\n",
2457                 diff_splicing_out_file_name);
2458         exit(1);
2459     }
2460 
2461     char diff_promoter_out_file_name[filename_buf_size];
2462     sprintf(diff_promoter_out_file_name, "%spromoters.diff", out_file_prefix);
2463     FILE* diff_promoter_out = fopen(diff_promoter_out_file_name, "w");
2464     if (!diff_promoter_out)
2465     {
2466         fprintf(stderr, "Error: cannot open differential transcription start file %s for writing\n",
2467                 diff_promoter_out_file_name);
2468         exit(1);
2469     }
2470 
2471     char diff_cds_out_file_name[filename_buf_size];
2472     sprintf(diff_cds_out_file_name, "%scds.diff", out_file_prefix);
2473     FILE* diff_cds_out = fopen(diff_cds_out_file_name, "w");
2474     if (!diff_cds_out)
2475     {
2476         fprintf(stderr, "Error: cannot open differential relative CDS file %s for writing\n",
2477                 diff_cds_out_file_name);
2478         exit(1);
2479     }
2480 
2481     outfiles.isoform_de_outfile = iso_out;
2482     outfiles.group_de_outfile = group_out;
2483     outfiles.gene_de_outfile = gene_out;
2484     outfiles.cds_de_outfile = cds_out;
2485     outfiles.diff_splicing_outfile = diff_splicing_out;
2486     outfiles.diff_promoter_outfile = diff_promoter_out;
2487     outfiles.diff_cds_outfile = diff_cds_out;
2488 
2489 	char isoform_fpkm_tracking_name[filename_buf_size];
2490 	sprintf(isoform_fpkm_tracking_name, "%s/isoforms.fpkm_tracking", output_dir.c_str());
2491 	FILE* isoform_fpkm_out = fopen(isoform_fpkm_tracking_name, "w");
2492 	if (!isoform_fpkm_out)
2493 	{
2494 		fprintf(stderr, "Error: cannot open isoform-level FPKM tracking file %s for writing\n",
2495 				isoform_fpkm_tracking_name);
2496 		exit(1);
2497 	}
2498 	outfiles.isoform_fpkm_tracking_out = isoform_fpkm_out;
2499 
2500 	char tss_group_fpkm_tracking_name[filename_buf_size];
2501 	sprintf(tss_group_fpkm_tracking_name, "%s/tss_groups.fpkm_tracking", output_dir.c_str());
2502 	FILE* tss_group_fpkm_out = fopen(tss_group_fpkm_tracking_name, "w");
2503 	if (!tss_group_fpkm_out)
2504 	{
2505 		fprintf(stderr, "Error: cannot open TSS group-level FPKM tracking file %s for writing\n",
2506 				tss_group_fpkm_tracking_name);
2507 		exit(1);
2508 	}
2509 	outfiles.tss_group_fpkm_tracking_out = tss_group_fpkm_out;
2510 
2511 	char cds_fpkm_tracking_name[filename_buf_size];
2512 	sprintf(cds_fpkm_tracking_name, "%s/cds.fpkm_tracking", output_dir.c_str());
2513 	FILE* cds_fpkm_out = fopen(cds_fpkm_tracking_name, "w");
2514 	if (!cds_fpkm_out)
2515 	{
2516 		fprintf(stderr, "Error: cannot open CDS level FPKM tracking file %s for writing\n",
2517 				cds_fpkm_tracking_name);
2518 		exit(1);
2519 	}
2520 	outfiles.cds_fpkm_tracking_out = cds_fpkm_out;
2521 
2522 	char gene_fpkm_tracking_name[filename_buf_size];
2523 	sprintf(gene_fpkm_tracking_name, "%s/genes.fpkm_tracking", output_dir.c_str());
2524 	FILE* gene_fpkm_out = fopen(gene_fpkm_tracking_name, "w");
2525 	if (!gene_fpkm_out)
2526 	{
2527 		fprintf(stderr, "Error: cannot open gene-level FPKM tracking file %s for writing\n",
2528 				gene_fpkm_tracking_name);
2529 		exit(1);
2530 	}
2531 	outfiles.gene_fpkm_tracking_out = gene_fpkm_out;
2532 
2533     char isoform_count_tracking_name[filename_buf_size];
2534 	sprintf(isoform_count_tracking_name, "%s/isoforms.count_tracking", output_dir.c_str());
2535 	FILE* isoform_count_out = fopen(isoform_count_tracking_name, "w");
2536 	if (!isoform_count_out)
2537 	{
2538 		fprintf(stderr, "Error: cannot open isoform-level count tracking file %s for writing\n",
2539 				isoform_count_tracking_name);
2540 		exit(1);
2541 	}
2542 	outfiles.isoform_count_tracking_out = isoform_count_out;
2543 
2544 	char tss_group_count_tracking_name[filename_buf_size];
2545 	sprintf(tss_group_count_tracking_name, "%s/tss_groups.count_tracking", output_dir.c_str());
2546 	FILE* tss_group_count_out = fopen(tss_group_count_tracking_name, "w");
2547 	if (!tss_group_count_out)
2548 	{
2549 		fprintf(stderr, "Error: cannot open TSS group-level count tracking file %s for writing\n",
2550 				tss_group_count_tracking_name);
2551 		exit(1);
2552 	}
2553 	outfiles.tss_group_count_tracking_out = tss_group_count_out;
2554 
2555 	char cds_count_tracking_name[filename_buf_size];
2556 	sprintf(cds_count_tracking_name, "%s/cds.count_tracking", output_dir.c_str());
2557 	FILE* cds_count_out = fopen(cds_count_tracking_name, "w");
2558 	if (!cds_count_out)
2559 	{
2560 		fprintf(stderr, "Error: cannot open CDS level count tracking file %s for writing\n",
2561 				cds_count_tracking_name);
2562 		exit(1);
2563 	}
2564 	outfiles.cds_count_tracking_out = cds_count_out;
2565 
2566 	char gene_count_tracking_name[filename_buf_size];
2567 	sprintf(gene_count_tracking_name, "%s/genes.count_tracking", output_dir.c_str());
2568 	FILE* gene_count_out = fopen(gene_count_tracking_name, "w");
2569 	if (!gene_count_out)
2570 	{
2571 		fprintf(stderr, "Error: cannot open gene-level count tracking file %s for writing\n",
2572 				gene_count_tracking_name);
2573 		exit(1);
2574 	}
2575 	outfiles.gene_count_tracking_out = gene_count_out;
2576 
2577     char isoform_rep_tracking_name[filename_buf_size];
2578 	sprintf(isoform_rep_tracking_name, "%s/isoforms.read_group_tracking", output_dir.c_str());
2579 	FILE* isoform_rep_out = fopen(isoform_rep_tracking_name, "w");
2580 	if (!isoform_rep_out)
2581 	{
2582 		fprintf(stderr, "Error: cannot open isoform-level read group tracking file %s for writing\n",
2583 				isoform_rep_tracking_name);
2584 		exit(1);
2585 	}
2586 	outfiles.isoform_rep_tracking_out = isoform_rep_out;
2587 
2588 	char tss_group_rep_tracking_name[filename_buf_size];
2589 	sprintf(tss_group_rep_tracking_name, "%s/tss_groups.read_group_tracking", output_dir.c_str());
2590 	FILE* tss_group_rep_out = fopen(tss_group_rep_tracking_name, "w");
2591 	if (!tss_group_rep_out)
2592 	{
2593 		fprintf(stderr, "Error: cannot open TSS group-level read group tracking file %s for writing\n",
2594 				tss_group_rep_tracking_name);
2595 		exit(1);
2596 	}
2597 	outfiles.tss_group_rep_tracking_out = tss_group_rep_out;
2598 
2599 	char cds_rep_tracking_name[filename_buf_size];
2600 	sprintf(cds_rep_tracking_name, "%s/cds.read_group_tracking", output_dir.c_str());
2601 	FILE* cds_rep_out = fopen(cds_rep_tracking_name, "w");
2602 	if (!cds_rep_out)
2603 	{
2604 		fprintf(stderr, "Error: cannot open CDS level read group tracking file %s for writing\n",
2605 				cds_rep_tracking_name);
2606 		exit(1);
2607 	}
2608 	outfiles.cds_rep_tracking_out = cds_rep_out;
2609 
2610 	char gene_rep_tracking_name[filename_buf_size];
2611 	sprintf(gene_rep_tracking_name, "%s/genes.read_group_tracking", output_dir.c_str());
2612 	FILE* gene_rep_out = fopen(gene_rep_tracking_name, "w");
2613 	if (!gene_rep_out)
2614 	{
2615 		fprintf(stderr, "Error: cannot open gene-level read group tracking file %s for writing\n",
2616 				gene_rep_tracking_name);
2617 		exit(1);
2618 	}
2619 	outfiles.gene_rep_tracking_out = gene_rep_out;
2620 
2621     char read_group_info_name[filename_buf_size];
2622 	sprintf(read_group_info_name, "%s/read_groups.info", output_dir.c_str());
2623 	FILE* read_group_out = fopen(read_group_info_name, "w");
2624 	if (!read_group_out)
2625 	{
2626 		fprintf(stderr, "Error: cannot open read group info file %s for writing\n",
2627 				read_group_info_name);
2628 		exit(1);
2629 	}
2630 	outfiles.read_group_info_out = read_group_out;
2631 
2632     char run_info_name[filename_buf_size];
2633 	sprintf(run_info_name, "%s/run.info", output_dir.c_str());
2634 	FILE* run_info_out = fopen(run_info_name, "w");
2635 	if (!run_info_out)
2636 	{
2637 		fprintf(stderr, "Error: cannot open run info file %s for writing\n",
2638 				run_info_name);
2639 		exit(1);
2640 	}
2641 	outfiles.run_info_out = run_info_out;
2642 
2643     char bias_name[filename_buf_size];
2644 	sprintf(bias_name, "%s/bias_params.info", output_dir.c_str());
2645 	FILE* bias_out = fopen(bias_name, "w");
2646 	if (!bias_out)
2647 	{
2648 		fprintf(stderr, "Error: cannot open run info file %s for writing\n",
2649 				bias_name);
2650 		exit(1);
2651 	}
2652 	outfiles.bias_out = bias_out;
2653 
2654     char var_model_name[filename_buf_size];
2655 	sprintf(var_model_name, "%s/var_model.info", output_dir.c_str());
2656 	FILE* var_model_out = fopen(var_model_name, "w");
2657 	if (!var_model_out)
2658 	{
2659 		fprintf(stderr, "Error: cannot open run info file %s for writing\n",
2660 				var_model_name);
2661 		exit(1);
2662 	}
2663 	outfiles.var_model_out = var_model_out;
2664 
2665 
2666     driver(ref_gtf, mask_gtf, contrast_file, norm_standards_file, sam_hit_filenames, outfiles);
2667 
2668 #if 0
2669     if (emit_count_tables)
2670     {
2671         dump_locus_variance_info(output_dir + string("/locus_var.txt"));
2672     }
2673 #endif
2674 
2675 	return 0;
2676 }
2677