1 /*
2 * cuffdiff.cpp
3 * cufflinks
4 *
5 * Created by Cole Trapnell on 10/21/09.
6 * Copyright 2009 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #include <stdlib.h>
11 #include <getopt.h>
12 #include <string>
13 #include <numeric>
14 #include <cfloat>
15 #include <iostream>
16
17 #include "common.h"
18 #include "hits.h"
19 #include "bundles.h"
20 #include "abundances.h"
21 #include "tokenize.h"
22 #include "biascorrection.h"
23 #include "update_check.h"
24
25 #include <boost/thread.hpp>
26 #include <boost/version.hpp>
27 #include <boost/graph/adjacency_list.hpp>
28 #include <boost/graph/graph_traits.hpp>
29 #include <boost/numeric/ublas/matrix.hpp>
30 #include <boost/numeric/ublas/matrix_proxy.hpp>
31 #include <boost/numeric/ublas/vector.hpp>
32 #include <boost/numeric/ublas/vector_proxy.hpp>
33 #include <boost/numeric/ublas/io.hpp>
34 #include <boost/algorithm/string.hpp>
35
36 #include "differential.h"
37
38 extern "C" {
39 #include "locfit/local.h"
40 }
41
42 // Need at least this many reads in a locus to do any testing on it
43
44 vector<string> sample_labels;
45
46 double FDR = 0.05;
47 bool samples_are_time_series = false;
48 using namespace std;
49 using namespace boost;
50
51 // We leave out the short codes for options that don't take an argument
52 #if ENABLE_THREADS
53 const char *short_options = "m:p:s:c:I:j:L:M:o:b:TNqvuF:C:";
54 #else
55 const char *short_options = "m:s:c:I:j:L:M:o:b:TNqvuF:C:";
56 #endif
57
58
59
60 static struct option long_options[] = {
61 {"frag-len-mean", required_argument, 0, 'm'},
62 {"frag-len-std-dev", required_argument, 0, 's'},
63 {"transcript-score-thresh", required_argument, 0, 't'},
64 {"pre-mrna-fraction", required_argument, 0, 'j'},
65 {"max-intron-length", required_argument, 0, 'I'},
66 {"labels", required_argument, 0, 'L'},
67 {"min-alignment-count", required_argument, 0, 'c'},
68 {"FDR", required_argument, 0, OPT_FDR},
69 {"seed", required_argument, 0, OPT_RANDOM_SEED},
70 {"mask-file", required_argument, 0, 'M'},
71 {"contrast-file", required_argument, 0, 'C'},
72 {"norm-standards-file", required_argument, 0, OPT_NORM_STANDARDS_FILE},
73 {"use-sample-sheet", no_argument, 0, OPT_USE_SAMPLE_SHEET},
74 {"output-dir", required_argument, 0, 'o'},
75 {"verbose", no_argument, 0, 'v'},
76 {"quiet", no_argument, 0, 'q'},
77 {"frag-bias-correct", required_argument, 0, 'b'},
78 {"multi-read-correct", no_argument, 0, 'u'},
79 {"time-series", no_argument, 0, 'T'},
80 {"upper-quartile-norm", no_argument, 0, 'N'},
81 {"geometric-norm", no_argument, 0, OPT_GEOMETRIC_NORM},
82 {"raw-mapped-norm", no_argument, 0, OPT_RAW_MAPPED_NORM},
83 {"min-isoform-fraction", required_argument, 0, 'F'},
84 #if ENABLE_THREADS
85 {"num-threads", required_argument, 0, 'p'},
86 #endif
87 {"library-type", required_argument, 0, OPT_LIBRARY_TYPE},
88 {"seed", required_argument, 0, OPT_RANDOM_SEED},
89 {"no-collapse-cond-prob", no_argument, 0, OPT_COLLAPSE_COND_PROB},
90 {"num-importance-samples", required_argument, 0, OPT_NUM_IMP_SAMPLES},
91 {"max-mle-iterations", required_argument, 0, OPT_MLE_MAX_ITER},
92 {"min-mle-accuracy", required_argument, 0, OPT_MLE_MIN_ACC},
93 {"poisson-dispersion", no_argument, 0, OPT_POISSON_DISPERSION},
94 {"bias-mode", required_argument, 0, OPT_BIAS_MODE},
95 {"no-update-check", no_argument, 0, OPT_NO_UPDATE_CHECK},
96 {"emit-count-tables", no_argument, 0, OPT_EMIT_COUNT_TABLES},
97 {"compatible-hits-norm", no_argument, 0, OPT_USE_COMPAT_MASS},
98 {"total-hits-norm", no_argument, 0, OPT_USE_TOTAL_MASS},
99 //{"analytic-diff", no_argument, 0, OPT_ANALYTIC_DIFF},
100 {"no-diff", no_argument, 0, OPT_NO_DIFF},
101 {"num-frag-count-draws", required_argument, 0, OPT_NUM_FRAG_COUNT_DRAWS},
102 {"num-frag-assign-draws", required_argument, 0, OPT_NUM_FRAG_ASSIGN_DRAWS},
103
104 // Some options for testing different stats policies
105 {"max-bundle-frags", required_argument, 0, OPT_MAX_FRAGS_PER_BUNDLE},
106 {"read-skip-fraction", required_argument, 0, OPT_READ_SKIP_FRACTION},
107 {"no-read-pairs", no_argument, 0, OPT_NO_READ_PAIRS},
108 {"trim-read-length", required_argument, 0, OPT_TRIM_READ_LENGTH},
109 {"cov-delta", required_argument, 0, OPT_MAX_DELTA_GAP},
110 {"locus-count-dispersion", no_argument, 0, OPT_LOCUS_COUNT_DISPERSION},
111 {"max-frag-multihits", required_argument, 0, OPT_FRAG_MAX_MULTIHITS},
112 {"min-outlier-p", required_argument, 0, OPT_MIN_OUTLIER_P},
113 {"min-reps-for-js-test", required_argument, 0, OPT_MIN_REPS_FOR_JS_TEST},
114 {"no-effective-length-correction", no_argument, 0, OPT_NO_EFFECTIVE_LENGTH_CORRECTION},
115 {"no-length-correction", no_argument, 0, OPT_NO_LENGTH_CORRECTION},
116 {"no-js-tests", no_argument, 0, OPT_NO_JS_TESTS},
117 {"dispersion-method", required_argument, 0, OPT_DISPERSION_METHOD},
118 {"library-norm-method", required_argument, 0, OPT_LIB_NORM_METHOD},
119 {"no-scv-correction", no_argument, 0, OPT_NO_SCV_CORRECTION},
120 {0, 0, 0, 0} // terminator
121 };
122
print_usage()123 void print_usage()
124 {
125 fprintf(stderr, "cuffdiff v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
126 fprintf(stderr, "-----------------------------\n");
127
128 //NOTE: SPACES ONLY, bozo
129 fprintf(stderr, "Usage: cuffdiff [options] <transcripts.gtf> <sample1_hits.sam> <sample2_hits.sam> [... sampleN_hits.sam]\n");
130 fprintf(stderr, " Supply replicate SAMs as comma separated lists for each condition: sample1_rep1.sam,sample1_rep2.sam,...sample1_repM.sam\n");
131 fprintf(stderr, "General Options:\n");
132 fprintf(stderr, " -o/--output-dir write all output files to this directory [ default: ./ ]\n");
133 fprintf(stderr, " -L/--labels comma-separated list of condition labels\n");
134 fprintf(stderr, " --FDR False discovery rate used in testing [ default: 0.05 ]\n");
135 fprintf(stderr, " -M/--mask-file ignore all alignment within transcripts in this file [ default: NULL ]\n");
136 fprintf(stderr, " -C/--contrast-file Perform the constrasts specified in this file [ default: NULL ]\n"); // NOT YET DOCUMENTED, keep secret for now
137 //fprintf(stderr, " --norm-standards-file Housekeeping/spike genes to normalize libraries [ default: NULL ]\n"); // NOT YET DOCUMENTED, keep secret for now
138 fprintf(stderr, " -b/--frag-bias-correct use bias correction - reference fasta required [ default: NULL ]\n");
139 fprintf(stderr, " -u/--multi-read-correct use 'rescue method' for multi-reads [ default: FALSE ]\n");
140 #if ENABLE_THREADS
141 fprintf(stderr, " -p/--num-threads number of threads used during quantification [ default: 1 ]\n");
142 #endif
143 fprintf(stderr, " --no-diff Don't generate differential analysis files [ default: FALSE ]\n");
144 fprintf(stderr, " --no-js-tests Don't perform isoform switching tests [ default: FALSE ]\n");
145 fprintf(stderr, " -T/--time-series treat samples as a time-series [ default: FALSE ]\n");
146 fprintf(stderr, " --library-type Library prep used for input reads [ default: below ]\n");
147 fprintf(stderr, " --dispersion-method Method used to estimate dispersion models [ default: below ]\n");
148 fprintf(stderr, " --library-norm-method Method used to normalize library sizes [ default: below ]\n");
149
150 fprintf(stderr, "\nAdvanced Options:\n");
151 fprintf(stderr, " -m/--frag-len-mean average fragment length (unpaired reads only) [ default: 200 ]\n");
152 fprintf(stderr, " -s/--frag-len-std-dev fragment length std deviation (unpaired reads only) [ default: 80 ]\n");
153 fprintf(stderr, " -c/--min-alignment-count minimum number of alignments in a locus for testing [ default: 10 ]\n");
154 fprintf(stderr, " --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]\n");
155 fprintf(stderr, " --compatible-hits-norm count hits compatible with reference RNAs only [ default: TRUE ]\n");
156 fprintf(stderr, " --total-hits-norm count all hits for normalization [ default: FALSE ]\n");
157 fprintf(stderr, " -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]\n");
158 fprintf(stderr, " -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]\n");
159 fprintf(stderr, " --seed value of random number generator seed [ default: 0 ]\n");
160 fprintf(stderr, " --no-update-check do not contact server to check for update availability[ default: FALSE ]\n");
161 fprintf(stderr, " --emit-count-tables print count tables used to fit overdispersion [ DEPRECATED ]\n");
162 fprintf(stderr, " --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]\n");
163 fprintf(stderr, " --num-frag-count-draws Number of fragment generation samples [ default: 100 ]\n");
164 fprintf(stderr, " --num-frag-assign-draws Number of fragment assignment samples per generation [ default: 50 ]\n");
165 fprintf(stderr, " --max-frag-multihits Maximum number of alignments allowed per fragment [ default: unlim ]\n");
166 fprintf(stderr, " --min-outlier-p Min replicate p value to admit for testing [ DEPRECATED ]\n");
167 fprintf(stderr, " --min-reps-for-js-test Replicates needed for relative isoform shift testing [ default: 3 ]\n");
168 fprintf(stderr, " --no-effective-length-correction No effective length correction [ default: FALSE ]\n");
169 fprintf(stderr, " --no-length-correction No length correction [ default: FALSE ]\n");
170 fprintf(stderr, " -N/--upper-quartile-norm Deprecated, use --library-norm-method [ DEPRECATED ]\n");
171 fprintf(stderr, " --geometric-norm Deprecated, use --library-norm-method [ DEPRECATED ]\n");
172 fprintf(stderr, " --raw-mapped-norm Deprecated, use --library-norm-method [ DEPRECATED ]\n");
173 fprintf(stderr, " --poisson-dispersion Deprecated, use --dispersion-method [ DEPRECATED ]\n");
174 fprintf(stderr, "\nDebugging use only:\n");
175 fprintf(stderr, " --read-skip-fraction Skip a random subset of reads this size [ default: 0.0 ]\n");
176 fprintf(stderr, " --no-read-pairs Break all read pairs [ default: FALSE ]\n");
177 fprintf(stderr, " --trim-read-length Trim reads to be this long (keep 5' end) [ default: none ]\n");
178 fprintf(stderr, " --no-scv-correction Disable SCV correction [ default: FALSE ]\n");
179 print_library_table();
180 print_dispersion_method_table();
181 print_lib_norm_method_table();
182 }
183
parse_options(int argc,char ** argv)184 int parse_options(int argc, char** argv)
185 {
186 int option_index = 0;
187 int next_option;
188 string sample_label_list;
189 string dispersion_method_str;
190 string lib_norm_method_str;
191 do {
192 next_option = getopt_long_only(argc, argv, short_options, long_options, &option_index);
193 if (next_option == -1) /* Done with options. */
194 break;
195 switch (next_option) {
196 case 0:
197 /* If this option set a flag, do nothing else now. */
198 if (long_options[option_index].flag != 0)
199 break;
200 break;
201
202 case 'm':
203 user_provided_fld = true;
204 def_frag_len_mean = (uint32_t)parseInt(0, "-m/--frag-len-mean arg must be at least 0", print_usage);
205 break;
206 case 'c':
207 min_read_count = (uint32_t)parseInt(0, "-c/--min-alignment-count arg must be at least 0", print_usage);
208 break;
209 case 's':
210 user_provided_fld = true;
211 def_frag_len_std_dev = (uint32_t)parseInt(0, "-s/--frag-len-std-dev arg must be at least 0", print_usage);
212 break;
213 case 'p':
214 num_threads = (uint32_t)parseInt(1, "-p/--num-threads arg must be at least 1", print_usage);
215 break;
216 case 'F':
217 min_isoform_fraction = parseFloat(0, 1.0, "-F/--min-isoform-fraction must be between 0 and 1.0", print_usage);
218 break;
219 case 'L':
220 sample_label_list = optarg;
221 break;
222 case OPT_FDR:
223 FDR = (double)parseFloat(0.00, 1.00, "--FDR arg must be between 0 and 1", print_usage);
224 break;
225 case OPT_NUM_IMP_SAMPLES:
226 num_importance_samples = parseInt(1, "--num-importance-samples must be at least 1", print_usage);
227 break;
228 case OPT_MLE_MAX_ITER:
229 max_mle_iterations = parseInt(1, "--max-mle-iterations must be at least 1", print_usage);
230 break;
231 case OPT_BIAS_MODE:
232 if (!strcmp(optarg, "site"))
233 bias_mode = SITE;
234 else if (!strcmp(optarg, "pos"))
235 bias_mode = POS;
236 else if (!strcmp(optarg, "pos_vlmm"))
237 bias_mode = POS_VLMM;
238 else if (!strcmp(optarg, "vlmm"))
239 bias_mode = VLMM;
240 else if (!strcmp(optarg, "pos_site"))
241 bias_mode = POS_SITE;
242 else
243 {
244 fprintf(stderr, "Unknown bias mode.\n");
245 exit(1);
246 }
247 break;
248 case 'M':
249 {
250 mask_gtf_filename = optarg;
251 break;
252 }
253 case 'C':
254 {
255 contrast_filename = optarg;
256 break;
257 }
258 case OPT_NORM_STANDARDS_FILE:
259 {
260 norm_standards_filename = optarg;
261 break;
262 }
263 case OPT_USE_SAMPLE_SHEET:
264 {
265 use_sample_sheet = true;
266 break;
267 }
268 case 'v':
269 {
270 if (cuff_quiet)
271 {
272 fprintf(stderr, "Warning: Can't be both verbose and quiet! Setting verbose only.\n");
273 }
274 cuff_quiet = false;
275 cuff_verbose = true;
276 break;
277 }
278 case 'q':
279 {
280 if (cuff_verbose)
281 {
282 fprintf(stderr, "Warning: Can't be both verbose and quiet! Setting quiet only.\n");
283 }
284 cuff_verbose = false;
285 cuff_quiet = true;
286 break;
287 }
288 case 'o':
289 {
290 output_dir = optarg;
291 break;
292 }
293 case 'b':
294 {
295 fasta_dir = optarg;
296 corr_bias = true;
297 break;
298 }
299
300 case 'T':
301 {
302 samples_are_time_series = true;
303 break;
304 }
305 case 'N':
306 {
307 lib_norm_method_str = "quartile";
308 break;
309 }
310 case 'u':
311 {
312 corr_multi = true;
313 break;
314 }
315 case OPT_LIBRARY_TYPE:
316 {
317 library_type = optarg;
318 break;
319 }
320 case OPT_POISSON_DISPERSION:
321 {
322 fprintf (stderr, "Warning: --poisson-dispersion is deprecated, use --dispersion-method poisson instead.\n");
323 break;
324 }
325 case OPT_NO_UPDATE_CHECK:
326 {
327 no_update_check = true;
328 break;
329 }
330 case OPT_RANDOM_SEED:
331 {
332 random_seed = parseInt(0, "--seed must be at least 0", print_usage);
333 break;
334 }
335 case OPT_EMIT_COUNT_TABLES:
336 {
337 emit_count_tables = true;
338 break;
339 }
340 case OPT_COLLAPSE_COND_PROB:
341 {
342 cond_prob_collapse = false;
343 break;
344 }
345 case OPT_USE_COMPAT_MASS:
346 {
347 use_compat_mass = true;
348 break;
349 }
350 case OPT_USE_TOTAL_MASS:
351 {
352 use_total_mass = true;
353 break;
354 }
355 case OPT_USE_FISHER_COVARIANCE:
356 {
357 use_fisher_covariance = true;
358 break;
359 }
360 case OPT_USE_EMPIRICAL_COVARIANCE:
361 {
362 use_fisher_covariance = false;
363 break;
364 }
365 case OPT_SPLIT_MASS:
366 {
367 split_variance = false;
368 break;
369 }
370 case OPT_SPLIT_VARIANCE:
371 {
372 split_variance = true;
373 break;
374 }
375
376 case OPT_MAX_FRAGS_PER_BUNDLE:
377 {
378 max_frags_per_bundle = parseInt(0, "--max-bundle-frags must be at least 0", print_usage);
379 break;
380 }
381 case OPT_READ_SKIP_FRACTION:
382 {
383 read_skip_fraction = parseFloat(0, 1.0, "--read-skip-fraction must be between 0 and 1.0", print_usage);
384 break;
385 }
386 case OPT_NO_READ_PAIRS:
387 {
388 no_read_pairs = true;
389 break;
390 }
391 case OPT_TRIM_READ_LENGTH:
392 {
393 trim_read_length = parseInt(0, "--trim-read-length must be at least 1", print_usage);
394 break;
395 }
396
397 case OPT_NO_DIFF:
398 {
399 no_differential = true;
400 break;
401 }
402 case OPT_GEOMETRIC_NORM:
403 {
404 lib_norm_method_str = "geometric";
405 break;
406 }
407 case OPT_RAW_MAPPED_NORM:
408 {
409 lib_norm_method_str = "classic-fpkm";
410 break;
411 }
412 case OPT_NUM_FRAG_COUNT_DRAWS:
413 {
414 num_frag_count_draws = parseInt(1, "--num-frag-count-draws must be at least 1", print_usage);
415 break;
416 }
417 case OPT_NUM_FRAG_ASSIGN_DRAWS:
418 {
419 num_frag_assignments = parseInt(1, "--num-frag-assign-draws must be at least 1", print_usage);
420 break;
421 }
422 case OPT_FRAG_MAX_MULTIHITS:
423 {
424 max_frag_multihits = parseInt(1, "--max-frag-multihits must be at least 1", print_usage);
425 break;
426 }
427 case OPT_MIN_OUTLIER_P:
428 {
429 min_outlier_p = parseFloat(0, 1.0, "--min-outlier-p must be between 0 and 1.0", print_usage);
430 break;
431 }
432 case OPT_MIN_REPS_FOR_JS_TEST:
433 {
434 min_reps_for_js_test = parseInt(1, "--min-reps-for-js-test must be at least 1", print_usage);
435 break;
436 }
437 case OPT_NO_EFFECTIVE_LENGTH_CORRECTION:
438 {
439 no_effective_length_correction = true;
440 break;
441 }
442 case OPT_NO_LENGTH_CORRECTION:
443 {
444 no_length_correction = true;
445 break;
446 }
447 case OPT_NO_JS_TESTS:
448 {
449 no_js_tests = true;
450 break;
451 }
452 case OPT_DISPERSION_METHOD:
453 {
454 dispersion_method_str = optarg;
455 break;
456 }
457 case OPT_LIB_NORM_METHOD:
458 {
459 lib_norm_method_str = optarg;
460 break;
461 }
462 case OPT_NO_SCV_CORRECTION:
463 {
464 no_scv_correction = true;
465 break;
466 }
467 default:
468 print_usage();
469 return 1;
470 }
471 } while(next_option != -1);
472
473 if (library_type != "")
474 {
475 map<string, ReadGroupProperties>::iterator lib_itr =
476 library_type_table.find(library_type);
477 if (lib_itr == library_type_table.end())
478 {
479 fprintf(stderr, "Error: Library type %s not supported\n", library_type.c_str());
480 exit(1);
481 }
482 else
483 {
484 if (library_type == "transfrags")
485 {
486 allow_junk_filtering = false;
487 }
488 global_read_properties = &lib_itr->second;
489 }
490 }
491 else
492 {
493
494 }
495
496 // Set the count dispersion method to use
497 if (dispersion_method_str == "")
498 {
499 dispersion_method_str = default_dispersion_method;
500 }
501
502 map<string, DispersionMethod>::iterator disp_itr =
503 dispersion_method_table.find(dispersion_method_str);
504 if (disp_itr == dispersion_method_table.end())
505 {
506 fprintf(stderr, "Error: Dispersion method %s not supported\n", dispersion_method_str.c_str());
507 exit(1);
508 }
509 else
510 {
511 dispersion_method = disp_itr->second;
512 }
513
514 // Set the library size normalization method to use
515 if (lib_norm_method_str == "")
516 {
517 lib_norm_method_str = default_lib_norm_method;
518 }
519
520 map<string, LibNormalizationMethod>::iterator lib_norm_itr =
521 lib_norm_method_table.find(lib_norm_method_str);
522 if (lib_norm_itr == lib_norm_method_table.end())
523 {
524 fprintf(stderr, "Error: Dispersion method %s not supported\n", lib_norm_method_str.c_str());
525 exit(1);
526 }
527 else
528 {
529 lib_norm_method = lib_norm_itr->second;
530 }
531
532
533
534 if (use_total_mass && use_compat_mass)
535 {
536 fprintf (stderr, "Error: please supply only one of --compatibile-hits-norm and --total-hits-norm\n");
537 exit(1);
538 }
539
540 tokenize(sample_label_list, ",", sample_labels);
541
542 allow_junk_filtering = false;
543
544 return 0;
545 }
546
print_tests(FILE * fout,const char * sample_1_label,const char * sample_2_label,const SampleDiffs & de_tests)547 void print_tests(FILE* fout,
548 const char* sample_1_label,
549 const char* sample_2_label,
550 const SampleDiffs& de_tests)
551 {
552 for (SampleDiffs::const_iterator itr = de_tests.begin();
553 itr != de_tests.end();
554 ++itr)
555 {
556 const SampleDifference& test = itr->second;
557
558 string all_gene_ids = cat_strings(test.meta_data->gene_ids);
559 if (all_gene_ids == "")
560 all_gene_ids = "-";
561
562 string all_gene_names = cat_strings(test.meta_data->gene_names);
563 if (all_gene_names == "")
564 all_gene_names = "-";
565
566 string all_protein_ids = cat_strings(test.meta_data->protein_ids);
567 if (all_protein_ids == "")
568 all_protein_ids = "-";
569
570 fprintf(fout, "%s\t%s\t%s\t%s\t%s\t%s",
571 itr->first.c_str(),
572 all_gene_ids.c_str(),
573 all_gene_names.c_str(),
574 test.meta_data->locus_desc.c_str(),
575 sample_1_label,
576 sample_2_label);
577
578 double t = test.test_stat;
579 double r1 = test.value_1;
580 double r2 = test.value_2;
581 double d = test.differential;
582 double p = test.p_value;
583 double q = test.corrected_p;
584 const char* sig;
585 if (test.significant && test.test_status == OK)
586 sig = "yes";
587 else
588 sig = "no";
589
590 const char* status = "OK";
591 if (test.test_status == OK)
592 status = "OK";
593 else if (test.test_status == LOWDATA)
594 status = "LOWDATA";
595 else if (test.test_status == HIDATA)
596 status = "HIDATA";
597 else if (test.test_status == NOTEST)
598 status = "NOTEST";
599 else
600 status = "FAIL";
601
602 fprintf(fout, "\t%s\t%lg\t%lg\t%lg\t%lg\t%lg\t%lg\t%s", status, r1, r2, d, t, p, q, sig);
603 fprintf(fout, "\n");
604 }
605 }
606
print_FPKM_tracking(FILE * fout,const FPKMTrackingTable & tracking)607 void print_FPKM_tracking(FILE* fout,
608 const FPKMTrackingTable& tracking)
609 {
610 fprintf(fout,"tracking_id\tclass_code\tnearest_ref_id\tgene_id\tgene_short_name\ttss_id\tlocus\tlength\tcoverage");
611 FPKMTrackingTable::const_iterator first_itr = tracking.begin();
612 if (first_itr != tracking.end())
613 {
614 const FPKMTracking& track = first_itr->second;
615 const vector<FPKMContext>& fpkms = track.fpkm_series;
616 for (size_t i = 0; i < fpkms.size(); ++i)
617 {
618 fprintf(fout, "\t%s_FPKM\t%s_conf_lo\t%s_conf_hi\t%s_status", sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str());
619 }
620 }
621 fprintf(fout, "\n");
622 for (FPKMTrackingTable::const_iterator itr = tracking.begin(); itr != tracking.end(); ++itr)
623 {
624 const string& description = itr->first;
625 const FPKMTracking& track = itr->second;
626 const vector<FPKMContext>& fpkms = track.fpkm_series;
627
628 AbundanceStatus status = NUMERIC_OK;
629 BOOST_FOREACH (const FPKMContext& c, fpkms)
630 {
631 if (c.status == NUMERIC_FAIL)
632 status = NUMERIC_FAIL;
633 }
634
635 string all_gene_ids = cat_strings(track.gene_ids);
636 if (all_gene_ids == "")
637 all_gene_ids = "-";
638
639 string all_gene_names = cat_strings(track.gene_names);
640 if (all_gene_names == "")
641 all_gene_names = "-";
642
643 string all_tss_ids = cat_strings(track.tss_ids);
644 if (all_tss_ids == "")
645 all_tss_ids = "-";
646
647 char length_buff[33] = "-";
648 if (track.length)
649 sprintf(length_buff, "%d", track.length);
650
651 fprintf(fout, "%s\t%c\t%s\t%s\t%s\t%s\t%s\t%s\t%s",
652 description.c_str(),
653 track.classcode ? track.classcode : '-',
654 track.ref_match.c_str(),
655 all_gene_ids.c_str(),
656 all_gene_names.c_str(),
657 all_tss_ids.c_str(),
658 track.locus_tag.c_str(),
659 length_buff,
660 "-");
661
662 for (size_t i = 0; i < fpkms.size(); ++i)
663 {
664 double fpkm = fpkms[i].FPKM;
665 //double std_dev = sqrt(fpkms[i].FPKM_variance);
666 double fpkm_conf_hi = fpkms[i].FPKM_conf_hi;
667 double fpkm_conf_lo = fpkms[i].FPKM_conf_lo;
668 const char* status_str = "OK";
669
670 if (fpkms[i].status == NUMERIC_OK)
671 {
672 status_str = "OK";
673 }
674 else if (fpkms[i].status == NUMERIC_FAIL)
675 {
676 status_str = "FAIL";
677 }
678 else if (fpkms[i].status == NUMERIC_LOW_DATA)
679 {
680 status_str = "LOWDATA";
681 }
682 else if (fpkms[i].status == NUMERIC_HI_DATA)
683 {
684 status_str = "HIDATA";
685 }
686 else
687 {
688 assert(false);
689 }
690
691 fprintf(fout, "\t%lg\t%lg\t%lg\t%s", fpkm, fpkm_conf_lo, fpkm_conf_hi, status_str);
692 }
693
694 fprintf(fout, "\n");
695 }
696 }
697
print_count_tracking(FILE * fout,const FPKMTrackingTable & tracking)698 void print_count_tracking(FILE* fout,
699 const FPKMTrackingTable& tracking)
700 {
701 fprintf(fout,"tracking_id");
702 FPKMTrackingTable::const_iterator first_itr = tracking.begin();
703 if (first_itr != tracking.end())
704 {
705 const FPKMTracking& track = first_itr->second;
706 const vector<FPKMContext>& fpkms = track.fpkm_series;
707 for (size_t i = 0; i < fpkms.size(); ++i)
708 {
709 fprintf(fout, "\t%s_count\t%s_count_variance\t%s_count_uncertainty_var\t%s_count_dispersion_var\t%s_status", sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str());
710 }
711 }
712 fprintf(fout, "\n");
713 for (FPKMTrackingTable::const_iterator itr = tracking.begin(); itr != tracking.end(); ++itr)
714 {
715 const string& description = itr->first;
716 const FPKMTracking& track = itr->second;
717 const vector<FPKMContext>& fpkms = track.fpkm_series;
718
719 AbundanceStatus status = NUMERIC_OK;
720 BOOST_FOREACH (const FPKMContext& c, fpkms)
721 {
722 if (c.status == NUMERIC_FAIL)
723 status = NUMERIC_FAIL;
724 }
725
726 fprintf(fout, "%s",
727 description.c_str());
728
729 for (size_t i = 0; i < fpkms.size(); ++i)
730 {
731 const char* status_str = "OK";
732
733 if (fpkms[i].status == NUMERIC_OK)
734 {
735 status_str = "OK";
736 }
737 else if (fpkms[i].status == NUMERIC_FAIL)
738 {
739 status_str = "FAIL";
740 }
741 else if (fpkms[i].status == NUMERIC_LOW_DATA)
742 {
743 status_str = "LOWDATA";
744 }
745 else if (fpkms[i].status == NUMERIC_HI_DATA)
746 {
747 status_str = "HIDATA";
748 }
749 else
750 {
751 assert(false);
752 }
753
754 double external_counts = fpkms[i].count_mean;
755 double external_count_var = fpkms[i].count_var;
756 double uncertainty_var = fpkms[i].count_uncertainty_var;
757 double dispersion_var = fpkms[i].count_dispersion_var;
758 fprintf(fout, "\t%lg\t%lg\t%lg\t%lg\t%s", external_counts, external_count_var, uncertainty_var, dispersion_var, status_str);
759 }
760
761 fprintf(fout, "\n");
762 }
763 }
764
print_read_group_tracking(FILE * fout,const FPKMTrackingTable & tracking)765 void print_read_group_tracking(FILE* fout,
766 const FPKMTrackingTable& tracking)
767 {
768 fprintf(fout,"tracking_id\tcondition\treplicate\traw_frags\tinternal_scaled_frags\texternal_scaled_frags\tFPKM\teffective_length\tstatus");
769
770 fprintf(fout, "\n");
771 for (FPKMTrackingTable::const_iterator itr = tracking.begin(); itr != tracking.end(); ++itr)
772 {
773 const string& description = itr->first;
774 const FPKMTracking& track = itr->second;
775 const vector<FPKMContext>& fpkms = track.fpkm_series;
776
777 for (size_t i = 0; i < fpkms.size(); ++i)
778 {
779 for (size_t j = 0; j != fpkms[i].tracking_info_per_rep.size();
780 ++j)
781 {
782 double FPKM = fpkms[i].tracking_info_per_rep[j].fpkm;
783 double internal_count = fpkms[i].tracking_info_per_rep[j].count;
784 double external_count = internal_count / fpkms[i].tracking_info_per_rep[j].rg_props->external_scale_factor();
785 double raw_count = internal_count * fpkms[i].tracking_info_per_rep[j].rg_props->internal_scale_factor();
786 const string& condition_name = fpkms[i].tracking_info_per_rep[j].rg_props->condition_name();
787 AbundanceStatus status = fpkms[i].tracking_info_per_rep[j].status;
788
789 int rep_num = fpkms[i].tracking_info_per_rep[j].rg_props->replicate_num();
790
791 const char* status_str = "OK";
792
793 if (status == NUMERIC_OK)
794 {
795 status_str = "OK";
796 }
797 else if (status == NUMERIC_FAIL)
798 {
799 status_str = "FAIL";
800 }
801 else if (status == NUMERIC_LOW_DATA)
802 {
803 status_str = "LOWDATA";
804 }
805 else if (status == NUMERIC_HI_DATA)
806 {
807 status_str = "HIDATA";
808 }
809 else
810 {
811 assert(false);
812 }
813
814 fprintf(fout, "%s\t%s\t%d\t%lg\t%lg\t%lg\t%lg\t%s\t%s\n",
815 description.c_str(),
816 condition_name.c_str(),
817 rep_num,
818 raw_count,
819 internal_count,
820 external_count,
821 FPKM,
822 "-",
823 status_str);
824 }
825 }
826 }
827 }
828
print_read_group_info(FILE * fout,const vector<boost::shared_ptr<ReadGroupProperties>> & all_read_groups)829 void print_read_group_info(FILE* fout,
830 const vector<boost::shared_ptr<ReadGroupProperties> >& all_read_groups)
831 {
832 fprintf(fout, "file\tcondition\treplicate_num\ttotal_mass\tnorm_mass\tinternal_scale\texternal_scale\n");
833 for (size_t i = 0; i < all_read_groups.size(); ++i)
834 {
835 boost::shared_ptr<ReadGroupProperties const> rg_props = all_read_groups[i];
836 fprintf(fout, "%s\t%s\t%d\t%Lg\t%Lg\t%lg\t%lg\n",
837 rg_props->file_path().c_str(),
838 rg_props->condition_name().c_str(),
839 rg_props->replicate_num(),
840 rg_props->total_map_mass(),
841 rg_props->normalized_map_mass(),
842 rg_props->internal_scale_factor(),
843 rg_props->external_scale_factor());
844
845 }
846 }
847
print_run_info(FILE * fout)848 void print_run_info(FILE* fout)
849 {
850 fprintf(fout, "param\tvalue\n");
851 fprintf(fout, "cmd_line\t%s\n", cmd_str.c_str());
852 fprintf(fout, "version\t%s\n", PACKAGE_VERSION);
853 fprintf(fout, "SVN_revision\t%s\n",SVN_REVISION);
854 fprintf(fout, "boost_version\t%d\n", BOOST_VERSION);
855 }
856
p_value_lt(const SampleDifference * lhs,const SampleDifference * rhs)857 bool p_value_lt(const SampleDifference* lhs, const SampleDifference* rhs)
858 {
859 return lhs->p_value < rhs->p_value;
860 }
861
862 // Benjamani-Hochberg procedure
fdr_significance(double fdr,vector<SampleDifference * > & tests)863 int fdr_significance(double fdr,
864 vector<SampleDifference*>& tests)
865 {
866 sort(tests.begin(), tests.end(), p_value_lt);
867 vector<SampleDifference*> passing;
868
869 for (int k = 0; k < (int)tests.size(); ++k)
870 {
871 if (tests[k]->test_status == OK)
872 {
873 passing.push_back(tests[k]);
874 }
875 else
876 {
877 tests[k]->significant = false;
878 }
879 }
880 int significant = 0;
881 float pmin=1;
882 int n = (int) passing.size();
883 //use the same procedure as p.adjust(...,"BH") in R
884 for (int k = n-1; k >= 0; k--)
885 {
886 double corrected_p = (double) passing[k]->p_value * ((double) n/(double) (k+1));
887 //make sure that no entry with lower p-value will get higher q-value than any entry with higher p-value
888 if (corrected_p < pmin)
889 {
890 pmin = corrected_p;
891 }
892 else
893 {
894 corrected_p = pmin;
895 }
896 // make sure that the q-value is always <= 1
897 passing[k]->corrected_p = (corrected_p < 1 ? corrected_p : 1);
898 passing[k]->significant = (corrected_p <= fdr);
899 significant += passing[k]->significant;
900 }
901
902 return passing.size();
903 }
904
905
extract_sample_diffs(SampleDiffs & diff_map,vector<SampleDifference * > & diffs)906 void extract_sample_diffs(SampleDiffs& diff_map,
907 vector<SampleDifference*>& diffs)
908 {
909 for (SampleDiffs::iterator itr = diff_map.begin();
910 itr != diff_map.end();
911 ++itr)
912 {
913 diffs.push_back(&(itr->second));
914 }
915 }
916
917 #if ENABLE_THREADS
918 boost::mutex inspect_lock;
919 #endif
920
inspect_map_worker(ReplicatedBundleFactory & fac,int & tmp_min_frag_len,int & tmp_max_frag_len,IdToLocusMap & id_to_locus_map)921 void inspect_map_worker(ReplicatedBundleFactory& fac,
922 int& tmp_min_frag_len,
923 int& tmp_max_frag_len,
924 IdToLocusMap& id_to_locus_map)
925 {
926 #if ENABLE_THREADS
927 boost::this_thread::at_thread_exit(decr_pool_count);
928 #endif
929
930 int min_f = std::numeric_limits<int>::max();
931 int max_f = 0;
932
933 fac.inspect_replicate_maps(min_f, max_f, id_to_locus_map);
934
935 #if ENABLE_THREADS
936 inspect_lock.lock();
937 #endif
938 tmp_min_frag_len = min(min_f, tmp_min_frag_len);
939 tmp_max_frag_len = max(max_f, tmp_max_frag_len);
940 #if ENABLE_THREADS
941 inspect_lock.unlock();
942 #endif
943 }
944
learn_bias_worker(boost::shared_ptr<BundleFactory> fac)945 void learn_bias_worker(boost::shared_ptr<BundleFactory> fac)
946 {
947 #if ENABLE_THREADS
948 boost::this_thread::at_thread_exit(decr_pool_count);
949 #endif
950 boost::shared_ptr<ReadGroupProperties> rg_props = fac->read_group_properties();
951 BiasLearner* bl = new BiasLearner(rg_props->frag_len_dist());
952 learn_bias(*fac, *bl, false);
953 rg_props->bias_learner(boost::shared_ptr<BiasLearner>(bl));
954 }
955
956
957 boost::shared_ptr<TestLauncher> test_launcher;
958
quantitate_next_locus(const RefSequenceTable & rt,vector<boost::shared_ptr<ReplicatedBundleFactory>> & bundle_factories,boost::shared_ptr<TestLauncher> launcher)959 bool quantitate_next_locus(const RefSequenceTable& rt,
960 vector<boost::shared_ptr<ReplicatedBundleFactory> >& bundle_factories,
961 boost::shared_ptr<TestLauncher> launcher)
962 {
963 for (size_t i = 0; i < bundle_factories.size(); ++i)
964 {
965 boost::shared_ptr<SampleAbundances> s_ab = boost::shared_ptr<SampleAbundances>(new SampleAbundances);
966
967 #if ENABLE_THREADS
968 while(1)
969 {
970 locus_thread_pool_lock.lock();
971 if (locus_curr_threads < locus_num_threads)
972 {
973 break;
974 }
975
976 locus_thread_pool_lock.unlock();
977
978 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
979
980 }
981
982 locus_curr_threads++;
983 locus_thread_pool_lock.unlock();
984
985 boost::shared_ptr<HitBundle> pBundle = boost::shared_ptr<HitBundle>(new HitBundle());
986 bool non_empty = bundle_factories[i]->next_bundle(*pBundle, true);
987
988 if (pBundle->compatible_mass() > 0)
989 {
990 boost::thread quantitate(sample_worker,
991 non_empty,
992 pBundle,
993 boost::ref(rt),
994 boost::ref(*(bundle_factories[i])),
995 s_ab,
996 i,
997 launcher,
998 true);
999 }
1000 else
1001 {
1002 sample_worker(non_empty,
1003 pBundle,
1004 boost::ref(rt),
1005 boost::ref(*(bundle_factories[i])),
1006 s_ab,
1007 i,
1008 launcher,
1009 true);
1010 locus_thread_pool_lock.lock();
1011 locus_curr_threads--;
1012 locus_thread_pool_lock.unlock();
1013 }
1014 #else
1015 HitBundle bundle;
1016 bool non_empty = sample_factory.next_bundle(bundle);
1017
1018 sample_worker(non_emtpy,
1019 pBundle,
1020 boost::ref(rt),
1021 boost::ref(*(bundle_factories[i])),
1022 s_ab,
1023 i,
1024 launcher,
1025 true);
1026 #endif
1027 }
1028 return true;
1029 }
1030
parse_contrast_file(FILE * contrast_file,const vector<boost::shared_ptr<ReplicatedBundleFactory>> & factories,vector<pair<size_t,size_t>> & contrasts)1031 void parse_contrast_file(FILE* contrast_file,
1032 const vector<boost::shared_ptr<ReplicatedBundleFactory> >& factories,
1033 vector<pair<size_t, size_t > >& contrasts)
1034 {
1035
1036 char pBuf[10 * 1024];
1037 size_t non_blank_lines_read = 0;
1038
1039 map<string, pair<string, string> > contrast_table;
1040 map<string, size_t > factor_name_to_factory_idx;
1041
1042 for (size_t j = 0; j < factories.size(); ++j)
1043 {
1044 string factor_name = factories[j]->condition_name();
1045 if (factor_name_to_factory_idx.find(factor_name) != factor_name_to_factory_idx.end())
1046 {
1047 fprintf(stderr, "Error in contrast file: condition names must be unique! (%s is duplicated)", factor_name.c_str());
1048 exit(1);
1049 }
1050 factor_name_to_factory_idx[factor_name] = j;
1051 }
1052
1053 while (fgets(pBuf, 10*1024, contrast_file))
1054 {
1055 if (strlen(pBuf) > 0)
1056 {
1057 char* nl = strchr(pBuf, '\n');
1058 if (nl)
1059 *nl = 0;
1060
1061 string pBufstr = pBuf;
1062 string trimmed = boost::trim_copy(pBufstr);
1063
1064 if (trimmed.length() > 0 && trimmed[0] != '#')
1065 {
1066 non_blank_lines_read++;
1067 vector<string> columns;
1068 tokenize(trimmed, "\t", columns);
1069
1070 if (non_blank_lines_read == 1)
1071 continue;
1072
1073 if (columns.size() < 2)
1074 {
1075 if (columns.size() > 0)
1076 fprintf(stderr, "Malformed record in contrast file: \n > %s\n", pBuf);
1077 else
1078 continue;
1079 }
1080
1081 string factor_1 = columns[0];
1082 string factor_2 = columns[1];
1083
1084 if (columns.size() >= 3)
1085 {
1086 string contrast_name = columns[2];
1087 contrast_table.insert(make_pair(contrast_name, make_pair(factor_1, factor_2)));
1088 }
1089 else
1090 {
1091 char contrast_name[1024];
1092 sprintf(contrast_name, "contrast_%lu", contrast_table.size());
1093 contrast_table.insert(make_pair(contrast_name, make_pair(factor_1, factor_2)));
1094 }
1095 }
1096 }
1097 }
1098
1099 for (map<string, pair<string, string> >::const_iterator itr = contrast_table.begin();
1100 itr != contrast_table.end(); ++itr)
1101 {
1102 string factor_1 = itr->second.first;
1103 map<string, size_t >::iterator f1_itr = factor_name_to_factory_idx.find(factor_1);
1104 if (f1_itr == factor_name_to_factory_idx.end())
1105 {
1106 fprintf (stderr, "Error: condition %s not found among samples\n", factor_1.c_str());
1107 exit(1);
1108 }
1109 size_t f1_idx = f1_itr->second;
1110
1111 string factor_2 = itr->second.second;
1112 map<string, size_t >::iterator f2_itr = factor_name_to_factory_idx.find(factor_2);
1113 if (f2_itr == factor_name_to_factory_idx.end())
1114 {
1115 fprintf (stderr, "Error: condition %s not found among samples\n", factor_2.c_str());
1116 exit(1);
1117 }
1118 size_t f2_idx = f2_itr->second;
1119 contrasts.push_back(make_pair(f2_idx, f1_idx));
1120 }
1121 }
1122
parse_sample_sheet_file(FILE * sample_sheet_file,vector<string> & sample_labels,vector<string> & sam_hit_filename_lists)1123 void parse_sample_sheet_file(FILE* sample_sheet_file,
1124 vector<string>& sample_labels,
1125 vector<string>& sam_hit_filename_lists)
1126 {
1127
1128 char pBuf[10 * 1024];
1129 size_t non_blank_lines_read = 0;
1130
1131 sample_labels.clear();
1132
1133 map<string, vector<string> > sample_groups;
1134
1135 while (fgets(pBuf, 10*1024, sample_sheet_file))
1136 {
1137 if (strlen(pBuf) > 0)
1138 {
1139 char* nl = strchr(pBuf, '\n');
1140 if (nl)
1141 *nl = 0;
1142
1143 string pBufstr = pBuf;
1144 string trimmed = boost::trim_copy(pBufstr);
1145
1146 if (trimmed.length() > 0 && trimmed[0] != '#')
1147 {
1148 non_blank_lines_read++;
1149 vector<string> columns;
1150 tokenize(trimmed, "\t", columns);
1151
1152 if (non_blank_lines_read == 1)
1153 continue;
1154
1155 if (columns.size() < 2)
1156 {
1157 if (columns.size() > 0)
1158 fprintf(stderr, "Malformed record in sample sheet: \n > %s\n", pBuf);
1159 else
1160 continue;
1161 }
1162
1163 string sam_file = columns[0];
1164 string sample_group = columns[1];
1165
1166 pair<map<string, vector<string> >::iterator, bool> inserted = sample_groups.insert(make_pair(sample_group, vector<string>()));
1167 inserted.first->second.push_back(sam_file);
1168 }
1169 }
1170 }
1171
1172 for (map<string, vector<string> >::iterator itr = sample_groups.begin();
1173 itr != sample_groups.end(); ++itr)
1174 {
1175 sample_labels.push_back(itr->first);
1176 string sam_list = boost::join(itr->second, ",");
1177 sam_hit_filename_lists.push_back(sam_list);
1178 }
1179 }
1180
1181
init_default_contrasts(const vector<boost::shared_ptr<ReplicatedBundleFactory>> & factories,bool samples_are_time_series,vector<pair<size_t,size_t>> & contrasts)1182 void init_default_contrasts(const vector<boost::shared_ptr<ReplicatedBundleFactory> >& factories,
1183 bool samples_are_time_series,
1184 vector<pair<size_t, size_t > >& contrasts)
1185 {
1186
1187 for (size_t i = 1; i < factories.size(); ++i)
1188 {
1189 //bool multi_transcript_locus = samples[i]->transcripts.abundances().size() > 1;
1190
1191 int sample_to_start_test_against = 0;
1192
1193 if (samples_are_time_series)
1194 sample_to_start_test_against = i - 1;
1195
1196 for (size_t j = sample_to_start_test_against; j < i; ++j)
1197 {
1198 contrasts.push_back(make_pair(i,j));
1199 }
1200 }
1201 }
1202
parse_norm_standards_file(FILE * norm_standards_file)1203 void parse_norm_standards_file(FILE* norm_standards_file)
1204 {
1205 char pBuf[10 * 1024];
1206 size_t non_blank_lines_read = 0;
1207
1208 boost::shared_ptr<map<string, LibNormStandards> > norm_standards(new map<string, LibNormStandards>);
1209
1210 while (fgets(pBuf, 10*1024, norm_standards_file))
1211 {
1212 if (strlen(pBuf) > 0)
1213 {
1214 char* nl = strchr(pBuf, '\n');
1215 if (nl)
1216 *nl = 0;
1217
1218 string pBufstr = pBuf;
1219 string trimmed = boost::trim_copy(pBufstr);
1220
1221 if (trimmed.length() > 0 && trimmed[0] != '#')
1222 {
1223 non_blank_lines_read++;
1224 vector<string> columns;
1225 tokenize(trimmed, "\t", columns);
1226
1227 if (non_blank_lines_read == 1)
1228 continue;
1229
1230 if (columns.size() < 1) //
1231 {
1232 continue;
1233 }
1234
1235 string gene_id = columns[0];
1236 LibNormStandards L;
1237 norm_standards->insert(make_pair(gene_id, L));
1238 }
1239 }
1240 }
1241 lib_norm_standards = norm_standards;
1242 }
1243
1244
print_variability_models(FILE * var_model_out,const vector<boost::shared_ptr<ReplicatedBundleFactory>> & factories)1245 void print_variability_models(FILE* var_model_out, const vector<boost::shared_ptr<ReplicatedBundleFactory> >& factories)
1246 {
1247
1248 fprintf(var_model_out, "condition\tlocus\tcompatible_count_mean\tcompatible_count_var\ttotal_count_mean\ttotal_count_var\tfitted_var\n");
1249
1250 for (size_t i = 0; i < factories.size(); ++i)
1251 {
1252 string factor_name = factories[i]->condition_name();
1253 boost::shared_ptr<ReadGroupProperties> rg = factories[i]->factories()[0]->read_group_properties();
1254 boost::shared_ptr<MassDispersionModel const> model = rg->mass_dispersion_model();
1255 // const vector<double>& means = model->scaled_compatible_mass_means();
1256 // const vector<double>& raw_vars = model->scaled_compatible_variances();
1257
1258 const vector<LocusCount>& common_scale_compatible_counts = rg->common_scale_compatible_counts();
1259 for (size_t j = 0; j < common_scale_compatible_counts.size(); ++j)
1260 {
1261 string locus_desc = common_scale_compatible_counts[j].locus_desc;
1262 pair<double, double> compat_mean_and_var = model->get_compatible_mean_and_var(locus_desc);
1263 pair<double, double> total_mean_and_var = model->get_total_mean_and_var(locus_desc);
1264 // double total_compat_count = 0;
1265 // if (itr != locus_to_total_count_table.end())
1266 // total_compat_count = itr->second.count;
1267
1268
1269 fprintf(var_model_out, "%s\t%s\t%lg\t%lg\t%lg\t%lg\t%lg\n",
1270 factor_name.c_str(),
1271 locus_desc.c_str(),
1272 compat_mean_and_var.first,
1273 compat_mean_and_var.second,
1274 total_mean_and_var.first,
1275 total_mean_and_var.second,
1276 model->scale_mass_variance(compat_mean_and_var.first));
1277 }
1278 }
1279 fclose(var_model_out);
1280
1281 }
1282
1283 struct DispModelAverageContext
1284 {
1285 double compatible_count_mean;
1286 double total_count_mean;
1287 double compatible_count_var;
1288 double total_count_var;
1289
1290 double fitted_var;
1291 double weight;
1292 };
1293
fit_dispersions(vector<boost::shared_ptr<ReplicatedBundleFactory>> & bundle_factories)1294 void fit_dispersions(vector<boost::shared_ptr<ReplicatedBundleFactory> >& bundle_factories)
1295 {
1296 if (dispersion_method == PER_CONDITION)
1297 {
1298 for (size_t i = 0; i < bundle_factories.size(); ++i)
1299 {
1300 bundle_factories[i]->fit_dispersion_model();
1301 }
1302 }
1303 else if (dispersion_method == BLIND)
1304 {
1305 size_t num_samples = 0;
1306 for (size_t cond_idx = 0; cond_idx < bundle_factories.size(); ++cond_idx)
1307 {
1308 const vector<boost::shared_ptr<BundleFactory> >& factories = bundle_factories[cond_idx]->factories();
1309 for (size_t fac_idx = 0; fac_idx < factories.size(); ++fac_idx)
1310 {
1311 num_samples++;
1312 }
1313 }
1314 vector<double> scale_factors;
1315
1316 vector<LocusCountList> sample_compatible_count_table;
1317 vector<LocusCountList> sample_total_count_table;
1318 size_t curr_fac = 0;
1319 for (size_t cond_idx = 0; cond_idx < bundle_factories.size(); ++cond_idx)
1320 {
1321 vector<boost::shared_ptr<BundleFactory> > factories = bundle_factories[cond_idx]->factories();
1322 for (size_t fac_idx = 0; fac_idx < factories.size(); ++fac_idx)
1323 {
1324 boost::shared_ptr<BundleFactory> fac = factories[fac_idx];
1325
1326 boost::shared_ptr<ReadGroupProperties> rg_props = fac->read_group_properties();
1327 const vector<LocusCount>& compatible_count_table = rg_props->common_scale_compatible_counts();
1328 const vector<LocusCount>& total_count_table = rg_props->common_scale_total_counts();
1329
1330 for (size_t i = 0; i < compatible_count_table.size(); ++i)
1331 {
1332 const LocusCount& c = compatible_count_table[i];
1333 double common_scale_compatible_count = c.count;
1334 double common_scale_total_count = total_count_table[i].count;
1335
1336 if (i >= sample_compatible_count_table.size())
1337 {
1338 LocusCountList locus_count(c.locus_desc, num_samples, c.num_transcripts, c.gene_ids, c.gene_short_names);
1339 sample_compatible_count_table.push_back(locus_count);
1340 sample_compatible_count_table.back().counts[0] = common_scale_compatible_count;
1341 sample_total_count_table.push_back(locus_count);
1342 sample_total_count_table.back().counts[0] = common_scale_total_count;
1343 }
1344 else
1345 {
1346 if (sample_compatible_count_table[i].locus_desc != c.locus_desc)
1347 {
1348 fprintf (stderr, "Error: bundle boundaries don't match across replicates!\n");
1349 exit(1);
1350 }
1351 sample_compatible_count_table[i].counts[curr_fac + fac_idx] = common_scale_compatible_count;
1352 sample_total_count_table[i].counts[curr_fac + fac_idx] = common_scale_total_count;
1353 }
1354 }
1355 scale_factors.push_back(rg_props->internal_scale_factor());
1356
1357 }
1358
1359 curr_fac += factories.size();
1360 }
1361
1362 boost::shared_ptr<MassDispersionModel> disperser = fit_dispersion_model("blind", scale_factors, sample_compatible_count_table);
1363
1364 vector<pair<double, double> > compatible_means_and_vars;
1365 calculate_count_means_and_vars(sample_compatible_count_table,
1366 compatible_means_and_vars);
1367
1368 for (size_t i = 0; i < sample_compatible_count_table.size(); ++i)
1369 {
1370 const LocusCountList& p = sample_compatible_count_table[i];
1371 double mean = compatible_means_and_vars[i].first;
1372 double var = compatible_means_and_vars[i].second;
1373 disperser->set_compatible_mean_and_var(p.locus_desc, make_pair(mean, var));
1374 }
1375
1376 vector<pair<double, double> > total_means_and_vars;
1377 calculate_count_means_and_vars(sample_total_count_table,
1378 total_means_and_vars);
1379
1380 for (size_t i = 0; i < sample_total_count_table.size(); ++i)
1381 {
1382 const LocusCountList& p = sample_compatible_count_table[i];
1383 double mean = total_means_and_vars[i].first;
1384 double var = total_means_and_vars[i].second;
1385 disperser->set_total_mean_and_var(p.locus_desc, make_pair(mean, var));
1386 }
1387
1388 for (size_t cond_idx = 0; cond_idx < bundle_factories.size(); ++cond_idx)
1389 {
1390 bundle_factories[cond_idx]->mass_dispersion_model(disperser);
1391 }
1392 }
1393 else if (dispersion_method == POOLED)
1394 {
1395 for (size_t i = 0; i < bundle_factories.size(); ++i)
1396 {
1397 bundle_factories[i]->fit_dispersion_model();
1398 }
1399 // now need to replace them with the average
1400
1401 boost::shared_ptr<MassDispersionModel> pooled_model;
1402 // Let's compute the pooled average of the dispersion models
1403 if (dispersion_method != BLIND)
1404 {
1405 vector<boost::shared_ptr<MassDispersionModel const> > disp_models;
1406 double total_replicates = 0.0;
1407 vector<double> disp_model_weight;
1408 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
1409 {
1410 total_replicates += fac->num_replicates();
1411 }
1412 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
1413 {
1414 if (fac->num_replicates() > 1)
1415 {
1416 disp_models.push_back(fac->mass_dispersion_model());
1417 disp_model_weight.push_back((double)fac->num_replicates() / total_replicates);
1418 }
1419 }
1420
1421 double max_mass = 0.0;
1422
1423 BOOST_FOREACH(boost::shared_ptr<MassDispersionModel const> disp, disp_models)
1424 {
1425 if (disp->scaled_compatible_mass_means().empty() == false && max_mass < disp->scaled_compatible_mass_means().back())
1426 {
1427 max_mass = disp->scaled_compatible_mass_means().back();
1428 }
1429 }
1430
1431 map<std::string, vector<DispModelAverageContext> > disp_info_by_locus;
1432 for (size_t disp_idx = 0; disp_idx < disp_models.size(); ++disp_idx)
1433 {
1434 boost::shared_ptr<MassDispersionModel const> disp = disp_models[disp_idx];
1435
1436 const std::map<std::string, std::pair<double, double> >& total_mv_by_locus = disp->total_mv_by_locus();
1437 const std::map<std::string, std::pair<double, double> >& compatible_mv_by_locus = disp->compatible_mv_by_locus();
1438
1439 for (map<std::string, std::pair<double, double> >::const_iterator itr = compatible_mv_by_locus.begin();
1440 itr != compatible_mv_by_locus.end(); ++itr)
1441 {
1442
1443 std::map<std::string, std::pair<double, double> >::const_iterator total_itr = total_mv_by_locus.find(itr->first);
1444 if (total_itr == total_mv_by_locus.end())
1445 continue;
1446
1447 pair<map<std::string, vector<DispModelAverageContext> >::iterator, bool> ins_pair =
1448 disp_info_by_locus.insert(make_pair(itr->first, vector<DispModelAverageContext>()));
1449
1450
1451 DispModelAverageContext ctx;
1452 ctx.compatible_count_mean = itr->second.first;
1453 ctx.compatible_count_var = itr->second.second;
1454 ctx.total_count_mean = total_itr->second.first;
1455 ctx.total_count_var = total_itr->second.second;
1456 if (use_compat_mass)
1457 ctx.fitted_var = disp->scale_mass_variance(ctx.compatible_count_mean);
1458 else
1459 ctx.fitted_var = disp->scale_mass_variance(ctx.total_count_mean);
1460
1461 ctx.weight = disp_model_weight[disp_idx];
1462
1463 ins_pair.first->second.push_back(ctx);
1464 }
1465 }
1466
1467 map<string, DispModelAverageContext> pooled_info_by_locus;
1468
1469 for (map<std::string, vector<DispModelAverageContext> >::const_iterator itr = disp_info_by_locus.begin();
1470 itr != disp_info_by_locus.end();
1471 ++itr)
1472 {
1473 DispModelAverageContext avg_ctx;
1474 avg_ctx.compatible_count_mean = 0;
1475 avg_ctx.compatible_count_var = 0;
1476 avg_ctx.total_count_mean = 0;
1477 avg_ctx.total_count_var = 0;
1478 avg_ctx.fitted_var = 0;
1479
1480 double total_weight = 0.0;
1481 for (size_t i = 0; i < itr->second.size(); ++i)
1482 {
1483 total_weight += itr->second[i].weight;
1484 }
1485
1486 for (size_t i = 0; i < itr->second.size(); ++i)
1487 {
1488 avg_ctx.compatible_count_mean += (itr->second[i].weight / total_weight) * itr->second[i].compatible_count_mean;
1489 avg_ctx.compatible_count_var += (itr->second[i].weight / total_weight) * itr->second[i].compatible_count_var;
1490 avg_ctx.total_count_mean += (itr->second[i].weight / total_weight) * itr->second[i].total_count_mean;
1491 avg_ctx.total_count_var += (itr->second[i].weight / total_weight) * itr->second[i].total_count_var;
1492 }
1493 pooled_info_by_locus[itr->first] = avg_ctx;
1494 }
1495
1496 vector<double> compatible_mass;
1497 vector<double> compatible_variances;
1498 vector<double> est_fitted_var;
1499 double epsilon = 0.2;
1500 for (double frag_idx = 0.0; frag_idx < max_mass; frag_idx += epsilon)
1501 {
1502 compatible_mass.push_back(frag_idx);
1503 double var_est = 0.0;
1504 for(size_t i = 0; i < disp_models.size(); ++i)
1505 {
1506 boost::shared_ptr<MassDispersionModel const> disp = disp_models[i];
1507 double weight = disp_model_weight[i];
1508 var_est += disp->scale_mass_variance(frag_idx) * weight;
1509 }
1510 compatible_variances.push_back(var_est);
1511 est_fitted_var.push_back(var_est);
1512 }
1513
1514 pooled_model = boost::shared_ptr<MassDispersionModel>(new MassDispersionModel("pooled", compatible_mass, compatible_variances, est_fitted_var));
1515
1516 for (map<std::string, DispModelAverageContext>::iterator itr = pooled_info_by_locus.begin();
1517 itr != pooled_info_by_locus.end();
1518 ++itr)
1519 {
1520 const string& locus = itr->first;
1521 pair<double, double> cmv = make_pair(itr->second.compatible_count_mean, itr->second.compatible_count_var);
1522 pooled_model->set_compatible_mean_and_var(locus, cmv);
1523
1524 pair<double, double> tmv = make_pair(itr->second.total_count_mean, itr->second.total_count_var);
1525 pooled_model->set_total_mean_and_var(locus, tmv);
1526 }
1527 }
1528
1529 if (dispersion_method == POOLED)
1530 {
1531 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
1532 {
1533 fac->mass_dispersion_model(pooled_model);
1534 }
1535 }
1536
1537
1538 }
1539 else if (dispersion_method == POISSON)
1540 {
1541 boost::shared_ptr<MassDispersionModel> disperser = boost::shared_ptr<MassDispersionModel>(new PoissonDispersionModel(""));
1542 for (size_t i = 0; i < bundle_factories.size(); ++i)
1543 {
1544 bundle_factories[i]->mass_dispersion_model(disperser);
1545 }
1546
1547 }
1548 else
1549 {
1550 fprintf (stderr, "Error: unknown dispersion method requested\n");
1551 }
1552 }
1553
driver(FILE * ref_gtf,FILE * mask_gtf,FILE * contrast_file,FILE * norm_standards_file,vector<string> & sam_hit_filename_lists,Outfiles & outfiles)1554 void driver(FILE* ref_gtf, FILE* mask_gtf, FILE* contrast_file, FILE* norm_standards_file, vector<string>& sam_hit_filename_lists, Outfiles& outfiles)
1555 {
1556
1557 ReadTable it;
1558 RefSequenceTable rt(true, false);
1559
1560 vector<boost::shared_ptr<Scaffold> > ref_mRNAs;
1561
1562 vector<boost::shared_ptr<ReplicatedBundleFactory> > bundle_factories;
1563 vector<boost::shared_ptr<ReadGroupProperties> > all_read_groups;
1564
1565 for (size_t i = 0; i < sam_hit_filename_lists.size(); ++i)
1566 {
1567 vector<string> sam_hit_filenames;
1568 tokenize(sam_hit_filename_lists[i], ",", sam_hit_filenames);
1569
1570 vector<boost::shared_ptr<BundleFactory> > replicate_factories;
1571
1572 string condition_name = sample_labels[i];
1573
1574 for (size_t j = 0; j < sam_hit_filenames.size(); ++j)
1575 {
1576 boost::shared_ptr<HitFactory> hs;
1577 boost::shared_ptr<BundleFactory> hf;
1578 try
1579 {
1580 hs = boost::shared_ptr<HitFactory>(new PrecomputedExpressionHitFactory(sam_hit_filenames[j], it, rt));
1581 hf = boost::shared_ptr<BundleFactory>(new PrecomputedExpressionBundleFactory(static_pointer_cast<PrecomputedExpressionHitFactory>(hs)));
1582 }
1583
1584 catch(boost::archive::archive_exception & e)
1585 {
1586 hs = boost::shared_ptr<HitFactory>(createSamHitFactory(sam_hit_filenames[j], it, rt));
1587 hf = boost::shared_ptr<BundleFactory>(new BundleFactory(hs, REF_DRIVEN));
1588 }
1589
1590
1591 boost::shared_ptr<ReadGroupProperties> rg_props(new ReadGroupProperties);
1592
1593 if (global_read_properties)
1594 {
1595 *rg_props = *global_read_properties;
1596 }
1597 else
1598 {
1599 *rg_props = hs->read_group_properties();
1600 }
1601
1602 rg_props->checked_parameters(hs->read_group_properties().checked_parameters());
1603 rg_props->condition_name(condition_name);
1604 rg_props->replicate_num(j);
1605 rg_props->file_path(sam_hit_filenames[j]);
1606
1607 all_read_groups.push_back(rg_props);
1608
1609 hf->read_group_properties(rg_props);
1610
1611 replicate_factories.push_back(hf);
1612 //replicate_factories.back()->set_ref_rnas(ref_mRNAs);
1613 }
1614
1615 bundle_factories.push_back(boost::shared_ptr<ReplicatedBundleFactory>(new ReplicatedBundleFactory(replicate_factories, condition_name)));
1616 }
1617
1618 boost::crc_32_type ref_gtf_crc_result;
1619 ::load_ref_rnas(ref_gtf, rt, ref_mRNAs, ref_gtf_crc_result, corr_bias, false);
1620 if (ref_mRNAs.empty())
1621 return;
1622
1623 vector<boost::shared_ptr<Scaffold> > mask_rnas;
1624 if (mask_gtf)
1625 {
1626 boost::crc_32_type mask_gtf_crc_result;
1627 ::load_ref_rnas(mask_gtf, rt, mask_rnas, mask_gtf_crc_result, false, false);
1628 }
1629
1630 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
1631 {
1632 fac->set_ref_rnas(ref_mRNAs);
1633 if (mask_gtf)
1634 fac->set_mask_rnas(mask_rnas);
1635 }
1636
1637 vector<pair<size_t, size_t > > contrasts;
1638 if (contrast_file != NULL)
1639 {
1640 parse_contrast_file(contrast_file, bundle_factories, contrasts);
1641 }
1642 else
1643 {
1644 init_default_contrasts(bundle_factories, samples_are_time_series, contrasts);
1645 }
1646
1647 if (norm_standards_file != NULL)
1648 {
1649 parse_norm_standards_file(norm_standards_file);
1650 }
1651
1652 validate_cross_sample_parameters(all_read_groups);
1653
1654 #if ENABLE_THREADS
1655 locus_num_threads = num_threads;
1656 #endif
1657
1658 // Validate the dispersion method the user's chosen.
1659 int most_reps = -1;
1660 int most_reps_idx = 0;
1661
1662 bool single_replicate_fac = false;
1663
1664 for (size_t i = 0; i < bundle_factories.size(); ++i)
1665 {
1666 ReplicatedBundleFactory& fac = *(bundle_factories[i]);
1667 if (fac.num_replicates() > most_reps)
1668 {
1669 most_reps = fac.num_replicates();
1670 most_reps_idx = i;
1671 }
1672 if (most_reps == 1)
1673 {
1674 single_replicate_fac = true;
1675 if (dispersion_method == PER_CONDITION)
1676 {
1677 fprintf(stderr, "Error: Dispersion method 'per-condition' requires that all conditions have at least 2 replicates. Please use either 'pooled' or 'blind'\n");
1678 exit(1);
1679 }
1680 }
1681 }
1682
1683 if (most_reps == 1 && (dispersion_method != BLIND || dispersion_method != POISSON))
1684 {
1685 fprintf(stderr, "Warning: No conditions are replicated, switching to 'blind' dispersion method\n");
1686 dispersion_method = BLIND;
1687 }
1688
1689
1690 //bool pool_all_samples = ((most_reps <= 1 && dispersion_method == NOT_SET) || dispersion_method == BLIND);
1691
1692
1693 int tmp_min_frag_len = numeric_limits<int>::max();
1694 int tmp_max_frag_len = 0;
1695
1696 ProgressBar p_bar("Inspecting maps and determining fragment length distributions.",0);
1697
1698 IdToLocusMap id_to_locus_map(boost::shared_ptr<map<string, set<string> > >(new map<string, set<string> >()));
1699 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
1700 {
1701 #if ENABLE_THREADS
1702 while(1)
1703 {
1704 locus_thread_pool_lock.lock();
1705 if (locus_curr_threads < locus_num_threads)
1706 {
1707 break;
1708 }
1709
1710 locus_thread_pool_lock.unlock();
1711
1712 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1713 }
1714
1715 locus_curr_threads++;
1716 locus_thread_pool_lock.unlock();
1717
1718 boost::thread inspect(inspect_map_worker,
1719 boost::ref(*fac),
1720 boost::ref(tmp_min_frag_len),
1721 boost::ref(tmp_max_frag_len),
1722 boost::ref(id_to_locus_map));
1723 #else
1724 inspect_map_worker(boost::ref(*fac),
1725 boost::ref(tmp_min_frag_len),
1726 boost::ref(tmp_max_frag_len),
1727 id_to_locus_map);
1728 #endif
1729 }
1730
1731 // wait for the workers to finish up before reporting everthing.
1732 #if ENABLE_THREADS
1733 while(1)
1734 {
1735 locus_thread_pool_lock.lock();
1736 if (locus_curr_threads == 0)
1737 {
1738 locus_thread_pool_lock.unlock();
1739 break;
1740 }
1741 locus_thread_pool_lock.unlock();
1742
1743 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1744 }
1745 #endif
1746
1747 normalize_counts(all_read_groups);
1748 fit_dispersions(bundle_factories);
1749
1750 print_variability_models(outfiles.var_model_out, bundle_factories);
1751
1752 for (size_t i = 0; i < all_read_groups.size(); ++i)
1753 {
1754 boost::shared_ptr<ReadGroupProperties> rg = all_read_groups[i];
1755 fprintf(stderr, "> Map Properties:\n");
1756
1757 fprintf(stderr, ">\tNormalized Map Mass: %.2Lf\n", rg->normalized_map_mass());
1758 fprintf(stderr, ">\tRaw Map Mass: %.2Lf\n", rg->total_map_mass());
1759 if (corr_multi)
1760 fprintf(stderr,">\tNumber of Multi-Reads: %zu (with %zu total hits)\n", rg->multi_read_table()->num_multireads(), rg->multi_read_table()->num_multihits());
1761
1762 if (rg->frag_len_dist()->source() == LEARNED)
1763 {
1764 fprintf(stderr, ">\tFragment Length Distribution: Empirical (learned)\n");
1765 fprintf(stderr, ">\t Estimated Mean: %.2f\n", rg->frag_len_dist()->mean());
1766 fprintf(stderr, ">\t Estimated Std Dev: %.2f\n", rg->frag_len_dist()->std_dev());
1767 }
1768 else
1769 {
1770 if (rg->frag_len_dist()->source() == USER)
1771 fprintf(stderr, ">\tFragment Length Distribution: Truncated Gaussian (user-specified)\n");
1772 else //rg->frag_len_dist()->source == FLD::DEFAULT
1773 fprintf(stderr, ">\tFragment Length Distribution: Truncated Gaussian (default)\n");
1774 fprintf(stderr, ">\t Default Mean: %d\n", def_frag_len_mean);
1775 fprintf(stderr, ">\t Default Std Dev: %d\n", def_frag_len_std_dev);
1776 }
1777 }
1778
1779 long double total_norm_mass = 0.0;
1780 long double total_mass = 0.0;
1781 BOOST_FOREACH (boost::shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
1782 {
1783 total_norm_mass += rg_props->normalized_map_mass();
1784 total_mass += rg_props->total_map_mass();
1785 }
1786
1787 min_frag_len = tmp_min_frag_len;
1788 max_frag_len = tmp_max_frag_len;
1789
1790 final_est_run = false;
1791
1792 double num_bundles = (double)bundle_factories[0]->num_bundles();
1793
1794 // if (corr_bias && corr_multi)
1795 // p_bar = ProgressBar("Calculating initial abundance estimates for bias and multi-read correction.", num_bundles);
1796 // else if (corr_bias)
1797 // p_bar = ProgressBar("Calculating initial abundance estimates for bias correction.", num_bundles);
1798 // else if (corr_multi)
1799 // p_bar = ProgressBar("Calculating initial abundance estimates for multi-read correction.", num_bundles);
1800 // else
1801
1802 p_bar = ProgressBar("Calculating preliminary abundance estimates", num_bundles);
1803
1804 Tracking tracking;
1805
1806 test_launcher = boost::shared_ptr<TestLauncher>(new TestLauncher(bundle_factories.size(), contrasts, NULL, &tracking, &p_bar, true));
1807
1808 if (model_mle_error || corr_bias || corr_multi) // Only run initial estimation if correcting bias or multi-reads
1809 {
1810 while (1)
1811 {
1812 boost::shared_ptr<vector<boost::shared_ptr<SampleAbundances> > > abundances(new vector<boost::shared_ptr<SampleAbundances> >());
1813 quantitate_next_locus(rt, bundle_factories, test_launcher);
1814 bool more_loci_remain = false;
1815 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1816 {
1817 if (rep_fac->bundles_remain())
1818 {
1819 more_loci_remain = true;
1820 break;
1821 }
1822 }
1823
1824 if (!more_loci_remain)
1825 {
1826 // wait for the workers to finish up before breaking out.
1827 #if ENABLE_THREADS
1828 while(1)
1829 {
1830 locus_thread_pool_lock.lock();
1831 if (locus_curr_threads == 0)
1832 {
1833 locus_thread_pool_lock.unlock();
1834 break;
1835 }
1836
1837 locus_thread_pool_lock.unlock();
1838
1839 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1840
1841 }
1842 #endif
1843 break;
1844 }
1845 }
1846
1847 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1848 {
1849 rep_fac->reset();
1850 }
1851
1852 p_bar.complete();
1853 }
1854 if (corr_bias)
1855 {
1856 bias_run = true;
1857 p_bar = ProgressBar("Learning bias parameters.", 0);
1858 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1859 {
1860 BOOST_FOREACH (boost::shared_ptr<BundleFactory> fac, rep_fac->factories())
1861 {
1862 #if ENABLE_THREADS
1863 while(1)
1864 {
1865 locus_thread_pool_lock.lock();
1866 if (locus_curr_threads < locus_num_threads)
1867 {
1868 break;
1869 }
1870
1871 locus_thread_pool_lock.unlock();
1872
1873 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1874 }
1875 locus_curr_threads++;
1876 locus_thread_pool_lock.unlock();
1877
1878 boost::thread bias(learn_bias_worker, fac);
1879 #else
1880 learn_bias_worker(fac);
1881 #endif
1882 }
1883 }
1884
1885 // wait for the workers to finish up before reporting everthing.
1886 #if ENABLE_THREADS
1887 while(1)
1888 {
1889 locus_thread_pool_lock.lock();
1890 if (locus_curr_threads == 0)
1891 {
1892 locus_thread_pool_lock.unlock();
1893 break;
1894 }
1895
1896 locus_thread_pool_lock.unlock();
1897
1898 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1899 }
1900 #endif
1901 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1902 {
1903 rep_fac->reset();
1904 }
1905 bias_run = false;
1906 }
1907
1908 fprintf(outfiles.bias_out, "condition_name\treplicate_num\tparam\tpos_i\tpos_j\tvalue\n");
1909 BOOST_FOREACH (boost::shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
1910 {
1911 if (rg_props->bias_learner())
1912 rg_props->bias_learner()->output(outfiles.bias_out, rg_props->condition_name(), rg_props->replicate_num());
1913 }
1914
1915
1916 // Allow the multiread tables to do their thing...
1917 BOOST_FOREACH (boost::shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
1918 {
1919 rg_props->multi_read_table()->valid_mass(true);
1920 }
1921
1922 test_launcher->clear_tracking_data();
1923
1924 Tests tests;
1925
1926 int N = (int)sam_hit_filename_lists.size();
1927
1928 tests.isoform_de_tests = vector<vector<SampleDiffs> >(N);
1929 tests.tss_group_de_tests = vector<vector<SampleDiffs> >(N);
1930 tests.gene_de_tests = vector<vector<SampleDiffs> >(N);
1931 tests.cds_de_tests = vector<vector<SampleDiffs> >(N);
1932 tests.diff_splicing_tests = vector<vector<SampleDiffs> >(N);
1933 tests.diff_promoter_tests = vector<vector<SampleDiffs> >(N);
1934 tests.diff_cds_tests = vector<vector<SampleDiffs> >(N);
1935
1936 for (int i = 0; i < N; ++i)
1937 {
1938 tests.isoform_de_tests[i] = vector<SampleDiffs>(N);
1939 tests.tss_group_de_tests[i] = vector<SampleDiffs>(N);
1940 tests.gene_de_tests[i] = vector<SampleDiffs>(N);
1941 tests.cds_de_tests[i] = vector<SampleDiffs>(N);
1942 tests.diff_splicing_tests[i] = vector<SampleDiffs>(N);
1943 tests.diff_promoter_tests[i] = vector<SampleDiffs>(N);
1944 tests.diff_cds_tests[i] = vector<SampleDiffs>(N);
1945 }
1946
1947 final_est_run = true;
1948 p_bar = ProgressBar("Testing for differential expression and regulation in locus.", num_bundles);
1949
1950 test_launcher = boost::shared_ptr<TestLauncher>(new TestLauncher(bundle_factories.size(), contrasts, &tests, &tracking, &p_bar));
1951
1952 while (true)
1953 {
1954 //boost::shared_ptr<vector<boost::shared_ptr<SampleAbundances> > > abundances(new vector<boost::shared_ptr<SampleAbundances> >());
1955 quantitate_next_locus(rt, bundle_factories, test_launcher);
1956 bool more_loci_remain = false;
1957 BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1958 {
1959 if (rep_fac->bundles_remain())
1960 {
1961 more_loci_remain = true;
1962 break;
1963 }
1964 }
1965 if (!more_loci_remain)
1966 {
1967 // wait for the workers to finish up before doing the cross-sample testing.
1968 #if ENABLE_THREADS
1969 while(1)
1970 {
1971 locus_thread_pool_lock.lock();
1972 if (locus_curr_threads == 0)
1973 {
1974 locus_thread_pool_lock.unlock();
1975 break;
1976 }
1977
1978 locus_thread_pool_lock.unlock();
1979
1980 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1981
1982 }
1983 #endif
1984 break;
1985 }
1986 }
1987
1988 p_bar.complete();
1989
1990 //double FDR = 0.05;
1991 int total_iso_de_tests = 0;
1992
1993 vector<SampleDifference*> isoform_exp_diffs;
1994 fprintf(outfiles.isoform_de_outfile, "test_id\tgene_id\tgene\tlocus\tsample_1\tsample_2\tstatus\tvalue_1\tvalue_2\tlog2(fold_change)\ttest_stat\tp_value\tq_value\tsignificant\n");
1995
1996
1997 for (size_t i = 0; i < tests.isoform_de_tests.size(); ++i)
1998 {
1999 for (size_t j = 0; j < i; ++j)
2000 {
2001 total_iso_de_tests += tests.isoform_de_tests[i][j].size();
2002 extract_sample_diffs(tests.isoform_de_tests[i][j], isoform_exp_diffs);
2003 }
2004 }
2005
2006 int iso_exp_tests = fdr_significance(FDR, isoform_exp_diffs);
2007 fprintf(stderr, "Performed %d isoform-level transcription difference tests\n", iso_exp_tests);
2008
2009 for (size_t i = 0; i < tests.isoform_de_tests.size(); ++i)
2010 {
2011 for (size_t j = 0; j < i; ++j)
2012 {
2013 print_tests(outfiles.isoform_de_outfile, sample_labels[j].c_str(), sample_labels[i].c_str(), tests.isoform_de_tests[i][j]);
2014 }
2015 }
2016
2017
2018 int total_group_de_tests = 0;
2019 vector<SampleDifference*> tss_group_exp_diffs;
2020 fprintf(outfiles.group_de_outfile, "test_id\tgene_id\tgene\tlocus\tsample_1\tsample_2\tstatus\tvalue_1\tvalue_2\tlog2(fold_change)\ttest_stat\tp_value\tq_value\tsignificant\n");
2021
2022 for (size_t i = 0; i < tests.tss_group_de_tests.size(); ++i)
2023 {
2024 for (size_t j = 0; j < i; ++j)
2025 {
2026 extract_sample_diffs(tests.tss_group_de_tests[i][j], tss_group_exp_diffs);
2027 total_group_de_tests += tests.tss_group_de_tests[i][j].size();
2028 }
2029 }
2030
2031 int tss_group_exp_tests = fdr_significance(FDR, tss_group_exp_diffs);
2032 fprintf(stderr, "Performed %d tss-level transcription difference tests\n", tss_group_exp_tests);
2033
2034 for (size_t i = 0; i < tests.tss_group_de_tests.size(); ++i)
2035 {
2036 for (size_t j = 0; j < i; ++j)
2037 {
2038 print_tests(outfiles.group_de_outfile, sample_labels[j].c_str(), sample_labels[i].c_str(), tests.tss_group_de_tests[i][j]);
2039 }
2040 }
2041
2042
2043 int total_gene_de_tests = 0;
2044 vector<SampleDifference*> gene_exp_diffs;
2045 fprintf(outfiles.gene_de_outfile, "test_id\tgene_id\tgene\tlocus\tsample_1\tsample_2\tstatus\tvalue_1\tvalue_2\tlog2(fold_change)\ttest_stat\tp_value\tq_value\tsignificant\n");
2046
2047 for (size_t i = 0; i < tests.gene_de_tests.size(); ++i)
2048 {
2049 for (size_t j = 0; j < tests.gene_de_tests.size(); ++j)
2050 {
2051 total_gene_de_tests += tests.gene_de_tests[i][j].size();
2052 extract_sample_diffs(tests.gene_de_tests[i][j], gene_exp_diffs);
2053 }
2054 }
2055
2056 //fprintf(stderr, "***There are %lu difference records in gene_exp_diffs\n", gene_exp_diffs.size());
2057 int gene_exp_tests = fdr_significance(FDR, gene_exp_diffs);
2058 fprintf(stderr, "Performed %d gene-level transcription difference tests\n", gene_exp_tests);
2059
2060 for (size_t i = 0; i < tests.gene_de_tests.size(); ++i)
2061 {
2062 for (size_t j = 0; j < tests.gene_de_tests.size(); ++j)
2063 {
2064 print_tests(outfiles.gene_de_outfile, sample_labels[j].c_str(), sample_labels[i].c_str(), tests.gene_de_tests[i][j]);
2065 }
2066 }
2067
2068
2069 int total_cds_de_tests = 0;
2070 vector<SampleDifference*> cds_exp_diffs;
2071 fprintf(outfiles.cds_de_outfile, "test_id\tgene_id\tgene\tlocus\tsample_1\tsample_2\tstatus\tvalue_1\tvalue_2\tlog2(fold_change)\ttest_stat\tp_value\tq_value\tsignificant\n");
2072
2073
2074 for (size_t i = 0; i < tests.cds_de_tests.size(); ++i)
2075 {
2076 for (size_t j = 0; j < tests.cds_de_tests.size(); ++j)
2077 {
2078 total_cds_de_tests += tests.cds_de_tests[i][j].size();
2079 extract_sample_diffs(tests.cds_de_tests[i][j], cds_exp_diffs);
2080 }
2081 }
2082
2083
2084 int cds_exp_tests = fdr_significance(FDR, cds_exp_diffs);
2085 fprintf(stderr, "Performed %d CDS-level transcription difference tests\n", cds_exp_tests);
2086
2087 for (size_t i = 0; i < tests.cds_de_tests.size(); ++i)
2088 {
2089 for (size_t j = 0; j < tests.cds_de_tests.size(); ++j)
2090 {
2091 print_tests(outfiles.cds_de_outfile, sample_labels[j].c_str(), sample_labels[i].c_str(), tests.cds_de_tests[i][j]);
2092 }
2093 }
2094
2095
2096 int total_diff_splice_tests = 0;
2097 vector<SampleDifference*> splicing_diffs;
2098 fprintf(outfiles.diff_splicing_outfile, "test_id\tgene_id\tgene\tlocus\tsample_1\tsample_2\tstatus\tvalue_1\tvalue_2\tsqrt(JS)\ttest_stat\tp_value\tq_value\tsignificant\n");
2099
2100 for (size_t i = 0; i < tests.diff_splicing_tests.size(); ++i)
2101 {
2102 for (size_t j = 0; j < tests.diff_splicing_tests.size(); ++j)
2103 {
2104 total_diff_splice_tests += tests.diff_splicing_tests[i][j].size();
2105 extract_sample_diffs(tests.diff_splicing_tests[i][j], splicing_diffs);
2106 }
2107 }
2108
2109 int splicing_tests = fdr_significance(FDR, splicing_diffs);
2110 fprintf(stderr, "Performed %d splicing tests\n", splicing_tests);
2111
2112 for (size_t i = 0; i < tests.diff_splicing_tests.size(); ++i)
2113 {
2114 for (size_t j = 0; j < tests.diff_splicing_tests.size(); ++j)
2115 {
2116 const SampleDiffs& diffs = tests.diff_splicing_tests[i][j];
2117 print_tests(outfiles.diff_splicing_outfile, sample_labels[j].c_str(), sample_labels[i].c_str(), diffs);
2118 }
2119 }
2120
2121
2122 int total_diff_promoter_tests = 0;
2123 vector<SampleDifference*> promoter_diffs;
2124 fprintf(outfiles.diff_promoter_outfile, "test_id\tgene_id\tgene\tlocus\tsample_1\tsample_2\tstatus\tvalue_1\tvalue_2\tsqrt(JS)\ttest_stat\tp_value\tq_value\tsignificant\n");
2125
2126 for (size_t i = 1; i < tests.diff_splicing_tests.size(); ++i)
2127 {
2128 for (size_t j = 0; j < tests.diff_splicing_tests.size(); ++j)
2129 {
2130 total_diff_promoter_tests += tests.diff_promoter_tests[i][j].size();
2131 extract_sample_diffs(tests.diff_promoter_tests[i][j], promoter_diffs);
2132 }
2133 }
2134
2135
2136 int promoter_tests = fdr_significance(FDR, promoter_diffs);
2137 fprintf(stderr, "Performed %d promoter preference tests\n", promoter_tests);
2138
2139 for (size_t i = 1; i < tests.diff_promoter_tests.size(); ++i)
2140 {
2141 for (size_t j = 0; j < tests.diff_promoter_tests.size(); ++j)
2142 {
2143 print_tests(outfiles.diff_promoter_outfile, sample_labels[j].c_str(), sample_labels[i].c_str(), tests.diff_promoter_tests[i][j]);
2144 }
2145 }
2146
2147 int total_diff_cds_tests = 0;
2148 vector<SampleDifference*> cds_use_diffs;
2149 fprintf(outfiles.diff_cds_outfile, "test_id\tgene_id\tgene\tlocus\tsample_1\tsample_2\tstatus\tvalue_1\tvalue_2\tsqrt(JS)\ttest_stat\tp_value\tq_value\tsignificant\n");
2150
2151 for (size_t i = 1; i < tests.diff_cds_tests.size(); ++i)
2152 {
2153 for (size_t j = 0; j < tests.diff_cds_tests.size(); ++j)
2154 {
2155 extract_sample_diffs(tests.diff_cds_tests[i][j], cds_use_diffs);
2156 total_diff_cds_tests += tests.diff_cds_tests[i][j].size();
2157 }
2158 }
2159
2160
2161 int cds_use_tests = fdr_significance(FDR, cds_use_diffs);
2162 fprintf(stderr, "Performing %d relative CDS output tests\n", cds_use_tests);
2163
2164 for (size_t i = 1; i < tests.diff_cds_tests.size(); ++i)
2165 {
2166 for (size_t j = 0; j < tests.diff_cds_tests.size(); ++j)
2167 {
2168 print_tests(outfiles.diff_cds_outfile, sample_labels[j].c_str(), sample_labels[i].c_str(), tests.diff_cds_tests[i][j]);
2169 }
2170 }
2171
2172
2173 // FPKM tracking
2174
2175 FILE* fiso_fpkm_tracking = outfiles.isoform_fpkm_tracking_out;
2176 fprintf(stderr, "Writing isoform-level FPKM tracking\n");
2177 print_FPKM_tracking(fiso_fpkm_tracking,tracking.isoform_fpkm_tracking);
2178
2179 FILE* ftss_fpkm_tracking = outfiles.tss_group_fpkm_tracking_out;
2180 fprintf(stderr, "Writing TSS group-level FPKM tracking\n");
2181 print_FPKM_tracking(ftss_fpkm_tracking,tracking.tss_group_fpkm_tracking);
2182
2183 FILE* fgene_fpkm_tracking = outfiles.gene_fpkm_tracking_out;
2184 fprintf(stderr, "Writing gene-level FPKM tracking\n");
2185 print_FPKM_tracking(fgene_fpkm_tracking,tracking.gene_fpkm_tracking);
2186
2187 FILE* fcds_fpkm_tracking = outfiles.cds_fpkm_tracking_out;
2188 fprintf(stderr, "Writing CDS-level FPKM tracking\n");
2189 print_FPKM_tracking(fcds_fpkm_tracking,tracking.cds_fpkm_tracking);
2190
2191 // Count tracking
2192
2193 FILE* fiso_count_tracking = outfiles.isoform_count_tracking_out;
2194 fprintf(stderr, "Writing isoform-level count tracking\n");
2195 print_count_tracking(fiso_count_tracking,tracking.isoform_fpkm_tracking);
2196
2197 FILE* ftss_count_tracking = outfiles.tss_group_count_tracking_out;
2198 fprintf(stderr, "Writing TSS group-level count tracking\n");
2199 print_count_tracking(ftss_count_tracking,tracking.tss_group_fpkm_tracking);
2200
2201 FILE* fgene_count_tracking = outfiles.gene_count_tracking_out;
2202 fprintf(stderr, "Writing gene-level count tracking\n");
2203 print_count_tracking(fgene_count_tracking,tracking.gene_fpkm_tracking);
2204
2205 FILE* fcds_count_tracking = outfiles.cds_count_tracking_out;
2206 fprintf(stderr, "Writing CDS-level count tracking\n");
2207 print_count_tracking(fcds_count_tracking,tracking.cds_fpkm_tracking);
2208
2209 // Read group tracking
2210
2211 FILE* fiso_rep_tracking = outfiles.isoform_rep_tracking_out;
2212 fprintf(stderr, "Writing isoform-level read group tracking\n");
2213 print_read_group_tracking(fiso_rep_tracking,tracking.isoform_fpkm_tracking);
2214
2215 FILE* ftss_rep_tracking = outfiles.tss_group_rep_tracking_out;
2216 fprintf(stderr, "Writing TSS group-level read group tracking\n");
2217 print_read_group_tracking(ftss_rep_tracking,tracking.tss_group_fpkm_tracking);
2218
2219 FILE* fgene_rep_tracking = outfiles.gene_rep_tracking_out;
2220 fprintf(stderr, "Writing gene-level read group tracking\n");
2221 print_read_group_tracking(fgene_rep_tracking,tracking.gene_fpkm_tracking);
2222
2223 FILE* fcds_rep_tracking = outfiles.cds_rep_tracking_out;
2224 fprintf(stderr, "Writing CDS-level read group tracking\n");
2225 print_read_group_tracking(fcds_rep_tracking,tracking.cds_fpkm_tracking);
2226
2227 FILE* fread_group_info = outfiles.read_group_info_out;
2228 fprintf(stderr, "Writing read group info\n");
2229 print_read_group_info(fread_group_info,all_read_groups);
2230
2231 FILE* frun_info = outfiles.run_info_out;
2232 fprintf(stderr, "Writing run info\n");
2233 print_run_info(frun_info);
2234 }
2235
main(int argc,char ** argv)2236 int main(int argc, char** argv)
2237 {
2238 for (int i = 0; i < argc; ++i)
2239 {
2240 cmd_str += string(argv[i]) + " ";
2241 }
2242
2243 init_library_table();
2244 init_dispersion_method_table();
2245 init_lib_norm_method_table();
2246
2247 min_isoform_fraction = 1e-5;
2248
2249 int parse_ret = parse_options(argc,argv);
2250 if (parse_ret)
2251 return parse_ret;
2252
2253 if (!use_total_mass && !use_compat_mass)
2254 {
2255 use_total_mass = false;
2256 use_compat_mass = true;
2257 }
2258
2259 if(optind >= argc)
2260 {
2261 print_usage();
2262 return 1;
2263 }
2264
2265 if (!no_update_check)
2266 check_version(PACKAGE_VERSION);
2267
2268 string ref_gtf_filename = argv[optind++];
2269 vector<string> sam_hit_filenames;
2270
2271 if (use_sample_sheet)
2272 {
2273 if (optind < argc)
2274 {
2275
2276 string sample_sheet_filename = argv[optind++];
2277 FILE* sample_sheet_file = NULL;
2278 if (sample_sheet_filename != "")
2279 {
2280 sample_sheet_file = fopen(sample_sheet_filename.c_str(), "r");
2281 if (!sample_sheet_file)
2282 {
2283 fprintf(stderr, "Error: cannot open sample sheet file %s for reading\n",
2284 sample_sheet_filename.c_str());
2285 exit(1);
2286 }
2287 }
2288 parse_sample_sheet_file(sample_sheet_file, sample_labels, sam_hit_filenames);
2289 }
2290 else
2291 {
2292 fprintf(stderr, "Error: option --use-sample-sheet requires a single sample sheet filename instead of a list of SAM/BAM files\n");
2293 }
2294 }
2295 else
2296 {
2297 while(optind < argc)
2298 {
2299 string sam_hits_file_name = argv[optind++];
2300 sam_hit_filenames.push_back(sam_hits_file_name);
2301 }
2302
2303 if (sample_labels.size() == 0)
2304 {
2305 for (size_t i = 1; i < sam_hit_filenames.size() + 1; ++i)
2306 {
2307 char buf[256];
2308 sprintf(buf, "q%lu", i);
2309 sample_labels.push_back(buf);
2310 }
2311 }
2312 }
2313
2314 while (sam_hit_filenames.size() < 2)
2315 {
2316 fprintf(stderr, "Error: cuffdiff requires at least 2 SAM files\n");
2317 exit(1);
2318 }
2319
2320
2321 if (sam_hit_filenames.size() != sample_labels.size())
2322 {
2323 fprintf(stderr, "Error: number of labels must match number of conditions\n");
2324 exit(1);
2325 }
2326
2327 if (random_seed == -1)
2328 random_seed = boost::mt19937::default_seed;
2329
2330 // seed the random number generator - we'll need it for the importance
2331 // sampling during MAP estimation of the gammas
2332 srand48(random_seed);
2333
2334 FILE* ref_gtf = NULL;
2335 if (ref_gtf_filename != "")
2336 {
2337 ref_gtf = fopen(ref_gtf_filename.c_str(), "r");
2338 if (!ref_gtf)
2339 {
2340 fprintf(stderr, "Error: cannot open reference GTF file %s for reading\n",
2341 ref_gtf_filename.c_str());
2342 exit(1);
2343 }
2344 }
2345
2346 FILE* mask_gtf = NULL;
2347 if (mask_gtf_filename != "")
2348 {
2349 mask_gtf = fopen(mask_gtf_filename.c_str(), "r");
2350 if (!mask_gtf)
2351 {
2352 fprintf(stderr, "Error: cannot open mask GTF file %s for reading\n",
2353 mask_gtf_filename.c_str());
2354 exit(1);
2355 }
2356 }
2357
2358
2359 FILE* contrast_file = NULL;
2360 if (contrast_filename != "")
2361 {
2362 contrast_file = fopen(contrast_filename.c_str(), "r");
2363 if (!contrast_file)
2364 {
2365 fprintf(stderr, "Error: cannot open contrast file %s for reading\n",
2366 contrast_filename.c_str());
2367 exit(1);
2368 }
2369 }
2370
2371 FILE* norm_standards_file = NULL;
2372 if (norm_standards_filename != "")
2373 {
2374 norm_standards_file = fopen(norm_standards_filename.c_str(), "r");
2375 if (!norm_standards_file)
2376 {
2377 fprintf(stderr, "Error: cannot open contrast file %s for reading\n",
2378 norm_standards_filename.c_str());
2379 exit(1);
2380 }
2381 }
2382
2383
2384 // Note: we don't want the assembly filters interfering with calculations
2385 // here
2386
2387 pre_mrna_fraction = 0.0;
2388 olap_radius = 0;
2389
2390 Outfiles outfiles;
2391
2392 if (output_dir != "")
2393 {
2394 int retcode = mkpath(output_dir.c_str(), 0777);
2395 if (retcode == -1)
2396 {
2397 if (errno != EEXIST)
2398 {
2399 fprintf (stderr,
2400 "Error: cannot create directory %s\n",
2401 output_dir.c_str());
2402 exit(1);
2403 }
2404 }
2405 }
2406
2407 static const int filename_buf_size = 2048;
2408
2409 char out_file_prefix[filename_buf_size];
2410 sprintf(out_file_prefix, "%s/", output_dir.c_str());
2411 char iso_out_file_name[filename_buf_size];
2412 sprintf(iso_out_file_name, "%sisoform_exp.diff", out_file_prefix);
2413 FILE* iso_out = fopen(iso_out_file_name, "w");
2414 if (!iso_out)
2415 {
2416 fprintf(stderr, "Error: cannot open differential isoform transcription file %s for writing\n",
2417 iso_out_file_name);
2418 exit(1);
2419 }
2420
2421 char group_out_file_name[filename_buf_size];
2422 sprintf(group_out_file_name, "%stss_group_exp.diff", out_file_prefix);
2423 FILE* group_out = fopen(group_out_file_name, "w");
2424 if (!group_out)
2425 {
2426 fprintf(stderr, "Error: cannot open differential TSS group transcription file %s for writing\n",
2427 group_out_file_name);
2428 exit(1);
2429 }
2430
2431 char gene_out_file_name[filename_buf_size];
2432 sprintf(gene_out_file_name, "%sgene_exp.diff", out_file_prefix);
2433 FILE* gene_out = fopen(gene_out_file_name, "w");
2434 if (!group_out)
2435 {
2436 fprintf(stderr, "Error: cannot open gene expression file %s for writing\n",
2437 gene_out_file_name);
2438 exit(1);
2439 }
2440
2441 char cds_out_file_name[filename_buf_size];
2442 sprintf(cds_out_file_name, "%scds_exp.diff", out_file_prefix);
2443 FILE* cds_out = fopen(cds_out_file_name, "w");
2444 if (!cds_out)
2445 {
2446 fprintf(stderr, "Error: cannot open cds expression file %s for writing\n",
2447 cds_out_file_name);
2448 exit(1);
2449 }
2450
2451 char diff_splicing_out_file_name[filename_buf_size];
2452 sprintf(diff_splicing_out_file_name, "%ssplicing.diff", out_file_prefix);
2453 FILE* diff_splicing_out = fopen(diff_splicing_out_file_name, "w");
2454 if (!diff_splicing_out)
2455 {
2456 fprintf(stderr, "Error: cannot open differential splicing file %s for writing\n",
2457 diff_splicing_out_file_name);
2458 exit(1);
2459 }
2460
2461 char diff_promoter_out_file_name[filename_buf_size];
2462 sprintf(diff_promoter_out_file_name, "%spromoters.diff", out_file_prefix);
2463 FILE* diff_promoter_out = fopen(diff_promoter_out_file_name, "w");
2464 if (!diff_promoter_out)
2465 {
2466 fprintf(stderr, "Error: cannot open differential transcription start file %s for writing\n",
2467 diff_promoter_out_file_name);
2468 exit(1);
2469 }
2470
2471 char diff_cds_out_file_name[filename_buf_size];
2472 sprintf(diff_cds_out_file_name, "%scds.diff", out_file_prefix);
2473 FILE* diff_cds_out = fopen(diff_cds_out_file_name, "w");
2474 if (!diff_cds_out)
2475 {
2476 fprintf(stderr, "Error: cannot open differential relative CDS file %s for writing\n",
2477 diff_cds_out_file_name);
2478 exit(1);
2479 }
2480
2481 outfiles.isoform_de_outfile = iso_out;
2482 outfiles.group_de_outfile = group_out;
2483 outfiles.gene_de_outfile = gene_out;
2484 outfiles.cds_de_outfile = cds_out;
2485 outfiles.diff_splicing_outfile = diff_splicing_out;
2486 outfiles.diff_promoter_outfile = diff_promoter_out;
2487 outfiles.diff_cds_outfile = diff_cds_out;
2488
2489 char isoform_fpkm_tracking_name[filename_buf_size];
2490 sprintf(isoform_fpkm_tracking_name, "%s/isoforms.fpkm_tracking", output_dir.c_str());
2491 FILE* isoform_fpkm_out = fopen(isoform_fpkm_tracking_name, "w");
2492 if (!isoform_fpkm_out)
2493 {
2494 fprintf(stderr, "Error: cannot open isoform-level FPKM tracking file %s for writing\n",
2495 isoform_fpkm_tracking_name);
2496 exit(1);
2497 }
2498 outfiles.isoform_fpkm_tracking_out = isoform_fpkm_out;
2499
2500 char tss_group_fpkm_tracking_name[filename_buf_size];
2501 sprintf(tss_group_fpkm_tracking_name, "%s/tss_groups.fpkm_tracking", output_dir.c_str());
2502 FILE* tss_group_fpkm_out = fopen(tss_group_fpkm_tracking_name, "w");
2503 if (!tss_group_fpkm_out)
2504 {
2505 fprintf(stderr, "Error: cannot open TSS group-level FPKM tracking file %s for writing\n",
2506 tss_group_fpkm_tracking_name);
2507 exit(1);
2508 }
2509 outfiles.tss_group_fpkm_tracking_out = tss_group_fpkm_out;
2510
2511 char cds_fpkm_tracking_name[filename_buf_size];
2512 sprintf(cds_fpkm_tracking_name, "%s/cds.fpkm_tracking", output_dir.c_str());
2513 FILE* cds_fpkm_out = fopen(cds_fpkm_tracking_name, "w");
2514 if (!cds_fpkm_out)
2515 {
2516 fprintf(stderr, "Error: cannot open CDS level FPKM tracking file %s for writing\n",
2517 cds_fpkm_tracking_name);
2518 exit(1);
2519 }
2520 outfiles.cds_fpkm_tracking_out = cds_fpkm_out;
2521
2522 char gene_fpkm_tracking_name[filename_buf_size];
2523 sprintf(gene_fpkm_tracking_name, "%s/genes.fpkm_tracking", output_dir.c_str());
2524 FILE* gene_fpkm_out = fopen(gene_fpkm_tracking_name, "w");
2525 if (!gene_fpkm_out)
2526 {
2527 fprintf(stderr, "Error: cannot open gene-level FPKM tracking file %s for writing\n",
2528 gene_fpkm_tracking_name);
2529 exit(1);
2530 }
2531 outfiles.gene_fpkm_tracking_out = gene_fpkm_out;
2532
2533 char isoform_count_tracking_name[filename_buf_size];
2534 sprintf(isoform_count_tracking_name, "%s/isoforms.count_tracking", output_dir.c_str());
2535 FILE* isoform_count_out = fopen(isoform_count_tracking_name, "w");
2536 if (!isoform_count_out)
2537 {
2538 fprintf(stderr, "Error: cannot open isoform-level count tracking file %s for writing\n",
2539 isoform_count_tracking_name);
2540 exit(1);
2541 }
2542 outfiles.isoform_count_tracking_out = isoform_count_out;
2543
2544 char tss_group_count_tracking_name[filename_buf_size];
2545 sprintf(tss_group_count_tracking_name, "%s/tss_groups.count_tracking", output_dir.c_str());
2546 FILE* tss_group_count_out = fopen(tss_group_count_tracking_name, "w");
2547 if (!tss_group_count_out)
2548 {
2549 fprintf(stderr, "Error: cannot open TSS group-level count tracking file %s for writing\n",
2550 tss_group_count_tracking_name);
2551 exit(1);
2552 }
2553 outfiles.tss_group_count_tracking_out = tss_group_count_out;
2554
2555 char cds_count_tracking_name[filename_buf_size];
2556 sprintf(cds_count_tracking_name, "%s/cds.count_tracking", output_dir.c_str());
2557 FILE* cds_count_out = fopen(cds_count_tracking_name, "w");
2558 if (!cds_count_out)
2559 {
2560 fprintf(stderr, "Error: cannot open CDS level count tracking file %s for writing\n",
2561 cds_count_tracking_name);
2562 exit(1);
2563 }
2564 outfiles.cds_count_tracking_out = cds_count_out;
2565
2566 char gene_count_tracking_name[filename_buf_size];
2567 sprintf(gene_count_tracking_name, "%s/genes.count_tracking", output_dir.c_str());
2568 FILE* gene_count_out = fopen(gene_count_tracking_name, "w");
2569 if (!gene_count_out)
2570 {
2571 fprintf(stderr, "Error: cannot open gene-level count tracking file %s for writing\n",
2572 gene_count_tracking_name);
2573 exit(1);
2574 }
2575 outfiles.gene_count_tracking_out = gene_count_out;
2576
2577 char isoform_rep_tracking_name[filename_buf_size];
2578 sprintf(isoform_rep_tracking_name, "%s/isoforms.read_group_tracking", output_dir.c_str());
2579 FILE* isoform_rep_out = fopen(isoform_rep_tracking_name, "w");
2580 if (!isoform_rep_out)
2581 {
2582 fprintf(stderr, "Error: cannot open isoform-level read group tracking file %s for writing\n",
2583 isoform_rep_tracking_name);
2584 exit(1);
2585 }
2586 outfiles.isoform_rep_tracking_out = isoform_rep_out;
2587
2588 char tss_group_rep_tracking_name[filename_buf_size];
2589 sprintf(tss_group_rep_tracking_name, "%s/tss_groups.read_group_tracking", output_dir.c_str());
2590 FILE* tss_group_rep_out = fopen(tss_group_rep_tracking_name, "w");
2591 if (!tss_group_rep_out)
2592 {
2593 fprintf(stderr, "Error: cannot open TSS group-level read group tracking file %s for writing\n",
2594 tss_group_rep_tracking_name);
2595 exit(1);
2596 }
2597 outfiles.tss_group_rep_tracking_out = tss_group_rep_out;
2598
2599 char cds_rep_tracking_name[filename_buf_size];
2600 sprintf(cds_rep_tracking_name, "%s/cds.read_group_tracking", output_dir.c_str());
2601 FILE* cds_rep_out = fopen(cds_rep_tracking_name, "w");
2602 if (!cds_rep_out)
2603 {
2604 fprintf(stderr, "Error: cannot open CDS level read group tracking file %s for writing\n",
2605 cds_rep_tracking_name);
2606 exit(1);
2607 }
2608 outfiles.cds_rep_tracking_out = cds_rep_out;
2609
2610 char gene_rep_tracking_name[filename_buf_size];
2611 sprintf(gene_rep_tracking_name, "%s/genes.read_group_tracking", output_dir.c_str());
2612 FILE* gene_rep_out = fopen(gene_rep_tracking_name, "w");
2613 if (!gene_rep_out)
2614 {
2615 fprintf(stderr, "Error: cannot open gene-level read group tracking file %s for writing\n",
2616 gene_rep_tracking_name);
2617 exit(1);
2618 }
2619 outfiles.gene_rep_tracking_out = gene_rep_out;
2620
2621 char read_group_info_name[filename_buf_size];
2622 sprintf(read_group_info_name, "%s/read_groups.info", output_dir.c_str());
2623 FILE* read_group_out = fopen(read_group_info_name, "w");
2624 if (!read_group_out)
2625 {
2626 fprintf(stderr, "Error: cannot open read group info file %s for writing\n",
2627 read_group_info_name);
2628 exit(1);
2629 }
2630 outfiles.read_group_info_out = read_group_out;
2631
2632 char run_info_name[filename_buf_size];
2633 sprintf(run_info_name, "%s/run.info", output_dir.c_str());
2634 FILE* run_info_out = fopen(run_info_name, "w");
2635 if (!run_info_out)
2636 {
2637 fprintf(stderr, "Error: cannot open run info file %s for writing\n",
2638 run_info_name);
2639 exit(1);
2640 }
2641 outfiles.run_info_out = run_info_out;
2642
2643 char bias_name[filename_buf_size];
2644 sprintf(bias_name, "%s/bias_params.info", output_dir.c_str());
2645 FILE* bias_out = fopen(bias_name, "w");
2646 if (!bias_out)
2647 {
2648 fprintf(stderr, "Error: cannot open run info file %s for writing\n",
2649 bias_name);
2650 exit(1);
2651 }
2652 outfiles.bias_out = bias_out;
2653
2654 char var_model_name[filename_buf_size];
2655 sprintf(var_model_name, "%s/var_model.info", output_dir.c_str());
2656 FILE* var_model_out = fopen(var_model_name, "w");
2657 if (!var_model_out)
2658 {
2659 fprintf(stderr, "Error: cannot open run info file %s for writing\n",
2660 var_model_name);
2661 exit(1);
2662 }
2663 outfiles.var_model_out = var_model_out;
2664
2665
2666 driver(ref_gtf, mask_gtf, contrast_file, norm_standards_file, sam_hit_filenames, outfiles);
2667
2668 #if 0
2669 if (emit_count_tables)
2670 {
2671 dump_locus_variance_info(output_dir + string("/locus_var.txt"));
2672 }
2673 #endif
2674
2675 return 0;
2676 }
2677