1 /*
2  *  cuffdiff.cpp
3  *  cufflinks
4  *
5  *  Created by Cole Trapnell on 10/21/09.
6  *  Copyright 2009 Cole Trapnell. All rights reserved.
7  *
8  */
9 
10 #include <stdlib.h>
11 #include <getopt.h>
12 #include <string>
13 #include <numeric>
14 #include <cfloat>
15 #include <iostream>
16 #include <fstream>
17 
18 #include "common.h"
19 #include "hits.h"
20 #include "bundles.h"
21 #include "abundances.h"
22 #include "tokenize.h"
23 #include "biascorrection.h"
24 #include "update_check.h"
25 
26 #include <boost/thread.hpp>
27 #include <boost/version.hpp>
28 #include <boost/graph/adjacency_list.hpp>
29 #include <boost/graph/graph_traits.hpp>
30 #include <boost/numeric/ublas/matrix.hpp>
31 #include <boost/numeric/ublas/matrix_proxy.hpp>
32 #include <boost/numeric/ublas/vector.hpp>
33 #include <boost/numeric/ublas/vector_proxy.hpp>
34 #include <boost/numeric/ublas/io.hpp>
35 #include <boost/algorithm/string.hpp>
36 
37 #include "replicates.h"
38 #include "tracking.h"
39 
40 // Need at least this many reads in a locus to do any testing on it
41 
42 vector<string> sample_labels;
43 
44 using namespace std;
45 using namespace boost;
46 
47 // We leave out the short codes for options that don't take an argument
48 #if ENABLE_THREADS
49 const char *short_options = "m:p:s:c:I:j:L:M:o:b:TNqvuF:C:";
50 #else
51 const char *short_options = "m:s:c:I:j:L:M:o:b:TNqvuF:C:";
52 #endif
53 
54 
55 
56 static struct option long_options[] = {
57 {"frag-len-mean",			required_argument,       0,          'm'},
58 {"frag-len-std-dev",        required_argument,       0,          's'},
59 {"seed",                    required_argument,		 0,			 OPT_RANDOM_SEED},
60 {"mask-file",               required_argument,		 0,			 'M'},
61 {"output-dir",			    required_argument,		 0,			 'o'},
62 {"verbose",			    	no_argument,			 0,			 'v'},
63 {"quiet",			    	no_argument,			 0,			 'q'},
64 {"frag-bias-correct",       required_argument,		 0,			 'b'},
65 {"multi-read-correct",      no_argument,			 0,			 'u'},
66 #if ENABLE_THREADS
67 {"num-threads",				required_argument,       0,          'p'},
68 #endif
69 {"library-type",		    required_argument,		 0,			 OPT_LIBRARY_TYPE},
70 {"seed",                    required_argument,		 0,			 OPT_RANDOM_SEED},
71 {"no-collapse-cond-prob",   no_argument,             0,			 OPT_COLLAPSE_COND_PROB},
72 {"max-mle-iterations",		required_argument,		 0,			 OPT_MLE_MAX_ITER},
73 {"min-mle-accuracy",		required_argument,		 0,			 OPT_MLE_MIN_ACC},
74 {"bias-mode",               required_argument,		 0,			 OPT_BIAS_MODE},
75 {"no-update-check",         no_argument,             0,          OPT_NO_UPDATE_CHECK},
76 
77 // Some options for testing different stats policies
78 {"max-bundle-frags",        required_argument,       0,          OPT_MAX_FRAGS_PER_BUNDLE},
79 {"max-frag-multihits",      required_argument,       0,          OPT_FRAG_MAX_MULTIHITS},
80 {"no-effective-length-correction",  no_argument,     0,          OPT_NO_EFFECTIVE_LENGTH_CORRECTION},
81 {"no-length-correction",    no_argument,             0,          OPT_NO_LENGTH_CORRECTION},
82 {0, 0, 0, 0} // terminator
83 };
84 
print_usage()85 void print_usage()
86 {
87 	fprintf(stderr, "cuffquant v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
88 	fprintf(stderr, "-----------------------------\n");
89 
90 	//NOTE: SPACES ONLY, bozo
91     fprintf(stderr, "Usage:   cuffquant [options] <transcripts.gtf> <sample1_hits.sam> <sample2_hits.sam> [... sampleN_hits.sam]\n");
92 	fprintf(stderr, "   Supply replicate SAMs as comma separated lists for each condition: sample1_rep1.sam,sample1_rep2.sam,...sample1_repM.sam\n");
93     fprintf(stderr, "General Options:\n");
94     fprintf(stderr, "  -o/--output-dir              write all output files to this directory              [ default:     ./ ]\n");
95 	fprintf(stderr, "  -M/--mask-file               ignore all alignment within transcripts in this file  [ default:   NULL ]\n");
96     //fprintf(stderr, "  --norm-standards-file        Housekeeping/spike genes to normalize libraries       [ default:   NULL ]\n"); // NOT YET DOCUMENTED, keep secret for now
97     fprintf(stderr, "  -b/--frag-bias-correct       use bias correction - reference fasta required        [ default:   NULL ]\n");
98     fprintf(stderr, "  -u/--multi-read-correct      use 'rescue method' for multi-reads                   [ default:  FALSE ]\n");
99 #if ENABLE_THREADS
100 	fprintf(stderr, "  -p/--num-threads             number of threads used during quantification          [ default:      1 ]\n");
101 #endif
102     fprintf(stderr, "  --library-type               Library prep used for input reads                     [ default:  below ]\n");
103 
104     fprintf(stderr, "\nAdvanced Options:\n");
105     fprintf(stderr, "  -m/--frag-len-mean           average fragment length (unpaired reads only)         [ default:    200 ]\n");
106     fprintf(stderr, "  -s/--frag-len-std-dev        fragment length std deviation (unpaired reads only)   [ default:     80 ]\n");
107     fprintf(stderr, "  -c/--min-alignment-count     minimum number of alignments in a locus for testing   [ default:   10 ]\n");
108     fprintf(stderr, "  --max-mle-iterations         maximum iterations allowed for MLE calculation        [ default:   5000 ]\n");
109     fprintf(stderr, "  -v/--verbose                 log-friendly verbose processing (no progress bar)     [ default:  FALSE ]\n");
110 	fprintf(stderr, "  -q/--quiet                   log-friendly quiet processing (no progress bar)       [ default:  FALSE ]\n");
111     fprintf(stderr, "  --seed                       value of random number generator seed                 [ default:      0 ]\n");
112     fprintf(stderr, "  --no-update-check            do not contact server to check for update availability[ default:  FALSE ]\n");
113     fprintf(stderr, "  --max-bundle-frags           maximum fragments allowed in a bundle before skipping [ default: 500000 ]\n");
114     fprintf(stderr, "  --max-frag-multihits         Maximum number of alignments allowed per fragment     [ default: unlim  ]\n");
115     fprintf(stderr, "  --no-effective-length-correction   No effective length correction                  [ default:  FALSE ]\n");
116     fprintf(stderr, "  --no-length-correction       No length correction                                  [ default:  FALSE ]\n");
117 
118     fprintf(stderr, "\nDebugging use only:\n");
119     fprintf(stderr, "  --read-skip-fraction         Skip a random subset of reads this size               [ default:    0.0 ]\n");
120     fprintf(stderr, "  --no-read-pairs              Break all read pairs                                  [ default:  FALSE ]\n");
121     fprintf(stderr, "  --trim-read-length           Trim reads to be this long (keep 5' end)              [ default:   none ]\n");
122     fprintf(stderr, "  --no-scv-correction          Disable SCV correction                                [ default:  FALSE ]\n");
123     print_library_table();
124 }
125 
parse_options(int argc,char ** argv)126 int parse_options(int argc, char** argv)
127 {
128     int option_index = 0;
129     int next_option;
130     string sample_label_list;
131     string dispersion_method_str;
132     string lib_norm_method_str;
133     do {
134         next_option = getopt_long_only(argc, argv, short_options, long_options, &option_index);
135         if (next_option == -1)     /* Done with options. */
136             break;
137         switch (next_option) {
138             case 0:
139                 /* If this option set a flag, do nothing else now. */
140                 if (long_options[option_index].flag != 0)
141                     break;
142                 break;
143 
144 			case 'm':
145 				user_provided_fld = true;
146 				def_frag_len_mean = (uint32_t)parseInt(0, "-m/--frag-len-mean arg must be at least 0", print_usage);
147 				break;
148 			case 's':
149 				user_provided_fld = true;
150 				def_frag_len_std_dev = (uint32_t)parseInt(0, "-s/--frag-len-std-dev arg must be at least 0", print_usage);
151 				break;
152 			case 'p':
153 				num_threads = (uint32_t)parseInt(1, "-p/--num-threads arg must be at least 1", print_usage);
154 				break;
155             case 'L':
156 				sample_label_list = optarg;
157 				break;
158 			case OPT_NUM_IMP_SAMPLES:
159 				num_importance_samples = parseInt(1, "--num-importance-samples must be at least 1", print_usage);
160 				break;
161 			case OPT_MLE_MAX_ITER:
162 				max_mle_iterations = parseInt(1, "--max-mle-iterations must be at least 1", print_usage);
163 				break;
164 			case OPT_BIAS_MODE:
165 				if (!strcmp(optarg, "site"))
166 					bias_mode = SITE;
167 				else if (!strcmp(optarg, "pos"))
168 					bias_mode = POS;
169 				else if (!strcmp(optarg, "pos_vlmm"))
170 					bias_mode = POS_VLMM;
171 				else if (!strcmp(optarg, "vlmm"))
172 					bias_mode = VLMM;
173                 else if (!strcmp(optarg, "pos_site"))
174 					bias_mode = POS_SITE;
175 				else
176 				{
177 					fprintf(stderr, "Unknown bias mode.\n");
178 					exit(1);
179 				}
180 				break;
181 			case 'M':
182 			{
183 				mask_gtf_filename = optarg;
184 				break;
185 			}
186 			case OPT_NORM_STANDARDS_FILE:
187 			{
188 				norm_standards_filename = optarg;
189 				break;
190 			}
191             case 'v':
192 			{
193 				if (cuff_quiet)
194 				{
195 					fprintf(stderr, "Warning: Can't be both verbose and quiet!  Setting verbose only.\n");
196 				}
197 				cuff_quiet = false;
198 				cuff_verbose = true;
199 				break;
200 			}
201 			case 'q':
202 			{
203 				if (cuff_verbose)
204 				{
205 					fprintf(stderr, "Warning: Can't be both verbose and quiet!  Setting quiet only.\n");
206 				}
207 				cuff_verbose = false;
208 				cuff_quiet = true;
209 				break;
210 			}
211             case 'o':
212 			{
213 				output_dir = optarg;
214 				break;
215 			}
216 			case 'b':
217 			{
218 				fasta_dir = optarg;
219 				corr_bias = true;
220 				break;
221             }
222 
223             case 'u':
224             {
225                 corr_multi = true;
226                 break;
227             }
228             case OPT_LIBRARY_TYPE:
229 			{
230 				library_type = optarg;
231 				break;
232 			}
233             case OPT_NO_UPDATE_CHECK:
234             {
235                 no_update_check = true;
236                 break;
237             }
238             case OPT_RANDOM_SEED:
239             {
240                 random_seed = parseInt(0, "--seed must be at least 0", print_usage);
241                 break;
242             }
243             case OPT_COLLAPSE_COND_PROB:
244             {
245                 cond_prob_collapse = false;
246                 break;
247             }
248             case OPT_USE_COMPAT_MASS:
249             {
250                 use_compat_mass = true;
251                 break;
252             }
253             case OPT_USE_TOTAL_MASS:
254             {
255                 use_total_mass = true;
256                 break;
257             }
258             case OPT_MAX_FRAGS_PER_BUNDLE:
259             {
260                 max_frags_per_bundle = parseInt(0, "--max-bundle-frags must be at least 0", print_usage);
261                 break;
262             }
263             case OPT_READ_SKIP_FRACTION:
264             {
265                 read_skip_fraction = parseFloat(0, 1.0, "--read-skip-fraction must be between 0 and 1.0", print_usage);
266                 break;
267             }
268             case OPT_NO_READ_PAIRS:
269             {
270                 no_read_pairs = true;
271                 break;
272             }
273             case OPT_TRIM_READ_LENGTH:
274             {
275                 trim_read_length = parseInt(0, "--trim-read-length must be at least 1", print_usage);
276                 break;
277             }
278             case OPT_FRAG_MAX_MULTIHITS:
279             {
280                 max_frag_multihits = parseInt(1, "--max-frag-multihits must be at least 1", print_usage);
281                 break;
282             }
283             case OPT_NO_EFFECTIVE_LENGTH_CORRECTION:
284             {
285                 no_effective_length_correction = true;
286                 break;
287             }
288             case OPT_NO_LENGTH_CORRECTION:
289             {
290                 no_length_correction = true;
291                 break;
292             }
293             case OPT_LIB_NORM_METHOD:
294 			{
295 				lib_norm_method_str = optarg;
296 				break;
297 			}
298 
299 			default:
300 				print_usage();
301 				return 1;
302         }
303     } while(next_option != -1);
304 
305 	if (library_type != "")
306     {
307         map<string, ReadGroupProperties>::iterator lib_itr =
308 		library_type_table.find(library_type);
309         if (lib_itr == library_type_table.end())
310         {
311             fprintf(stderr, "Error: Library type %s not supported\n", library_type.c_str());
312             exit(1);
313         }
314         else
315         {
316             if (library_type == "transfrags")
317             {
318                 allow_junk_filtering = false;
319             }
320             global_read_properties = &lib_itr->second;
321         }
322     }
323     else
324     {
325 
326     }
327 
328     // Set the count dispersion method to use
329     if (dispersion_method_str == "")
330     {
331         dispersion_method_str = default_dispersion_method;
332     }
333 
334     map<string, DispersionMethod>::iterator disp_itr =
335     dispersion_method_table.find(dispersion_method_str);
336     if (disp_itr == dispersion_method_table.end())
337     {
338         fprintf(stderr, "Error: Dispersion method %s not supported\n", dispersion_method_str.c_str());
339         exit(1);
340     }
341     else
342     {
343         dispersion_method = disp_itr->second;
344     }
345 
346     // Set the library size normalization method to use
347     if (lib_norm_method_str == "")
348     {
349         lib_norm_method_str = default_lib_norm_method;
350     }
351 
352     map<string, LibNormalizationMethod>::iterator lib_norm_itr =
353     lib_norm_method_table.find(lib_norm_method_str);
354     if (lib_norm_itr == lib_norm_method_table.end())
355     {
356         fprintf(stderr, "Error: Normalization method %s not supported\n", lib_norm_method_str.c_str());
357         exit(1);
358     }
359     else
360     {
361         lib_norm_method = lib_norm_itr->second;
362     }
363 
364 
365 
366     if (use_total_mass && use_compat_mass)
367     {
368         fprintf (stderr, "Error: please supply only one of --compatibile-hits-norm and --total-hits-norm\n");
369         exit(1);
370     }
371 
372     tokenize(sample_label_list, ",", sample_labels);
373 
374 	allow_junk_filtering = false;
375 
376 	return 0;
377 }
378 
379 struct Outfiles
380 {
381 	FILE* isoform_fpkm_tracking_out;
382 	FILE* tss_group_fpkm_tracking_out;
383 	FILE* gene_fpkm_tracking_out;
384 	FILE* cds_fpkm_tracking_out;
385 
386     FILE* isoform_count_tracking_out;
387 	FILE* tss_group_count_tracking_out;
388 	FILE* gene_count_tracking_out;
389 	FILE* cds_count_tracking_out;
390 
391     FILE* run_info_out;
392     FILE* read_group_info_out;
393     FILE* bias_out;
394     FILE* var_model_out;
395 };
396 
print_FPKM_tracking(FILE * fout,const FPKMTrackingTable & tracking)397 void print_FPKM_tracking(FILE* fout,
398 						 const FPKMTrackingTable& tracking)
399 {
400 	fprintf(fout,"tracking_id\tclass_code\tnearest_ref_id\tgene_id\tgene_short_name\ttss_id\tlocus\tlength\tcoverage");
401 	FPKMTrackingTable::const_iterator first_itr = tracking.begin();
402 	if (first_itr != tracking.end())
403 	{
404 		const FPKMTracking& track = first_itr->second;
405 		const vector<FPKMContext>& fpkms = track.fpkm_series;
406 		for (size_t i = 0; i < fpkms.size(); ++i)
407 		{
408 			fprintf(fout, "\t%s_FPKM\t%s_conf_lo\t%s_conf_hi\t%s_status", sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str());
409 		}
410 	}
411 	fprintf(fout, "\n");
412 	for (FPKMTrackingTable::const_iterator itr = tracking.begin(); itr != tracking.end(); ++itr)
413 	{
414 		const string& description = itr->first;
415 		const FPKMTracking& track = itr->second;
416 		const vector<FPKMContext>& fpkms = track.fpkm_series;
417 
418         AbundanceStatus status = NUMERIC_OK;
419         BOOST_FOREACH (const FPKMContext& c, fpkms)
420         {
421             if (c.status == NUMERIC_FAIL)
422                 status = NUMERIC_FAIL;
423         }
424 
425         string all_gene_ids = cat_strings(track.gene_ids);
426 		if (all_gene_ids == "")
427 			all_gene_ids = "-";
428 
429 		string all_gene_names = cat_strings(track.gene_names);
430 		if (all_gene_names == "")
431 			all_gene_names = "-";
432 
433 		string all_tss_ids = cat_strings(track.tss_ids);
434 		if (all_tss_ids == "")
435 			all_tss_ids = "-";
436 
437         char length_buff[33] = "-";
438         if (track.length)
439             sprintf(length_buff, "%d", track.length);
440 
441         fprintf(fout, "%s\t%c\t%s\t%s\t%s\t%s\t%s\t%s\t%s",
442                 description.c_str(),
443                 track.classcode ? track.classcode : '-',
444                 track.ref_match.c_str(),
445                 all_gene_ids.c_str(),
446                 all_gene_names.c_str(),
447                 all_tss_ids.c_str(),
448                 track.locus_tag.c_str(),
449                 length_buff,
450                 "-");
451 
452 		for (size_t i = 0; i < fpkms.size(); ++i)
453 		{
454 			double fpkm = fpkms[i].FPKM;
455 			//double std_dev = sqrt(fpkms[i].FPKM_variance);
456 			double fpkm_conf_hi = fpkms[i].FPKM_conf_hi;
457 			double fpkm_conf_lo = fpkms[i].FPKM_conf_lo;
458             const char* status_str = "OK";
459 
460             if (fpkms[i].status == NUMERIC_OK)
461             {
462                 status_str = "OK";
463             }
464             else if (fpkms[i].status == NUMERIC_FAIL)
465             {
466                 status_str = "FAIL";
467             }
468             else if (fpkms[i].status == NUMERIC_LOW_DATA)
469             {
470                 status_str = "LOWDATA";
471             }
472             else if (fpkms[i].status == NUMERIC_HI_DATA)
473             {
474                 status_str = "HIDATA";
475             }
476             else
477             {
478                 assert(false);
479             }
480 
481 			fprintf(fout, "\t%lg\t%lg\t%lg\t%s", fpkm, fpkm_conf_lo, fpkm_conf_hi, status_str);
482 		}
483 
484 		fprintf(fout, "\n");
485 	}
486 }
487 
print_count_tracking(FILE * fout,const FPKMTrackingTable & tracking)488 void print_count_tracking(FILE* fout,
489 						  const FPKMTrackingTable& tracking)
490 {
491 	fprintf(fout,"tracking_id");
492 	FPKMTrackingTable::const_iterator first_itr = tracking.begin();
493 	if (first_itr != tracking.end())
494 	{
495 		const FPKMTracking& track = first_itr->second;
496 		const vector<FPKMContext>& fpkms = track.fpkm_series;
497 		for (size_t i = 0; i < fpkms.size(); ++i)
498 		{
499 			fprintf(fout, "\t%s_count\t%s_count_variance\t%s_count_uncertainty_var\t%s_count_dispersion_var\t%s_status", sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str(), sample_labels[i].c_str());
500 		}
501 	}
502 	fprintf(fout, "\n");
503 	for (FPKMTrackingTable::const_iterator itr = tracking.begin(); itr != tracking.end(); ++itr)
504 	{
505 		const string& description = itr->first;
506 		const FPKMTracking& track = itr->second;
507 		const vector<FPKMContext>& fpkms = track.fpkm_series;
508 
509         AbundanceStatus status = NUMERIC_OK;
510         BOOST_FOREACH (const FPKMContext& c, fpkms)
511         {
512             if (c.status == NUMERIC_FAIL)
513                 status = NUMERIC_FAIL;
514         }
515 
516         fprintf(fout, "%s",
517                 description.c_str());
518 
519 		for (size_t i = 0; i < fpkms.size(); ++i)
520 		{
521             const char* status_str = "OK";
522 
523             if (fpkms[i].status == NUMERIC_OK)
524             {
525                 status_str = "OK";
526             }
527             else if (fpkms[i].status == NUMERIC_FAIL)
528             {
529                 status_str = "FAIL";
530             }
531             else if (fpkms[i].status == NUMERIC_LOW_DATA)
532             {
533                 status_str = "LOWDATA";
534             }
535             else if (fpkms[i].status == NUMERIC_HI_DATA)
536             {
537                 status_str = "HIDATA";
538             }
539             else
540             {
541                 assert(false);
542             }
543 
544             double external_counts = fpkms[i].count_mean;
545             double external_count_var = fpkms[i].count_var;
546             double uncertainty_var = fpkms[i].count_uncertainty_var;
547             double dispersion_var = fpkms[i].count_dispersion_var;
548 			fprintf(fout, "\t%lg\t%lg\t%lg\t%lg\t%s", external_counts, external_count_var, uncertainty_var, dispersion_var, status_str);
549 		}
550 
551 		fprintf(fout, "\n");
552 	}
553 }
554 
print_run_info(FILE * fout)555 void print_run_info(FILE* fout)
556 {
557     fprintf(fout, "param\tvalue\n");
558     fprintf(fout, "cmd_line\t%s\n", cmd_str.c_str());
559     fprintf(fout, "version\t%s\n", PACKAGE_VERSION);
560     fprintf(fout, "SVN_revision\t%s\n",SVN_REVISION);
561     fprintf(fout, "boost_version\t%d\n", BOOST_VERSION);
562 }
563 
564 
565 #if ENABLE_THREADS
566 boost::mutex inspect_lock;
567 
568 boost::mutex _recorder_lock;
569 boost::mutex locus_thread_pool_lock;
570 int locus_curr_threads = 0;
571 int locus_num_threads = 0;
572 
decr_pool_count()573 void decr_pool_count()
574 {
575 	locus_thread_pool_lock.lock();
576 	locus_curr_threads--;
577 	locus_thread_pool_lock.unlock();
578 }
579 
580 #endif
581 
582 
583 
inspect_map_worker(ReplicatedBundleFactory & fac,int & tmp_min_frag_len,int & tmp_max_frag_len,IdToLocusMap & id_to_locus_map)584 void inspect_map_worker(ReplicatedBundleFactory& fac,
585                         int& tmp_min_frag_len,
586                         int& tmp_max_frag_len,
587                         IdToLocusMap& id_to_locus_map)
588 {
589 #if ENABLE_THREADS
590 	boost::this_thread::at_thread_exit(decr_pool_count);
591 #endif
592 
593     int min_f = std::numeric_limits<int>::max();
594     int max_f = 0;
595 
596     fac.inspect_replicate_maps(min_f, max_f, id_to_locus_map);
597 
598 #if ENABLE_THREADS
599     inspect_lock.lock();
600 #endif
601     tmp_min_frag_len = min(min_f, tmp_min_frag_len);
602     tmp_max_frag_len = max(max_f, tmp_max_frag_len);
603 #if ENABLE_THREADS
604     inspect_lock.unlock();
605 #endif
606 }
607 
learn_bias_worker(boost::shared_ptr<BundleFactory> fac)608 void learn_bias_worker(boost::shared_ptr<BundleFactory> fac)
609 {
610 #if ENABLE_THREADS
611 	boost::this_thread::at_thread_exit(decr_pool_count);
612 #endif
613 	boost::shared_ptr<ReadGroupProperties> rg_props = fac->read_group_properties();
614 	BiasLearner* bl = new BiasLearner(rg_props->frag_len_dist());
615 	learn_bias(*fac, *bl, false);
616 	rg_props->bias_learner(boost::shared_ptr<BiasLearner>(bl));
617 }
618 
619 typedef map<int, vector<AbundanceGroup> > light_ab_group_tracking_table;
620 
621 // Similiar to TestLauncher, except this class records tracking data when abundance groups report in
622 struct AbundanceRecorder
623 {
624 private:
AbundanceRecorderAbundanceRecorder625     AbundanceRecorder(AbundanceRecorder& rhs) {}
626 
627 public:
AbundanceRecorderAbundanceRecorder628     AbundanceRecorder(int num_samples,
629                       Tracking* tracking,
630                       ProgressBar* p_bar)
631         :
632         _orig_workers(num_samples),
633         _tracking(tracking),
634         _p_bar(p_bar)
635         {
636         }
637 
638     void operator()();
639 
640     void register_locus(int locus_id);
641     void abundance_avail(int locus_id,
642                          boost::shared_ptr<SampleAbundances> ab,
643                          size_t factory_id);
644     void record_finished_loci();
645     void record_tracking_data(int locus_id, vector<boost::shared_ptr<SampleAbundances> >& abundances);
646     bool all_samples_reported_in(vector<boost::shared_ptr<SampleAbundances> >& abundances);
647     bool all_samples_reported_in(int locus_id);
648 
clear_tracking_dataAbundanceRecorder649     void clear_tracking_data() { _tracking->clear(); }
650 
651     typedef list<pair<int, vector<boost::shared_ptr<SampleAbundances> > > > recorder_sample_table;
652 
get_sample_tableAbundanceRecorder653     const light_ab_group_tracking_table& get_sample_table() const { return _ab_group_tracking_table; }
654 
655 private:
656 
657     recorder_sample_table::iterator find_locus(int locus_id);
658 
659     int _orig_workers;
660 
661     recorder_sample_table _samples;
662 
663     Tracking* _tracking;
664     ProgressBar* _p_bar;
665 
666     light_ab_group_tracking_table _ab_group_tracking_table;
667 };
668 
669 
find_locus(int locus_id)670 AbundanceRecorder::recorder_sample_table::iterator AbundanceRecorder::find_locus(int locus_id)
671 {
672     recorder_sample_table::iterator itr = _samples.begin();
673     for(; itr != _samples.end(); ++itr)
674     {
675         if (itr->first == locus_id)
676             return itr;
677     }
678     return _samples.end();
679 }
680 
register_locus(int locus_id)681 void AbundanceRecorder::register_locus(int locus_id)
682 {
683 #if ENABLE_THREADS
684 	boost::mutex::scoped_lock lock(_recorder_lock);
685 #endif
686 
687     recorder_sample_table::iterator itr = find_locus(locus_id);
688     if (itr == _samples.end())
689     {
690         pair<recorder_sample_table::iterator, bool> p;
691         vector<boost::shared_ptr<SampleAbundances> >abs(_orig_workers);
692         _samples.push_back(make_pair(locus_id, abs));
693     }
694 }
695 
abundance_avail(int locus_id,boost::shared_ptr<SampleAbundances> ab,size_t factory_id)696 void AbundanceRecorder::abundance_avail(int locus_id,
697                                         boost::shared_ptr<SampleAbundances> ab,
698                                         size_t factory_id)
699 {
700 #if ENABLE_THREADS
701 	boost::mutex::scoped_lock lock(_recorder_lock);
702 #endif
703     recorder_sample_table::iterator itr = find_locus(locus_id);
704     if (itr == _samples.end())
705     {
706         assert(false);
707     }
708     itr->second[factory_id] = ab;
709     //itr->second(factory_id] = ab;
710 }
711 
712 // Note: this routine should be called under lock - it doesn't
713 // acquire the lock itself.
all_samples_reported_in(vector<boost::shared_ptr<SampleAbundances>> & abundances)714 bool AbundanceRecorder::all_samples_reported_in(vector<boost::shared_ptr<SampleAbundances> >& abundances)
715 {
716     BOOST_FOREACH (boost::shared_ptr<SampleAbundances> ab, abundances)
717     {
718         if (!ab)
719         {
720             return false;
721         }
722     }
723     return true;
724 }
725 
726 
727 // Note: this routine should be called under lock - it doesn't
728 // acquire the lock itself.
record_tracking_data(int locus_id,vector<boost::shared_ptr<SampleAbundances>> & abundances)729 void AbundanceRecorder::record_tracking_data(int locus_id, vector<boost::shared_ptr<SampleAbundances> >& abundances)
730 {
731     assert (abundances.size() == _orig_workers);
732 
733     // Just verify that all the loci from each factory match up.
734     for (size_t i = 1; i < abundances.size(); ++i)
735     {
736         const SampleAbundances& curr = *(abundances[i]);
737         const SampleAbundances& prev = *(abundances[i-1]);
738 
739         assert (curr.locus_tag == prev.locus_tag);
740 
741         const AbundanceGroup& s1 = curr.transcripts;
742         const AbundanceGroup& s2 =  prev.transcripts;
743 
744         assert (s1.abundances().size() == s2.abundances().size());
745 
746         for (size_t j = 0; j < s1.abundances().size(); ++j)
747         {
748             assert (s1.abundances()[j]->description() == s2.abundances()[j]->description());
749         }
750     }
751 
752     vector<AbundanceGroup> lightweight_ab_groups;
753 
754     // Add all the transcripts, CDS groups, TSS groups, and genes to their
755     // respective FPKM tracking table.  Whether this is a time series or an
756     // all pairs comparison, we should be calculating and reporting FPKMs for
757     // all objects in all samples
758 	for (size_t i = 0; i < abundances.size(); ++i)
759 	{
760 		const AbundanceGroup& ab_group = abundances[i]->transcripts;
761         /*
762         //fprintf(stderr, "[%d] count = %lg\n",i,  ab_group.num_fragments());
763 		BOOST_FOREACH (boost::shared_ptr<Abundance> ab, ab_group.abundances())
764 		{
765 			add_to_tracking_table(i, *ab, _tracking->isoform_fpkm_tracking);
766             //assert (_tracking->isoform_fpkm_tracking.num_fragments_by_replicate().empty() == false);
767 		}
768 
769 		BOOST_FOREACH (AbundanceGroup& ab, abundances[i]->cds)
770 		{
771 			add_to_tracking_table(i, ab, _tracking->cds_fpkm_tracking);
772 		}
773 
774 		BOOST_FOREACH (AbundanceGroup& ab, abundances[i]->primary_transcripts)
775 		{
776 			add_to_tracking_table(i, ab, _tracking->tss_group_fpkm_tracking);
777 		}
778 
779 		BOOST_FOREACH (AbundanceGroup& ab, abundances[i]->genes)
780 		{
781 			add_to_tracking_table(i, ab, _tracking->gene_fpkm_tracking);
782 		}
783         */
784 
785         abundances[i]->transcripts.clear_non_serialized_data();
786         lightweight_ab_groups.push_back(abundances[i]->transcripts);
787 	}
788 
789     if (_ab_group_tracking_table.find(locus_id) != _ab_group_tracking_table.end())
790     {
791         fprintf (stderr, "Error: locus %d is already recorded!\n", locus_id);
792     }
793     _ab_group_tracking_table[locus_id] = lightweight_ab_groups;
794 }
795 
record_finished_loci()796 void AbundanceRecorder::record_finished_loci()
797 {
798 #if ENABLE_THREADS
799 	boost::mutex::scoped_lock lock(_recorder_lock);
800 #endif
801 
802     recorder_sample_table::iterator itr = _samples.begin();
803     while(itr != _samples.end())
804     {
805         if (all_samples_reported_in(itr->second))
806         {
807             // In some abundance runs, we don't actually want to perform testing
808             // (eg initial quantification before bias correction).
809             // _tests and _tracking will be NULL in these cases.
810             if (_tracking != NULL)
811             {
812                 if (_p_bar)
813                 {
814                     verbose_msg("Estimating expression in locus [%s]\n", itr->second.front()->locus_tag.c_str());
815                     _p_bar->update(itr->second.front()->locus_tag.c_str(), 1);
816                 }
817             }
818             record_tracking_data(itr->first, itr->second);
819 
820             // Removes the samples that have already been tested and transferred to the tracking tables,
821             itr = _samples.erase(itr);
822         }
823         else
824         {
825 
826             ++itr;
827         }
828     }
829 }
830 
831 
832 boost::shared_ptr<AbundanceRecorder> abundance_recorder;
833 
sample_worker(const RefSequenceTable & rt,ReplicatedBundleFactory & sample_factory,boost::shared_ptr<SampleAbundances> abundance,size_t factory_id,boost::shared_ptr<AbundanceRecorder> recorder,bool calculate_variance)834 void sample_worker(const RefSequenceTable& rt,
835                    ReplicatedBundleFactory& sample_factory,
836                    boost::shared_ptr<SampleAbundances> abundance,
837                    size_t factory_id,
838                    boost::shared_ptr<AbundanceRecorder> recorder,
839                    bool calculate_variance)
840 {
841 #if ENABLE_THREADS
842 	boost::this_thread::at_thread_exit(decr_pool_count);
843 #endif
844 
845     boost::shared_ptr<HitBundle> bundle(new HitBundle);
846     bool non_empty = sample_factory.next_bundle(*bundle, true);
847 
848     char bundle_label_buf[2048];
849     sprintf(bundle_label_buf,
850             "%s:%d-%d",
851             rt.get_name(bundle->ref_id()),
852             bundle->left(),
853             bundle->right());
854     string locus_tag = bundle_label_buf;
855 
856     abundance->cluster_mass = bundle->mass();
857 
858     recorder->register_locus(bundle->id());
859 
860     abundance->locus_tag = locus_tag;
861 
862 //    if (rt.get_name(bundle->ref_id()) == "chr13_random")
863 //    {
864 //        int a = 5;
865 //    }
866     if (!non_empty || (bias_run && bundle->ref_scaffolds().size() != 1)) // Only learn on single isoforms
867     {
868 //#if !ENABLE_THREADS
869         // If Cuffdiff was built without threads, we need to manually invoke
870         // the testing functor, which will check to see if all the workers
871         // are done, and if so, perform the cross sample testing.
872         recorder->abundance_avail(bundle->id(), abundance, factory_id);
873         recorder->record_finished_loci();
874         //launcher();
875 //#endif
876     	return;
877     }
878 
879     bool perform_cds_analysis = false;
880     bool perform_tss_analysis = false;
881 
882     BOOST_FOREACH(boost::shared_ptr<Scaffold> s, bundle->ref_scaffolds())
883     {
884         if (s->annotated_tss_id() != "")
885         {
886             perform_tss_analysis = final_est_run;
887         }
888         if (s->annotated_protein_id() != "")
889         {
890             perform_cds_analysis = final_est_run;
891         }
892     }
893 
894     set<boost::shared_ptr<ReadGroupProperties const> > rg_props;
895     for (size_t i = 0; i < sample_factory.factories().size(); ++i)
896     {
897         boost::shared_ptr<BundleFactory> bf = sample_factory.factories()[i];
898         rg_props.insert(bf->read_group_properties());
899     }
900 
901     sample_abundance_worker(boost::cref(locus_tag),
902                             boost::cref(rg_props),
903                             boost::ref(*abundance),
904                             bundle,
905                             perform_cds_analysis,
906                             perform_tss_analysis,
907                             calculate_variance);
908 
909     ///////////////////////////////////////////////
910 
911 
912     BOOST_FOREACH(boost::shared_ptr<Scaffold> ref_scaff,  bundle->ref_scaffolds())
913     {
914         ref_scaff->clear_hits();
915     }
916 
917     recorder->abundance_avail(bundle->id(), abundance, factory_id);
918     recorder->record_finished_loci();
919 }
920 
quantitate_next_locus(const RefSequenceTable & rt,vector<boost::shared_ptr<ReplicatedBundleFactory>> & bundle_factories,boost::shared_ptr<AbundanceRecorder> recorder)921 bool quantitate_next_locus(const RefSequenceTable& rt,
922                            vector<boost::shared_ptr<ReplicatedBundleFactory> >& bundle_factories,
923                            boost::shared_ptr<AbundanceRecorder> recorder)
924 {
925     for (size_t i = 0; i < bundle_factories.size(); ++i)
926     {
927         boost::shared_ptr<SampleAbundances> s_ab = boost::shared_ptr<SampleAbundances>(new SampleAbundances);
928 
929 #if ENABLE_THREADS
930         while(1)
931         {
932             locus_thread_pool_lock.lock();
933             if (locus_curr_threads < locus_num_threads)
934             {
935                 break;
936             }
937 
938             locus_thread_pool_lock.unlock();
939 
940             boost::this_thread::sleep(boost::posix_time::milliseconds(5));
941 
942         }
943 
944         locus_curr_threads++;
945         locus_thread_pool_lock.unlock();
946 
947         boost::thread quantitate(sample_worker,
948                           boost::ref(rt),
949                           boost::ref(*(bundle_factories[i])),
950                           s_ab,
951                           i,
952                           recorder,
953                           false);
954 #else
955         sample_worker(boost::ref(rt),
956                       boost::ref(*(bundle_factories[i])),
957                       s_ab,
958                       i,
959                       recorder,
960                       false);
961 #endif
962     }
963     return true;
964 }
965 
parse_norm_standards_file(FILE * norm_standards_file)966 void parse_norm_standards_file(FILE* norm_standards_file)
967 {
968     char pBuf[10 * 1024];
969     size_t non_blank_lines_read = 0;
970 
971     boost::shared_ptr<map<string, LibNormStandards> > norm_standards(new map<string, LibNormStandards>);
972 
973     while (fgets(pBuf, 10*1024, norm_standards_file))
974     {
975         if (strlen(pBuf) > 0)
976         {
977             char* nl = strchr(pBuf, '\n');
978             if (nl)
979                 *nl = 0;
980 
981             string pBufstr = pBuf;
982             string trimmed = boost::trim_copy(pBufstr);
983 
984             if (trimmed.length() > 0 && trimmed[0] != '#')
985             {
986                 non_blank_lines_read++;
987                 vector<string> columns;
988                 tokenize(trimmed, "\t", columns);
989 
990                 if (non_blank_lines_read == 1)
991                     continue;
992 
993                 if (columns.size() < 1) //
994                 {
995                     continue;
996                 }
997 
998                 string gene_id = columns[0];
999                 LibNormStandards L;
1000                 norm_standards->insert(make_pair(gene_id, L));
1001             }
1002         }
1003     }
1004     lib_norm_standards = norm_standards;
1005 }
1006 
1007 boost::shared_ptr<AbundanceRecorder> abx_recorder;
1008 
driver(const std::string & ref_gtf_filename,const std::string & mask_gtf_filename,FILE * norm_standards_file,vector<string> & sam_hit_filename_lists,Outfiles & outfiles)1009 void driver(const std::string& ref_gtf_filename, const std::string& mask_gtf_filename, FILE* norm_standards_file, vector<string>& sam_hit_filename_lists, Outfiles& outfiles)
1010 {
1011 
1012     FILE* ref_gtf = NULL;
1013 	if (ref_gtf_filename != "")
1014 	{
1015 		ref_gtf = fopen(ref_gtf_filename.c_str(), "r");
1016 		if (!ref_gtf) // we actually already did this check, leave this code here in case we remove the upstream one
1017 		{
1018 			fprintf(stderr, "Error: cannot open reference GTF file %s for reading\n",
1019 					ref_gtf_filename.c_str());
1020 			exit(1);
1021 		}
1022 	}
1023 
1024 	FILE* mask_gtf = NULL;
1025 	if (mask_gtf_filename != "")
1026 	{
1027 		mask_gtf = fopen(mask_gtf_filename.c_str(), "r");
1028 		if (!mask_gtf) // we actually already did this check, leave this code here in case we remove the upstream one
1029 		{
1030 			fprintf(stderr, "Error: cannot open mask GTF file %s for reading\n",
1031 					mask_gtf_filename.c_str());
1032 			exit(1);
1033 		}
1034 	}
1035 
1036 	ReadTable it;
1037 	RefSequenceTable rt(true, false);
1038 
1039 	vector<boost::shared_ptr<Scaffold> > ref_mRNAs;
1040 
1041 	vector<boost::shared_ptr<ReplicatedBundleFactory> > bundle_factories;
1042     vector<boost::shared_ptr<ReadGroupProperties> > all_read_groups;
1043     vector<boost::shared_ptr<HitFactory> > all_hit_factories;
1044 
1045 	for (size_t i = 0; i < sam_hit_filename_lists.size(); ++i)
1046 	{
1047         vector<string> sam_hit_filenames;
1048         tokenize(sam_hit_filename_lists[i], ",", sam_hit_filenames);
1049 
1050         vector<boost::shared_ptr<BundleFactory> > replicate_factories;
1051 
1052         string condition_name = sample_labels[i];
1053 
1054         for (size_t j = 0; j < sam_hit_filenames.size(); ++j)
1055         {
1056             boost::shared_ptr<HitFactory> hs(createSamHitFactory(sam_hit_filenames[j], it, rt));
1057 
1058             all_hit_factories.push_back(hs);
1059 
1060             boost::shared_ptr<BundleFactory> hf(new BundleFactory(hs, REF_DRIVEN));
1061             boost::shared_ptr<ReadGroupProperties> rg_props(new ReadGroupProperties);
1062 
1063             if (global_read_properties)
1064             {
1065                 *rg_props = *global_read_properties;
1066             }
1067             else
1068             {
1069                 *rg_props = hs->read_group_properties();
1070             }
1071 
1072             rg_props->condition_name(condition_name);
1073             rg_props->replicate_num(j);
1074             rg_props->file_path(sam_hit_filenames[j]);
1075 
1076             all_read_groups.push_back(rg_props);
1077 
1078             hf->read_group_properties(rg_props);
1079 
1080             replicate_factories.push_back(hf);
1081             //replicate_factories.back()->set_ref_rnas(ref_mRNAs);
1082         }
1083 
1084         bundle_factories.push_back(boost::shared_ptr<ReplicatedBundleFactory>(new ReplicatedBundleFactory(replicate_factories, condition_name)));
1085 	}
1086 
1087     boost::crc_32_type ref_gtf_crc_result;
1088     ::load_ref_rnas(ref_gtf, rt, ref_mRNAs, ref_gtf_crc_result, corr_bias, false);
1089 
1090 //    for (size_t i = 0; i < ref_mRNAs.size(); ++i)
1091 //    {
1092 //        boost::shared_ptr<Scaffold> s = ref_mRNAs[i];
1093 //        if (s->annotated_gene_id() == "ENSG00000268467.1")
1094 //        {
1095 //            int a = 4;
1096 //        }
1097 //    }
1098 //
1099     if (ref_mRNAs.empty())
1100         return;
1101 
1102     boost::crc_32_type mask_gtf_crc_result;
1103     vector<boost::shared_ptr<Scaffold> > mask_rnas;
1104     if (mask_gtf)
1105     {
1106         ::load_ref_rnas(mask_gtf, rt, mask_rnas, mask_gtf_crc_result, false, false);
1107     }
1108 
1109     BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
1110     {
1111         fac->set_ref_rnas(ref_mRNAs);
1112         if (mask_gtf)
1113             fac->set_mask_rnas(mask_rnas);
1114     }
1115 
1116     if (norm_standards_file != NULL)
1117     {
1118         parse_norm_standards_file(norm_standards_file);
1119     }
1120 
1121     for (size_t i = 0; i < all_read_groups.size(); ++i)
1122     {
1123         all_read_groups[i]->collect_checked_parameters();
1124         all_read_groups[i]->ref_gtf(ref_gtf_filename, ref_gtf_crc_result);
1125         all_read_groups[i]->mask_gtf(mask_gtf_filename, mask_gtf_crc_result);
1126     }
1127 
1128 #if ENABLE_THREADS
1129     locus_num_threads = num_threads;
1130 #endif
1131 
1132     dispersion_method = POISSON;
1133 
1134 	int tmp_min_frag_len = numeric_limits<int>::max();
1135 	int tmp_max_frag_len = 0;
1136 
1137     IdToLocusMap id_to_locus_map(boost::shared_ptr<map<string, set<string> > >(new map<string, set<string> >()));
1138 
1139 	ProgressBar p_bar("Inspecting maps and determining fragment length distributions.",0);
1140 	BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> fac, bundle_factories)
1141     {
1142 
1143 #if ENABLE_THREADS
1144         while(1)
1145         {
1146             locus_thread_pool_lock.lock();
1147             if (locus_curr_threads < locus_num_threads)
1148             {
1149                 break;
1150             }
1151 
1152             locus_thread_pool_lock.unlock();
1153 
1154             boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1155         }
1156 
1157         locus_curr_threads++;
1158         locus_thread_pool_lock.unlock();
1159 
1160         boost::thread inspect(inspect_map_worker,
1161                        boost::ref(*fac),
1162                        boost::ref(tmp_min_frag_len),
1163                        boost::ref(tmp_max_frag_len),
1164                        boost::ref(id_to_locus_map));
1165 #else
1166         inspect_map_worker(boost::ref(*fac),
1167                            boost::ref(tmp_min_frag_len),
1168                            boost::ref(tmp_max_frag_len),
1169                            boost::ref(id_to_locus_map));
1170 #endif
1171     }
1172 
1173     // wait for the workers to finish up before reporting everthing.
1174 #if ENABLE_THREADS
1175     while(1)
1176     {
1177         locus_thread_pool_lock.lock();
1178         if (locus_curr_threads == 0)
1179         {
1180             locus_thread_pool_lock.unlock();
1181             break;
1182         }
1183         locus_thread_pool_lock.unlock();
1184 
1185         boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1186     }
1187 #endif
1188 
1189     normalize_counts(all_read_groups);
1190 
1191     for (size_t i = 0; i < all_read_groups.size(); ++i)
1192     {
1193         boost::shared_ptr<ReadGroupProperties> rg = all_read_groups[i];
1194         fprintf(stderr, "> Map Properties:\n");
1195 
1196         fprintf(stderr, ">\tNormalized Map Mass: %.2Lf\n", rg->normalized_map_mass());
1197         fprintf(stderr, ">\tRaw Map Mass: %.2Lf\n", rg->total_map_mass());
1198         if (corr_multi)
1199             fprintf(stderr,">\tNumber of Multi-Reads: %zu (with %zu total hits)\n", rg->multi_read_table()->num_multireads(), rg->multi_read_table()->num_multihits());
1200 
1201         if (rg->frag_len_dist()->source() == LEARNED)
1202         {
1203             fprintf(stderr, ">\tFragment Length Distribution: Empirical (learned)\n");
1204             fprintf(stderr, ">\t              Estimated Mean: %.2f\n", rg->frag_len_dist()->mean());
1205             fprintf(stderr, ">\t           Estimated Std Dev: %.2f\n", rg->frag_len_dist()->std_dev());
1206         }
1207         else
1208         {
1209             if (rg->frag_len_dist()->source() == USER)
1210                 fprintf(stderr, ">\tFragment Length Distribution: Truncated Gaussian (user-specified)\n");
1211             else //rg->frag_len_dist()->source == FLD::DEFAULT
1212                 fprintf(stderr, ">\tFragment Length Distribution: Truncated Gaussian (default)\n");
1213             fprintf(stderr, ">\t              Default Mean: %d\n", def_frag_len_mean);
1214             fprintf(stderr, ">\t           Default Std Dev: %d\n", def_frag_len_std_dev);
1215         }
1216     }
1217 
1218     long double total_norm_mass = 0.0;
1219     long double total_mass = 0.0;
1220     BOOST_FOREACH (boost::shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
1221     {
1222         total_norm_mass += rg_props->normalized_map_mass();
1223         total_mass += rg_props->total_map_mass();
1224     }
1225 
1226 	min_frag_len = tmp_min_frag_len;
1227     max_frag_len = tmp_max_frag_len;
1228 
1229 	final_est_run = false;
1230 
1231 	double num_bundles = (double)bundle_factories[0]->num_bundles();
1232 
1233     p_bar = ProgressBar("Calculating preliminary abundance estimates", num_bundles);
1234 
1235     Tracking tracking;
1236 
1237     abundance_recorder = boost::shared_ptr<AbundanceRecorder>(new AbundanceRecorder(bundle_factories.size(), &tracking, &p_bar));
1238 
1239 	if (model_mle_error || corr_bias || corr_multi) // Only run initial estimation if correcting bias or multi-reads
1240 	{
1241 		while (1)
1242 		{
1243 			boost::shared_ptr<vector<boost::shared_ptr<SampleAbundances> > > abundances(new vector<boost::shared_ptr<SampleAbundances> >());
1244 			quantitate_next_locus(rt, bundle_factories, abundance_recorder);
1245 			bool more_loci_remain = false;
1246             BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1247             {
1248                 if (rep_fac->bundles_remain())
1249                 {
1250                     more_loci_remain = true;
1251                     break;
1252                 }
1253             }
1254 
1255 			if (!more_loci_remain)
1256             {
1257                 // wait for the workers to finish up before breaking out.
1258 #if ENABLE_THREADS
1259                 while(1)
1260                 {
1261                     locus_thread_pool_lock.lock();
1262                     if (locus_curr_threads == 0)
1263                     {
1264                         locus_thread_pool_lock.unlock();
1265                         break;
1266                     }
1267 
1268                     locus_thread_pool_lock.unlock();
1269 
1270                     boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1271 
1272                 }
1273 #endif
1274 				break;
1275             }
1276 		}
1277 
1278         BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1279 		{
1280 			rep_fac->reset();
1281         }
1282 
1283 		p_bar.complete();
1284 	}
1285     if (corr_bias)
1286     {
1287         bias_run = true;
1288         p_bar = ProgressBar("Learning bias parameters.", 0);
1289 		BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1290 		{
1291 			BOOST_FOREACH (boost::shared_ptr<BundleFactory> fac, rep_fac->factories())
1292 			{
1293 #if ENABLE_THREADS
1294 				while(1)
1295 				{
1296 					locus_thread_pool_lock.lock();
1297 					if (locus_curr_threads < locus_num_threads)
1298 					{
1299 						break;
1300 					}
1301 
1302 					locus_thread_pool_lock.unlock();
1303 
1304 					boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1305 				}
1306 				locus_curr_threads++;
1307 				locus_thread_pool_lock.unlock();
1308 
1309 				boost::thread bias(learn_bias_worker, fac);
1310 #else
1311 				learn_bias_worker(fac);
1312 #endif
1313 			}
1314     	}
1315 
1316     // wait for the workers to finish up before reporting everthing.
1317 #if ENABLE_THREADS
1318 		while(1)
1319 		{
1320 			locus_thread_pool_lock.lock();
1321 			if (locus_curr_threads == 0)
1322 			{
1323 				locus_thread_pool_lock.unlock();
1324 				break;
1325 			}
1326 
1327 			locus_thread_pool_lock.unlock();
1328 
1329 			boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1330 		}
1331 #endif
1332         BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1333 		{
1334 			rep_fac->reset();
1335         }
1336         bias_run = false;
1337 	}
1338 
1339     // Allow the multiread tables to do their thing...
1340     BOOST_FOREACH (boost::shared_ptr<ReadGroupProperties> rg_props, all_read_groups)
1341     {
1342         rg_props->multi_read_table()->valid_mass(true);
1343     }
1344 
1345 
1346     abundance_recorder->clear_tracking_data();
1347 
1348     abundance_recorder = boost::shared_ptr<AbundanceRecorder>(new AbundanceRecorder(bundle_factories.size(), &tracking, &p_bar));
1349 
1350 	final_est_run = true;
1351 	p_bar = ProgressBar("Quantifying expression levels in locus.", num_bundles);
1352 
1353 	while (true)
1354 	{
1355         //boost::shared_ptr<vector<boost::shared_ptr<SampleAbundances> > > abundances(new vector<boost::shared_ptr<SampleAbundances> >());
1356         quantitate_next_locus(rt, bundle_factories, abundance_recorder);
1357         bool more_loci_remain = false;
1358         BOOST_FOREACH (boost::shared_ptr<ReplicatedBundleFactory> rep_fac, bundle_factories)
1359         {
1360             if (rep_fac->bundles_remain())
1361             {
1362                 more_loci_remain = true;
1363                 break;
1364             }
1365         }
1366         if (!more_loci_remain)
1367         {
1368             // wait for the workers to finish up before doing the cross-sample testing.
1369 #if ENABLE_THREADS
1370             while(1)
1371             {
1372                 locus_thread_pool_lock.lock();
1373                 if (locus_curr_threads == 0)
1374                 {
1375                     locus_thread_pool_lock.unlock();
1376                     break;
1377                 }
1378 
1379                 locus_thread_pool_lock.unlock();
1380 
1381                 boost::this_thread::sleep(boost::posix_time::milliseconds(5));
1382 
1383             }
1384 #endif
1385             break;
1386         }
1387     }
1388 
1389 	p_bar.complete();
1390 
1391 
1392     string expression_cxb_filename = output_dir + "/abundances.cxb";
1393     std::ofstream ofs(expression_cxb_filename.c_str());
1394     boost::archive::binary_oarchive oa(ofs);
1395 
1396     vector< pair<int, AbundanceGroup> > single_sample_tracking;
1397 
1398     const light_ab_group_tracking_table& sample_table = abundance_recorder->get_sample_table();
1399     for (light_ab_group_tracking_table::const_iterator itr = sample_table.begin(); itr != sample_table.end(); ++itr)
1400     {
1401 
1402         assert (itr->second.size() == 1);
1403         single_sample_tracking.push_back(make_pair(itr->first, itr->second[0]));
1404     }
1405 
1406     std::sort(single_sample_tracking.begin(), single_sample_tracking.end(),
1407               boost::bind(&std::pair<int, AbundanceGroup>::first, _1) <
1408               boost::bind(&std::pair<int, AbundanceGroup>::first, _2));
1409 
1410     size_t num_loci = single_sample_tracking.size();
1411     oa << num_loci;
1412 
1413 
1414 
1415     for (int i = 0; i < single_sample_tracking.size(); ++i)
1416     {
1417         oa << single_sample_tracking[i];
1418     }
1419 
1420 //    // FPKM tracking
1421 //
1422 //	FILE* fiso_fpkm_tracking =  outfiles.isoform_fpkm_tracking_out;
1423 //	fprintf(stderr, "Writing isoform-level FPKM tracking\n");
1424 //	print_FPKM_tracking(fiso_fpkm_tracking,tracking.isoform_fpkm_tracking);
1425 //
1426 //	FILE* ftss_fpkm_tracking =  outfiles.tss_group_fpkm_tracking_out;
1427 //	fprintf(stderr, "Writing TSS group-level FPKM tracking\n");
1428 //	print_FPKM_tracking(ftss_fpkm_tracking,tracking.tss_group_fpkm_tracking);
1429 //
1430 //	FILE* fgene_fpkm_tracking =  outfiles.gene_fpkm_tracking_out;
1431 //	fprintf(stderr, "Writing gene-level FPKM tracking\n");
1432 //	print_FPKM_tracking(fgene_fpkm_tracking,tracking.gene_fpkm_tracking);
1433 //
1434 //	FILE* fcds_fpkm_tracking =  outfiles.cds_fpkm_tracking_out;
1435 //	fprintf(stderr, "Writing CDS-level FPKM tracking\n");
1436 //	print_FPKM_tracking(fcds_fpkm_tracking,tracking.cds_fpkm_tracking);
1437 //
1438 //    // Count tracking
1439 //
1440 //    FILE* fiso_count_tracking =  outfiles.isoform_count_tracking_out;
1441 //	fprintf(stderr, "Writing isoform-level count tracking\n");
1442 //	print_count_tracking(fiso_count_tracking,tracking.isoform_fpkm_tracking);
1443 //
1444 //	FILE* ftss_count_tracking =  outfiles.tss_group_count_tracking_out;
1445 //	fprintf(stderr, "Writing TSS group-level count tracking\n");
1446 //	print_count_tracking(ftss_count_tracking,tracking.tss_group_fpkm_tracking);
1447 //
1448 //	FILE* fgene_count_tracking =  outfiles.gene_count_tracking_out;
1449 //	fprintf(stderr, "Writing gene-level count tracking\n");
1450 //	print_count_tracking(fgene_count_tracking,tracking.gene_fpkm_tracking);
1451 //
1452 //	FILE* fcds_count_tracking =  outfiles.cds_count_tracking_out;
1453 //	fprintf(stderr, "Writing CDS-level count tracking\n");
1454 //	print_count_tracking(fcds_count_tracking,tracking.cds_fpkm_tracking);
1455 //
1456 //    // Run info
1457 //    FILE* frun_info =  outfiles.run_info_out;
1458 //	fprintf(stderr, "Writing run info\n");
1459 //	print_run_info(frun_info);
1460 }
1461 
main(int argc,char ** argv)1462 int main(int argc, char** argv)
1463 {
1464 //    boost::serialization::void_cast_register<TranscriptAbundance, Abundance>(
1465 //                                                                             static_cast<TranscriptAbundance *>(NULL),
1466 //                                                                             static_cast<Abundance *>(NULL)
1467 //                                                                             );
1468 
1469     for (int i = 0; i < argc; ++i)
1470     {
1471         cmd_str += string(argv[i]) + " ";
1472     }
1473 
1474     init_library_table();
1475     init_dispersion_method_table();
1476     init_lib_norm_method_table();
1477 
1478     min_isoform_fraction = 0;
1479 
1480 	int parse_ret = parse_options(argc,argv);
1481     if (parse_ret)
1482         return parse_ret;
1483 
1484     if (!use_total_mass && !use_compat_mass)
1485     {
1486         use_total_mass = false;
1487         use_compat_mass = true;
1488     }
1489 
1490 	if(optind >= argc)
1491     {
1492         print_usage();
1493         return 1;
1494     }
1495 
1496     if (!no_update_check)
1497         check_version(PACKAGE_VERSION);
1498 
1499     string ref_gtf_filename = argv[optind++];
1500     vector<string> sam_hit_filenames;
1501 
1502     if(optind < argc)
1503     {
1504         string sam_hits_file_name = argv[optind++];
1505         sam_hit_filenames.push_back(sam_hits_file_name);
1506     }
1507 
1508     if (sample_labels.size() == 0)
1509     {
1510         for (size_t i = 1; i < sam_hit_filenames.size() + 1; ++i)
1511         {
1512             char buf[256];
1513             sprintf(buf, "q%lu", i);
1514             sample_labels.push_back(buf);
1515         }
1516     }
1517 
1518 	while (sam_hit_filenames.size() < 1)
1519     {
1520         fprintf(stderr, "Error: cuffquant requires exactly 1 SAM/BAM file\n");
1521         exit(1);
1522     }
1523 
1524 
1525     if (sam_hit_filenames.size() != sample_labels.size())
1526     {
1527         fprintf(stderr, "Error: number of labels must match number of conditions\n");
1528         exit(1);
1529     }
1530 
1531     if (random_seed == -1)
1532         random_seed = boost::mt19937::default_seed;
1533 
1534 	// seed the random number generator - we'll need it for the importance
1535 	// sampling during MAP estimation of the gammas
1536 	srand48(random_seed);
1537 
1538 	FILE* ref_gtf = NULL;
1539 	if (ref_gtf_filename != "")
1540 	{
1541 		ref_gtf = fopen(ref_gtf_filename.c_str(), "r");
1542 		if (!ref_gtf)
1543 		{
1544 			fprintf(stderr, "Error: cannot open reference GTF file %s for reading\n",
1545 					ref_gtf_filename.c_str());
1546 			exit(1);
1547 		}
1548 	}
1549 
1550 	FILE* mask_gtf = NULL;
1551 	if (mask_gtf_filename != "")
1552 	{
1553 		mask_gtf = fopen(mask_gtf_filename.c_str(), "r");
1554 		if (!mask_gtf)
1555 		{
1556 			fprintf(stderr, "Error: cannot open mask GTF file %s for reading\n",
1557 					mask_gtf_filename.c_str());
1558 			exit(1);
1559 		}
1560 	}
1561 
1562     FILE* norm_standards_file = NULL;
1563 	if (norm_standards_filename != "")
1564 	{
1565 		norm_standards_file = fopen(norm_standards_filename.c_str(), "r");
1566 		if (!norm_standards_file)
1567 		{
1568 			fprintf(stderr, "Error: cannot open contrast file %s for reading\n",
1569 					norm_standards_filename.c_str());
1570 			exit(1);
1571 		}
1572 	}
1573 
1574 
1575 	// Note: we don't want the assembly filters interfering with calculations
1576 	// here
1577 
1578 	pre_mrna_fraction = 0.0;
1579     olap_radius = 0;
1580 
1581 	Outfiles outfiles;
1582 
1583     if (output_dir != "")
1584     {
1585         int retcode = mkpath(output_dir.c_str(), 0777);
1586         if (retcode == -1)
1587         {
1588             if (errno != EEXIST)
1589             {
1590                 fprintf (stderr,
1591                          "Error: cannot create directory %s\n",
1592                          output_dir.c_str());
1593                 exit(1);
1594             }
1595         }
1596     }
1597 
1598     static const int filename_buf_size = 2048;
1599 
1600     char out_file_prefix[filename_buf_size];
1601     sprintf(out_file_prefix, "%s/", output_dir.c_str());
1602 
1603     driver(ref_gtf_filename, mask_gtf_filename, norm_standards_file, sam_hit_filenames, outfiles);
1604 
1605 	return 0;
1606 }
1607