1 /*
2  *  bundles.cpp
3  *  cufflinks
4  *
5  *  Created by Cole Trapnell on 9/6/09.
6  *  Copyright 2009 Cole Trapnell. All rights reserved.
7  *
8  */
9 
10 #include <list>
11 #include <map>
12 #include <numeric>
13 #include <boost/math/distributions/binomial.hpp>
14 #include <boost/crc.hpp>
15 
16 #include "common.h"
17 #include "bundles.h"
18 #include "scaffolds.h"
19 
20 #include "abundances.h"
21 
22 using namespace std;
23 using boost::math::binomial;
24 
25 //struct ScaffoldSorter
26 //{
27 //	ScaffoldSorter(RefSequenceTable& _rt) : rt(_rt) {}
28 //	bool operator()(boost::shared_ptr<Scaffold const> lhs, boost::shared_ptr<Scaffold const> rhs)
29 //	{
30 //        assert (lhs);
31 //        assert (rhs);
32 //		const char* lhs_name = rt.get_name(lhs->ref_id());
33 //		const char* rhs_name = rt.get_name(rhs->ref_id());
34 //		int c = strcmp(lhs_name, rhs_name);
35 //		if (c != 0)
36 //		{
37 //			return c < 0;
38 //		}
39 //		if (lhs->left() != rhs->left())
40 //		{
41 //			return lhs->left() < rhs->left();
42 //		}
43 //        return false;
44 //	}
45 //
46 //	RefSequenceTable& rt;
47 //};
48 
49 struct ScaffoldSorter
50 {
51 	ScaffoldSorter(RefSequenceTable& _rt) : rt(_rt) {}
52 	bool operator()(boost::shared_ptr<Scaffold const> lhs, boost::shared_ptr<Scaffold const> rhs)
53 	{
54         //assert (lhs);
55         //assert (rhs);
56         if (!lhs || !rhs)
57             return false;
58 		int lhs_order = rt.observation_order(lhs->ref_id());
59         assert (lhs_order != -1);
60 		int rhs_order = rt.observation_order(rhs->ref_id());
61         assert (rhs_order != -1);
62 
63 		if (lhs_order != rhs_order)
64 		{
65 			return lhs_order < rhs_order;
66 		}
67 		if (lhs->left() != rhs->left())
68 		{
69 			return lhs->left() < rhs->left();
70 		}
71         return false;
72 	}
73 
74 	RefSequenceTable& rt;
75 };
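// Usage sketch (illustrative; load_ref_rnas() below does exactly this): the
// comparator is a strict weak ordering, so it can be handed straight to
// std::sort.  Returning false when both the chromosome order and the left
// coordinate are equal is what keeps the ordering strict.
//
//   ScaffoldSorter sorter(rt);
//   std::sort(ref_mRNAs.begin(), ref_mRNAs.end(), sorter);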
76 
77 //FIXME: needs refactoring
78 void load_ref_rnas(FILE* ref_mRNA_file,
79 				   RefSequenceTable& rt,
80 				   vector<boost::shared_ptr<Scaffold> >& ref_mRNAs,
81                    boost::crc_32_type& gtf_crc_result,
82 				   bool loadSeqs,
83 				   bool loadFPKM)
84 {
85 	if (loadSeqs)
86 		ProgressBar p_bar("Loading reference annotation and sequence.",0);
87     else
88         ProgressBar p_bar("Loading reference annotation.",0);
89 
90 	GList<GSeqData> ref_rnas;
91 
92 	// If the RefSequenceTable already has entries, we will sort the GTF records
93 	// according to their observation order.  Otherwise, we will sort the
94 	// RefSequenceTable's records lexicographically.
95 	bool reorder_GTF_recs_lexicographically = false;
96 	if (rt.size() == 0)
97 	{
98 	  reorder_GTF_recs_lexicographically = true;
99 	}
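	// (Example: lexicographically, "chr10" sorts before "chr2"; observation
	// order instead preserves the order in which the table saw each sequence.)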
100 
101 	if (ref_mRNA_file)
102 	{
103 		gtf_tracking_verbose=cuff_verbose;
104 		read_transcripts(ref_mRNA_file, ref_rnas, gtf_crc_result, true);
105 	}
106 
107 	int last_gseq_id = -1;
108 	GFaSeqGet* faseq = NULL;
109 	GFastaHandler gfasta(fasta_dir.c_str());
110 	// Geo groups them by chr.
111 	if (ref_rnas.Count()>0) //if any ref data was loaded
112 	{
113 		for (int j = 0; j < ref_rnas.Count(); ++j)
114 		{    //ref data is grouped by genomic sequence
115 			//const char* name = ref_rnas[j]->gseq_name;
116 
117 			int f = 0;
118 			int r = 0;
119 			int u  = 0;
120 			GffObj* rna_p;
121 			RefID ref_id = rt.get_id(ref_rnas[j]->gseq_name, NULL);
122 			int f_count = ref_rnas[j]->mrnas_f.Count();
123 			int r_count = ref_rnas[j]->mrnas_r.Count();
124 			int u_count = ref_rnas[j]->umrnas.Count();
125 
126 			while(!(f==f_count && r==r_count && u==u_count))
127 			{
128 				CuffStrand strand;
129 
130 				if (f < f_count)
131 				{
132 					rna_p = ref_rnas[j]->mrnas_f[f++];
133 					strand = CUFF_FWD;
134 				}
135 				else if (r < r_count)
136 				{
137 					rna_p = ref_rnas[j]->mrnas_r[r++];
138 					strand = CUFF_REV;
139 				}
140 				else
141 				{
142 					rna_p = ref_rnas[j]->umrnas[u++];
143 					strand = CUFF_STRAND_UNKNOWN;
144 				}
145 
146 				GffObj& rna = *rna_p;
147 
148 				if (loadSeqs && rna.gseq_id != last_gseq_id) //next chromosome
149 				{
150 					delete faseq;
151 					faseq = NULL;
152 					last_gseq_id = rna.gseq_id;
153 					faseq = gfasta.fetch(last_gseq_id);
154 					if (faseq==NULL)
155 					{
156 						fprintf(stderr,"This contig will not be bias corrected.\n");
157 					}
158 				}
159 
160 				vector<AugmentedCuffOp> ops;
161 				for (int e = 0; e < rna.exons.Count(); ++e)
162 				{
163 					GffExon& ex = *(rna.exons[e]);
164 					ops.push_back(AugmentedCuffOp(CUFF_MATCH, ex.start - 1, ex.end - ex.start + 1));
165 
166 					if (e + 1 < rna.exons.Count())
167 					{
168 						GffExon& next_ex = *(rna.exons[e+1]);
169 						ops.push_back(AugmentedCuffOp(CUFF_INTRON, ex.end, next_ex.start - ex.end - 1));
170 					}
171 				}
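				// Worked example of the conversion above (GTF is 1-based and
				// end-inclusive; AugmentedCuffOp is 0-based): exons [101,200]
				// and [301,400] become
				//   CUFF_MATCH  at 100, length 100
				//   CUFF_INTRON at 200, length 100
				//   CUFF_MATCH  at 300, length 100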
172 
173 				Scaffold ref_scaff(ref_id, strand, ops, true);
174 
175 				char* rna_seq = 0;
176 				int seqlen=0;
177 				if (loadSeqs && faseq){
178 					rna_seq = rna.getSpliced(faseq, false, &seqlen);
179 				}
180 
181 				if (rna.getID())
182 					ref_scaff.annotated_trans_id(rna.getID());
183 
184 
185 				if (rna.getGeneID())
186 					ref_scaff.annotated_gene_id(rna.getGeneID());
187 
188 				if (rna.getGeneName())
189 					ref_scaff.annotated_gene_name(rna.getGeneName());
190 
191 
192 				char* nearest_ref_match = rna.getAttr("nearest_ref");
193 				char* class_code = rna.getAttr("class_code");
194 
195 				if (nearest_ref_match && class_code)
196 				{
197 					ref_scaff.nearest_ref_id(nearest_ref_match);
198 					ref_scaff.nearest_ref_classcode(*class_code);
199 				}
200 
201 				char* protein_id = rna.getAttr("p_id");
202 				if (protein_id)
203 					ref_scaff.annotated_protein_id(protein_id);
204 
205 
206 				char* tss_id = rna.getAttr("tss_id");
207 				if (tss_id)
208 					ref_scaff.annotated_tss_id(tss_id);
209 
210 
211 				if (loadFPKM)
212 				{
213 					const char* expr = rna.getAttr("FPKM");
214 					if (expr!=NULL) {
215 						if (expr[0]=='"') expr++;
216 						ref_scaff.fpkm(strtod(expr, NULL));
217 					}
218 				}
219 
220 				if (loadSeqs)
221                 {
222 					string rs = (rna_seq) ? rna_seq:"";
223 					std::transform(rs.begin(), rs.end(), rs.begin(), (int (*)(int))std::toupper);
224 					ref_scaff.seq(rs);
225 					GFREE(rna_seq);
226 				}
227 
228 				boost::shared_ptr<Scaffold> scaff(new Scaffold());
229                 *scaff = ref_scaff;
230                 assert (scaff);
231 				ref_mRNAs.push_back(scaff);
232 			}
233 		}
234 
235         BOOST_FOREACH (boost::shared_ptr<Scaffold> s, ref_mRNAs)
236         {
237             assert (s);
238         }
239 
240         if (reorder_GTF_recs_lexicographically)
241         {
242             rt.order_recs_lexicographically();
243         }
244 
245 		ScaffoldSorter sorter(rt);
246 		sort(ref_mRNAs.begin(), ref_mRNAs.end(), sorter);
247 
248 	}
249 	delete faseq;
250 }
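// Sketch of a call site (illustrative only; the real call sites live in the
// driver code, and "rt" here is assumed to be an existing RefSequenceTable):
//
//   vector<boost::shared_ptr<Scaffold> > ref_mRNAs;
//   boost::crc_32_type gtf_crc;
//   FILE* gtf = fopen("transcripts.gtf", "r");
//   if (gtf != NULL)
//   {
//       load_ref_rnas(gtf, rt, ref_mRNAs, gtf_crc, false, false);
//       fclose(gtf);
//   }
//   // ref_mRNAs is now sorted by chromosome order, then by left coordinate.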
251 
252 
253 int HitBundle::_next_id = 0;
254 
255 bool HitBundle::add_hit(const MateHit& hit)
256 {
257 	if (_final)
258     {
259 		return false;
260     }
261 
262 	// Update the bounds on the span
263 	if (hit.left() < _leftmost)
264 		_leftmost = hit.left();
265 	if (hit.right() > _rightmost)
266 		_rightmost = hit.right();
267 
268 
269 	_hits.push_back(hit);
270 	return true;
271 }
272 
273 struct HitlessScaffold
274 {
275 	bool operator()(boost::shared_ptr<Scaffold> x)
276 	{
277 		return x->mate_hits().empty();
278 	}
279 };
280 
281 bool unmapped_hit(const MateHit& x)
282 {
283 	return !(x.is_mapped());
284 }
285 
286 
287 bool HitBundle::add_open_hit(boost::shared_ptr<ReadGroupProperties const> rg_props,
288                              const ReadHit* bh,
289 							 bool expand_by_partner)
290 {
291     assert (bh != NULL);
292 
293 	_leftmost = min(_leftmost, bh->left());
294 	_ref_id = bh->ref_id();
295 
296 	if (bh->is_singleton() || no_read_pairs)
297 	{
298 		_rightmost = max(_rightmost, bh->right());
299 		MateHit m(rg_props, bh->ref_id(), bh, NULL);
300         if (m.right() - m.left() > max_gene_length)
301         {
302             fprintf(stderr, "Warning: hit is longer than max_gene_length, skipping\n");
303             return false;
304         }
305 		add_hit(m);
306 	}
307 	else
308 	{
309         if (abs(bh->right() - bh->partner_pos()+1) > max_gene_length)
310         {
311             fprintf(stderr, "Warning: hit is longer than max_gene_length, skipping\n");
312             return false;
313         }
314 		if (expand_by_partner)
315 			_rightmost = max(max(_rightmost, bh->right()), bh->partner_pos()+1);
316 
317 		uint64_t search_key = bh->insert_id() ^ bh->left();
318 		std::pair<OpenMates::iterator, OpenMates::iterator> its = _open_mates.equal_range(search_key);
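		// Key scheme (sketch): a hit waiting for its mate is filed under
		// (insert_id ^ expected_partner_position); an incoming hit probes
		// (insert_id ^ its_own_left), so it only sees open mates that were
		// expecting a partner at exactly this position.  XOR keeps the key
		// cheap to compute; any collisions are weeded out by the position and
		// insert_id checks in the loop below.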
319 
320 		// Does this hit close an open mate?
321 		bool found_partner = false;
322 		for(OpenMates::iterator it = its.first; it != its.second; ++it) {
323 
324 			MateHit& pm = it->second;
325 
326 			if(pm.left_alignment()->partner_pos() != bh->left())
327 				continue;
328 
329 			if(pm.insert_id() != bh->insert_id())
330 				continue;
331 
332 			{
333 
334 				Scaffold L(MateHit(rg_props, bh->ref_id(), pm.left_alignment(), NULL));
335 				Scaffold R(MateHit(rg_props, bh->ref_id(), bh, NULL));
336 
337 				bool strand_agree = L.strand() == CUFF_STRAND_UNKNOWN ||
338 					R.strand() == CUFF_STRAND_UNKNOWN ||
339 					L.strand() == R.strand();
340 
341 				//bool orientation_agree = pm.left_alignment()->antisense_align() != bh->antisense_align();
342 
343 				if (strand_agree &&
344 					(!Scaffold::overlap_in_genome(L, R, olap_radius) ||
345 					 Scaffold::compatible(L,R)))
346 					{
347 						pm.right_alignment(bh);
348 						add_hit(pm);
349 						_open_mates.erase(it);
350 						// Boost unordered_multimap ordinarily never shrinks.
351 						// Compel it to do so if significantly underloaded.
352 						if(_open_mates.size() > _rehash_size_threshold && _open_mates.load_factor() < _rehash_threshold) {
353 						  _open_mates.rehash(0);
354 						  if(_open_mates.load_factor() < _rehash_threshold) {
355 							// It didn't shrink -- looks like this Boost implementation has a minimum
356 							// size limit, or other reason to keep the table large. Don't keep rehashing with every remove op:
357 							_rehash_size_threshold = _open_mates.size();
358 						  }
359 						}
360 						found_partner = true;
361 						break;
362 					}
363 
364 			}
365 
366 		}
367 
368 		if(!found_partner) {
369 
370 			// Add it to the list of open mates, unless we would
371 			// already have seen its partner
372 			if(bh->left() <= bh->partner_pos())
373 				{
374 					MateHit open_hit(rg_props,
375 									 bh->ref_id(),
376 									 bh,
377 									 NULL);
378 
379 					uint64_t insert_key = bh->insert_id() ^ bh->partner_pos();
380 					_open_mates.insert(make_pair(insert_key, open_hit));
381 				}
382 			else
383 				{
384 					// This should never happen during hit_driven or ref_guided bundling, and in the case of
385 					// ref_driven, this read clearly shouldn't map to any of the transcripts anyway.
386 					// Adding this hit would cause problems with multi-reads that straddle boundaries after assembly.
387 					// add_hit(MateHit(rg_props,bh->ref_id(), bh, NULL));
388 					return false;
389 				}
390 
391 		}
392 
393 	}
394     return true;
395 }
396 
397 void HitBundle::collapse_hits()
398 {
399 	::collapse_hits(_hits, _non_redundant);
400     //_non_redundant = _hits;
401 }
402 
403 void HitBundle::finalize_open_mates()
404 {
405     // We don't want to split reads across boundaries since this would only occur
406     // in ref_driven mode and the read shouldn't map to any of the references in this case.
407 
408     for(OpenMates::iterator itr = _open_mates.begin(); itr != _open_mates.end(); ++itr)
409     {
410 		delete itr->second.left_alignment();
411 		delete itr->second.right_alignment();
412     }
413     _open_mates.clear();
414 }
415 
416 void HitBundle::remove_hitless_scaffolds()
417 {
418 	vector<boost::shared_ptr<Scaffold> >::iterator new_end = remove_if(_ref_scaffs.begin(),
419 												   _ref_scaffs.end(),
420 												   HitlessScaffold());
421 	_ref_scaffs.erase(new_end, _ref_scaffs.end());
422 }
423 
424 
425 
426 void HitBundle::combine(const vector<HitBundle*>& in_bundles,
427                         HitBundle& out_bundle)
428 {
429     out_bundle._hits.clear();
430     out_bundle._non_redundant.clear();
431     out_bundle._ref_scaffs.clear();
432 
433     for (size_t i = 1; i < in_bundles.size(); ++i)
434     {
435         assert(in_bundles[i]->ref_id() == in_bundles[i-1]->ref_id());
436     }
437 
438     // Merge  hits
439     vector<size_t> indices(in_bundles.size(),0);
440     while(true)
441     {
442         int next_bundle = -1;
443         const MateHit* next_hit=NULL;
444         for(size_t i = 0; i < in_bundles.size(); ++i)
445         {
446             const vector<MateHit>& curr_hits = in_bundles[i]->hits();
447 
448             if (indices[i] == curr_hits.size())
449                 continue;
450 
451             const MateHit* curr_hit = &curr_hits[indices[i]];
452 
453             if (next_bundle == -1 || mate_hit_lt(*curr_hit, *next_hit))
454             {
455                 next_bundle = i;
456                 next_hit = curr_hit;
457             }
458         }
459 
460         if(next_bundle==-1)
461             break;
462 
463         out_bundle._hits.push_back(*next_hit);
464         indices[next_bundle]++;
465     }
466 
467     // Merge collapsed hits
468     indices = vector<size_t>(in_bundles.size(), 0);
469     while(true)
470     {
471         int next_bundle = -1;
472         const MateHit* next_hit = NULL;
473         for(size_t i = 0; i < in_bundles.size(); ++i)
474         {
475             const vector<MateHit>& curr_non_redundant_hits = in_bundles[i]->non_redundant_hits();
476 
477             if (indices[i] == curr_non_redundant_hits.size())
478                 continue;
479 
480             const MateHit* curr_hit = &curr_non_redundant_hits[indices[i]];
481 
482             if (next_bundle == -1 || mate_hit_lt(*curr_hit, *next_hit))
483             {
484                 next_bundle = i;
485                 next_hit = curr_hit;
486             }
487         }
488 
489         if(next_bundle==-1)
490             break;
491 
492         out_bundle._non_redundant.push_back(*next_hit);
493         indices[next_bundle]++;
494     }
495 
496     for(size_t i = 0; i < in_bundles.size(); ++i)
497     {
498         for (size_t j = 0; j < in_bundles[i]->_ref_scaffs.size(); ++j)
499         {
500             in_bundles[i]->_ref_scaffs[j]->clear_hits();
501         }
502     }
503 
504     /*
505     // Merge ref scaffolds
506     indices = vector<size_t>(in_bundles.size(), 0);
507     while(true)
508     {
509         int next_bundle = -1;
510         boost::shared_ptr<Scaffold> next_scaff;
511         for(size_t i = 0; i < in_bundles.size(); ++i)
512         {
513             const vector<boost::shared_ptr<Scaffold> >& curr_scaffs = in_bundles[i]->_ref_scaffs;
514 
515             if (indices[i] == curr_scaffs.size())
516                 continue;
517 
518             boost::shared_ptr<Scaffold> curr_scaff = curr_scaffs[indices[i]];
519 
520             if (next_bundle == -1 || scaff_lt_rt_oplt(*curr_scaff, *next_scaff))
521             {
522                 next_bundle = i;
523                 next_scaff = curr_scaff;
524             }
525         }
526 
527         if(next_bundle==-1)
528             break;
529 
530         if (out_bundle._ref_scaffs.size()==0 || out_bundle._ref_scaffs.back()->annotated_trans_id() != next_scaff->annotated_trans_id())
531             out_bundle.add_ref_scaffold(next_scaff);
532         indices[next_bundle]++;
533     }
534 	*/
535 
536     for (size_t i = 0; i < in_bundles.size(); ++i)
537     {
538         for (size_t j = 0; j < in_bundles[i]->ref_scaffolds().size(); ++j)
539         {
540             out_bundle.add_ref_scaffold(in_bundles[i]->ref_scaffolds()[j]);
541         }
542     }
543 
544     sort(out_bundle._ref_scaffs.begin(), out_bundle._ref_scaffs.end(), scaff_lt_rt_oplt_sp);
545     vector<boost::shared_ptr<Scaffold> >::iterator new_end = unique(out_bundle._ref_scaffs.begin(),
546                                                              out_bundle._ref_scaffs.end(),
547                                                              StructurallyEqualScaffolds());
548     out_bundle._ref_scaffs.erase(new_end, out_bundle._ref_scaffs.end());
549     vector<boost::shared_ptr<Scaffold> >(out_bundle._ref_scaffs).swap(out_bundle._ref_scaffs);
550 
551     out_bundle.finalize(true); // true means everything is already sorted, etc.
552     out_bundle._num_replicates = (int)in_bundles.size();
553 }
554 
555 
556 void HitBundle::finalize(bool is_combined)
557 {
558 	_final = true;
559     if (!is_combined)
560 	{
561         // only perform read skipping on primary bundles
562         // (i.e. don't do it on bundles we're making by combining two or more other bundles)
563         size_t num_skipped = _hits.size() * read_skip_fraction;
564         if (num_skipped > 0 && num_skipped < _hits.size())
565         {
566             random_shuffle(_hits.begin(), _hits.end());
567             for (int i = (int)_hits.size() - num_skipped; i >= 0 && i < (int)_hits.size(); ++i)
568             {
569                 delete _hits[i].left_alignment();
570                 _hits[i].left_alignment(NULL);
571 
572                 delete _hits[i].right_alignment();
573                 _hits[i].right_alignment(NULL);
574             }
575             _hits.resize(_hits.size() - num_skipped);
576             is_combined = false;
577         }
578         else if (num_skipped >= _hits.size())
579         {
580             for (size_t i = 0; i < _hits.size(); ++i)
581             {
582                 delete _hits[i].left_alignment();
583                 delete _hits[i].right_alignment();
584             }
585             _hits.clear();
586         }
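        // (Example: with 1,000 hits and read_skip_fraction = 0.25, num_skipped
        // is 250; the hits are shuffled, the last 250 are freed, and the vector
        // is resized to 750, leaving a random 75% sample for this bundle.)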
587 
588 		sort(_hits.begin(), _hits.end(), mate_hit_lt);
589         if (cond_prob_collapse)
590         {
591             collapse_hits();
592         }
593         else
594         {
595             BOOST_FOREACH (MateHit& hit, _hits)
596             {
597                 hit.incr_collapse_mass(hit.internal_scale_mass());
598             }
599             _non_redundant = _hits;
600 
601         }
602 		sort(_ref_scaffs.begin(), _ref_scaffs.end(), scaff_lt_rt_oplt_sp);
603 		vector<boost::shared_ptr<Scaffold> >::iterator new_end = unique(_ref_scaffs.begin(),
604 												_ref_scaffs.end(),
605 												StructurallyEqualScaffolds());
606 		_ref_scaffs.erase(new_end, _ref_scaffs.end());
607         vector<boost::shared_ptr<Scaffold> >(_ref_scaffs).swap(_ref_scaffs);
608 	}
609 
610     for (size_t j = 0; j < _ref_scaffs.size(); ++j)
611 	{
612 		_ref_scaffs[j]->clear_hits();
613 	}
614 
615     _compatible_mass = 0.0;
616 
617 	for (size_t i = 0; i < _hits.size(); ++i)
618 	{
619 		MateHit& hit = _hits[i];
620 
621 		Scaffold hs(hit);
622 
623         if (i >= 1)
624         {
625             assert (hit.ref_id() == _hits[i-1].ref_id());
626         }
627 		hit.is_mapped(false);
628 		for (size_t j = 0; j < _ref_scaffs.size(); ++j)
629 		{
630 			// add hit only adds if the hit is structurally compatible
631 			if (_ref_scaffs[j]->contains(hs))
632 			{
633 				bool added = _ref_scaffs[j]->add_hit(&hit);
634                 if (added)
635                     hit.is_mapped(true);
636 			}
637 		}
638         if (hit.is_mapped())
639         {
640             _compatible_mass += hit.internal_scale_mass();
641         }
642 	}
643 
644 }
645 
646 void print_sort_error(const char* last_chr_name,
647                       int last_chr_pos,
648                       const char* bh_name,
649                       int bh_pos)
650 {
651     fprintf(stderr, "\nError: this SAM file doesn't appear to be correctly sorted!\n");
652     fprintf(stderr, "\tcurrent hit is at %s:%d, last one was at %s:%d\n",
653             bh_name,
654             bh_pos,
655             last_chr_name,
656             last_chr_pos);
657     fprintf(stderr, "Cufflinks requires that if your file has SQ records in\nthe SAM header that they appear in the same order as the chromosome names \nin the alignments.\nIf there are no SQ records in the header, or if the header is missing,\nthe alignments must be sorted lexicographically by chromosome\nname and by position.\n \n");
658 }
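// A SAM file without SQ header records can usually be put into the required
// order with a lexicographic-then-numeric sort (illustrative command only;
// adapt to your pipeline):
//
//   sort -k 3,3 -k 4,4n hits.sam > hits.sorted.sam
//
// Coordinate-sorted BAM files generally satisfy the SQ-ordered case, since
// their alignments follow the header's sequence order.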
659 
660 
661 double BundleFactory::next_valid_alignment(const ReadHit*& bh)
662 {
663     bh = NULL;
664 
665 	// Keep track of mass of hits we skip
666 	double raw_mass = 0;
667 
668     while (_hit_fac->records_remain())
669     {
670         ReadHit tmp;
671         if (!_hit_fac->next_hit(tmp))
672             continue;
673 
674 		if (tmp.ref_id() == 12638153115695167477)  // corresponds to SAM "*" under FNV hash. unaligned read record
675             continue;
676 
677 		raw_mass += tmp.mass();
678 
679         if (_hit_fac->ref_table().get_name(tmp.ref_id())==NULL) // unaligned read record (!?)
680             continue;
681 
682         if (spans_bad_intron(tmp))
683             continue;
684 
685 		// Check for reads with no matching CIGAR entries. Generally such a read should have been rejected
686 		// as unmapped, but such records have been seen in the wild. If they were allowed to stay they would cause
687 		// trouble when converted to Scaffolds, yielding an invalid aug_ops vector.
688 		const vector<CigarOp>& cig = tmp.cigar();
689 		bool found_match = false;
690 		for(vector<CigarOp>::const_iterator it = cig.begin(), itend = cig.end(); it != itend && !found_match; ++it)
691 			if(it->opcode == MATCH)
692 				found_match = true;
693 
694 		if(!found_match) {
695 			fprintf(stderr, "Skipping hit with no Match operators in its CIGAR string\n");
696 			continue;
697 		}
698 
699         int order = _hit_fac->ref_table().observation_order(tmp.ref_id());
700         if (_prev_pos != 0)
701         {
702             int prev_order = _hit_fac->ref_table().observation_order(_prev_ref_id);
703 
704             if (prev_order > order || (prev_order == order && _prev_pos > tmp.left()))
705             {
706                 const char* bh_chr_name = _hit_fac->ref_table().get_name(tmp.ref_id());
707                 const char* last_bh_chr_name = _hit_fac->ref_table().get_name(_prev_ref_id);
708 
709                 print_sort_error(last_bh_chr_name,
710                                  _prev_pos,
711                                  bh_chr_name,
712                                  tmp.left());
713                 exit(1);
714             }
715         }
716 
717         _prev_ref_id = tmp.ref_id();
718         _prev_pos = tmp.left();
719 
720         bool hit_within_mask = false;
721 
722         // We want to skip stuff that overlaps masked GTF records, so
723         // sync up the masking chromosome
724         if (!mask_gtf_recs.empty() &&
725             next_mask_scaff != mask_gtf_recs.end() &&
726             (*next_mask_scaff)->ref_id() != tmp.ref_id())
727         {
728             bool found_scaff = false;
729             vector<boost::shared_ptr<Scaffold> >::iterator curr_mask_scaff = mask_gtf_recs.begin();
730             for (size_t i = 0; i < _mask_scaff_offsets.size(); ++i)
731             {
732                 if (_mask_scaff_offsets[i].first == tmp.ref_id())
733                 {
734                     curr_mask_scaff = _mask_scaff_offsets[i].second;
735                     found_scaff = true;
736                     break;
737                 }
738             }
739 
740             next_mask_scaff = curr_mask_scaff;
741         }
742 
743         //check that we aren't sitting in the middle of a masked scaffold
744         while (next_mask_scaff != mask_gtf_recs.end() &&
745                (*next_mask_scaff)->ref_id() == tmp.ref_id() &&
746                (*next_mask_scaff)->right() <= tmp.left())
747         {
748             if ((*next_mask_scaff)->left() >= tmp.left())
749             {
750                 break;
751             }
752 
753             next_mask_scaff++;
754         }
755 
756         if (next_mask_scaff != mask_gtf_recs.end() &&
757             (*next_mask_scaff)->ref_id() == tmp.ref_id() &&
758             (*next_mask_scaff)->left() <= tmp.left() &&
759             (*next_mask_scaff)->right() >= tmp.right())
760         {
761             hit_within_mask = true;
762         }
763 
764         if (hit_within_mask)
765             continue;
766 
767         // if the user's asked for read trimming, do it here.
768         if (trim_read_length > 0)
769         {
770             tmp.trim(trim_read_length);
771         }
772 
773         bh = new ReadHit(tmp);
774 
775         break;
776     }
777 
778     return raw_mass;
779 }
780 
781 double BundleFactory::rewind_hit(const ReadHit* rh)
782 {
783 	double mass = rh->mass();
784 	delete rh;
785 	_hit_fac->undo_hit();
786 	return mass;
787 }
788 
789 bool BundleFactory::next_bundle_hit_driven(HitBundle& bundle)
790 {
791 	const ReadHit* bh = NULL;
792 
793     bool skip_read = false;
794 
795 	while(bh == NULL)
796 	{
797 		if (!_hit_fac->records_remain())
798 		{
799 			return false;
800 		}
801 
802         // If we are randomly throwing out reads, check to see
803         // whether this one should be kept.
804         if (bundle.hits().size() >= max_frags_per_bundle)
805         {
806             skip_read = true;
807             next_valid_alignment(bh);
808         }
809         else
810         {
811             double raw_mass = next_valid_alignment(bh);
812             if (bh && bh->num_hits() > max_frag_multihits)
813             {
814                 skip_read = true;
815             }
816             else
817             {
818                 bundle.add_raw_mass(raw_mass);
819             }
820         }
821 	}
822 
823 	if ((skip_read || !bundle.add_open_hit(read_group_properties(), bh)) && bh != NULL)
824     {
825         delete bh;
826         bh = NULL;
827     }
828 	_expand_by_hits(bundle);
829 
830     assert(bundle.left() != -1);
831 	bundle.finalize_open_mates();
832 	bundle.finalize();
833     assert(bundle.right() != -1);
834 
835     return true;
836 }
837 
838 bool BundleFactory::next_bundle_ref_driven(HitBundle& bundle)
839 {
840 	if (next_ref_scaff == ref_mRNAs.end())
841 	{
842 		const ReadHit* bh = NULL;
843 		while(_hit_fac->records_remain())
844 		{
845             double raw_mass = next_valid_alignment(bh);
846             if (bundle.hits().size() < max_frags_per_bundle)
847             {
848                 if (bh && bh->num_hits() > max_frag_multihits)
849                 {
850 
851                 }
852                 else
853                 {
854                     bundle.add_raw_mass(raw_mass);
855                 }
856                 if (bh) { delete bh; }
857             }
858             else
859             {
860                 delete bh;
861                 bh = NULL;
862             }
863 
864 		}
865 		bundle.finalize();
866 		return false;
867 	}
868 
869 	bundle.add_ref_scaffold(*next_ref_scaff);
870 
871 	++next_ref_scaff;
872 
873 	_expand_by_refs(bundle);
874 
875 //    for (size_t i = 0; i < bundle.ref_scaffolds().size(); ++i)
876 //    {
877 //        boost::shared_ptr<Scaffold> s = bundle.ref_scaffolds()[i];
878 //        if (s->annotated_gene_id() == "ERCC-00002")
879 //        {
880 //                int a = 4;
881 //        }
882 //    }
883 
884 
885 	// The most recent RefID and position we've seen in the hit stream
886 	RefID last_hit_ref_id_seen = 0;
887 	int last_hit_pos_seen = 0;
888 
889 	// include hits that lie within the bundle interval
890 	while(true)
891 	{
892 		const ReadHit* bh = NULL;
893 
894         bool skip_read = false;
895 		// If we are randomly throwing out reads, check to see
896         // whether this one should be kept.
897         if (bundle.hits().size() >= max_frags_per_bundle)
898         {
899             next_valid_alignment(bh);
900             skip_read = true;
901         }
902         else
903         {
904             double raw_mass = next_valid_alignment(bh);
905             if (bh && bh->num_hits() > max_frag_multihits)
906             {
907                 skip_read = true;
908             }
909             else
910             {
911                 bundle.add_raw_mass(raw_mass);
912             }
913         }
914 
915         if (bh == NULL)
916         {
917 			if (_hit_fac->records_remain())
918 				continue;
919 			else
920 				break;
921         }
922 
923 		last_hit_ref_id_seen = bh->ref_id();
924 		last_hit_pos_seen = bh->left();
925 
926 		// test if the hit stream needs to catch up or has gone too far based on ref_id
927 		if (bh->ref_id() != bundle.ref_id())
928 		{
929 			int bh_chr_order = _hit_fac->ref_table().observation_order(bh->ref_id());
930 			int bundle_chr_order = _hit_fac->ref_table().observation_order(bundle.ref_id());
931 
932 			if (bh_chr_order < bundle_chr_order) // the hit stream has not caught up, skip
933 			{
934 				delete bh;
935                 bh = NULL;
936 				continue;
937 			}
938 			else // the hit stream has gone too far, rewind and break
939 			{
940                 rewind_hit(bh);
941                 bh = NULL;
942                 break;
943 			}
944 		}
945 
946         if (bh == NULL) // the hit stream has gone too far, break
947             break;
948 
949         if (bh->left() >= bundle.left() && bh->right() <= bundle.right())
950 		{
951             if (skip_read)
952             {
953                 delete bh;
954                 bh = NULL;
955             }
956 			else
957             {
958                 if (!bundle.add_open_hit(read_group_properties(), bh, false))
959                 {
960                     delete bh;
961                     bh = NULL;
962                 }
963             }
964 		}
965 		else if (bh->left() >= bundle.right())
966 		{
967             if (skip_read == false)
968             {
969                 bundle.rem_raw_mass(rewind_hit(bh));
970                 bh = NULL;
971             }
972             else
973             {
974                 delete bh;
975                 bh = NULL;
976             }
977 			break;
978 		}
979 	    else
980         {
981             // It's not within the bundle bounds, but it's also not past the
982             // right end, so skip it.
983             delete bh;
984             bh = NULL;
985         }
986 
987         if (skip_read == true && bh != NULL)
988         {
989             delete bh;
990             bh = NULL;
991         }
992 	}
993 
994     assert(bundle.left() != -1);
995     bundle.finalize_open_mates();
996 	bundle.finalize();
997     assert(bundle.right() != -1);
998 
999     return true;
1000 }
1001 
1002 // NOTE: does not yet support read skipping or max hits per bundle.
1003 bool BundleFactory::next_bundle_ref_guided(HitBundle& bundle)
1004 {
1005 
1006 	if (next_ref_scaff == ref_mRNAs.end())
1007 	{
1008 		return next_bundle_hit_driven(bundle);
1009 	}
1010 
1011 	const ReadHit* bh = NULL;
1012 	while(bh == NULL)
1013 	{
1014 		if (!_hit_fac->records_remain())
1015 		{
1016 			return next_bundle_ref_driven(bundle);
1017 		}
1018 		bundle.add_raw_mass(next_valid_alignment(bh));
1019 	}
1020 
1021 	if (bh->ref_id() != (*next_ref_scaff)->ref_id())
1022 	{
1023 		int bh_chr_order = _hit_fac->ref_table().observation_order(bh->ref_id());
1024 		int scaff_chr_order = _hit_fac->ref_table().observation_order((*next_ref_scaff)->ref_id());
1025 
1026 		bundle.rem_raw_mass(rewind_hit(bh));
1027 		bh = NULL;
1028 
1029 		if (bh_chr_order < scaff_chr_order)
1030 		{
1031 			return next_bundle_hit_driven(bundle);
1032 		}
1033 		else
1034 		{
1035 			return next_bundle_ref_driven(bundle);
1036 		}
1037 	}
1038 
1039 	if (bh->left() < (*next_ref_scaff)->left())
1040 	{
1041 		if (!bundle.add_open_hit(read_group_properties(), bh))
1042         {
1043             delete bh;
1044             bh = NULL;
1045         }
1046 	}
1047 	else
1048 	{
1049 		bundle.rem_raw_mass(rewind_hit(bh));
1050         bh = NULL;
1051 
1052 		bundle.add_ref_scaffold(*next_ref_scaff);
1053 		next_ref_scaff++;
1054 		_expand_by_refs(bundle);
1055 	}
1056 
1057 	while(_expand_by_hits(bundle) ||
1058 		  _expand_by_refs(bundle)) {}
1059 
1060 	assert(bundle.left() != -1);
1061 	bundle.finalize_open_mates();
1062 	bundle.finalize();
1063 	assert(bundle.right() != -1);
1064 
1065 	return true;
1066 }
1067 
1068 // expand the bundle interval as far as needed to include the overlapping
1069 // chain of reference transcripts that also overlap the initial bundle
1070 // interval
1071 bool BundleFactory::_expand_by_refs(HitBundle& bundle)
1072 {
1073 	int initial_right = bundle.right();
1074 	while(next_ref_scaff < ref_mRNAs.end())
1075 	{
1076 		assert(bundle.ref_id() != (*next_ref_scaff)->ref_id() || (*next_ref_scaff)->left() >= bundle.left());
1077 //        if (*next_ref_scaff && (*next_ref_scaff)->annotated_gene_id() == "XLOC_009372")
1078 //        {
1079 //            int a = 5;
1080 //        }
1081 		if (bundle.ref_id() == (*next_ref_scaff)->ref_id()
1082 			&& overlap_in_genome((*next_ref_scaff)->left(),(*next_ref_scaff)->right(),bundle.left(), bundle.right()))
1083 		{
1084 			bundle.add_ref_scaffold(*next_ref_scaff);
1085             next_ref_scaff++;
1086 		}
1087 		else
1088 		{
1089 			break;
1090 		}
1091 	}
1092 
1093 
1094 	return (bundle.right() > initial_right);
1095 }
1096 
1097 // expand bundle by chaining overlapping hits
1098 bool BundleFactory::_expand_by_hits(HitBundle& bundle)
1099 {
1100 	int initial_right = bundle.right();
1101 	while(true)
1102 	{
1103         bool skip_read = false;
1104         const ReadHit* bh = NULL;
1105 
1106         double raw_mass = next_valid_alignment(bh);
1107         if (bh && bh->num_hits() > max_frag_multihits)
1108         {
1109             skip_read = true;
1110         }
1111         else
1112         {
1113             bundle.add_raw_mass(raw_mass);
1114         }
1115 
1116 		if (bh == NULL)
1117 		{
1118 			if (_hit_fac->records_remain())
1119 			{
1120 				continue;
1121 			}
1122 			else
1123 			{
1124 				break;
1125 			}
1126 		}
1127 
1128 		if (bh->ref_id() == bundle.ref_id() && bh->left() < bundle.right() + olap_radius)
1129 		{
1130 			if (skip_read || !bundle.add_open_hit(read_group_properties(), bh))
1131             {
1132                 delete bh;
1133                 bh = NULL;
1134             }
1135 		}
1136 		else
1137 		{
1138 			bundle.rem_raw_mass(rewind_hit(bh));
1139 
1140 			break;
1141 		}
1142 	}
1143 
1144 	return (bundle.right() > initial_right);
1145 }
1146 
1147 bool BundleFactory::next_bundle(HitBundle& bundle, bool cache_bundle)
1148 {
1149 #if ENABLE_THREADS
1150     boost::mutex::scoped_lock lock(_factory_lock);
1151 #endif
1152     bool got_bundle = false;
1153 	switch(_bundle_mode)
1154 	{
1155 		case HIT_DRIVEN:
1156             _curr_bundle++;
1157 			got_bundle = next_bundle_hit_driven(bundle);
1158             bundle.id(_curr_bundle);
1159 			break;
1160 		case REF_DRIVEN:
1161             _curr_bundle++;
1162 			got_bundle = next_bundle_ref_driven(bundle);
1163             bundle.id(_curr_bundle);
1164 			break;
1165 		case REF_GUIDED:
1166             _curr_bundle++;
1167 			got_bundle = next_bundle_ref_guided(bundle);
1168             bundle.id(_curr_bundle);
1169 			break;
1170 	}
1171 	return got_bundle;
1172 }
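// Typical consumption pattern (sketch; inspect_map() below is the real
// driver loop in this file):
//
//   while (true)
//   {
//       HitBundle bundle;
//       if (!factory.next_bundle(bundle, false))
//           break;
//       // ... use bundle.hits(), bundle.non_redundant_hits(),
//       //     bundle.ref_scaffolds() ...
//   }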
1173 
1174 
1175 struct IntronSpanCounter
1176 {
1177 	IntronSpanCounter() : left_reads(0), little_reads(0), total_reads(0), multimap_reads(0), fwd_strand_frags(0) {}
1178 	size_t left_reads;
1179 	size_t little_reads; // small span overhang
1180 	size_t total_reads;
1181 	size_t multimap_reads;
1182     size_t fwd_strand_frags;
1183 	vector<size_t> hist;
1184 };
1185 
1186 typedef map<AugmentedCuffOp, IntronSpanCounter> IntronCountTable;
1187 
1188 void count_introns_in_read(const ReadHit& read,
1189 						   IntronCountTable& intron_counts)
1190 {
1191 	const vector<CigarOp>& cig = read.cigar();
1192 
1193 	int read_len = read.read_len();
1194 	int small_anchor = (int)floor(read_len * small_anchor_fraction);
1195 
1196 	int r_left = 0;
1197 	int g_left = read.left();
1198 
1199 	for (size_t i = 0; i < cig.size(); ++i)
1200 	{
1201 		assert(cig[i].length >= 0);
1202 		switch(cig[i].opcode)
1203 		{
1204 			case MATCH:
1205 				//ops.push_back(AugmentedCuffOp(CUFF_MATCH, g_left, cig[i].length));
1206 				g_left += cig[i].length;
1207 				r_left += cig[i].length;
1208 				break;
1209 
1210 			case REF_SKIP:
1211 			{
1212 				AugmentedCuffOp intron(CUFF_INTRON, g_left, cig[i].length);
1213 				pair<IntronCountTable::iterator, bool> ins_itr;
1214 				ins_itr = intron_counts.insert(make_pair(intron, IntronSpanCounter()));
1215 				IntronCountTable::iterator itr = ins_itr.first;
1216 				itr->second.total_reads++;
1217 
1218 				if (read.num_hits() > 10)
1219 				{
1220 					itr->second.multimap_reads++;
1221 				}
1222 
1223 				if ( r_left <= small_anchor || (read_len - r_left) < small_anchor)
1224 				{
1225 					itr->second.little_reads++;
1226 				}
1227 
1228                 if (read.source_strand() == CUFF_FWD)
1229                 {
1230                     //itr->second.fwd_strand_frags;
1231                 }
1232                 else
1233                 {
1234                     assert(read.source_strand() == CUFF_REV);
1235                 }
1236 
1237 
1238 				vector<size_t>& hist = itr->second.hist;
1239 				if (hist.size() < (size_t)read_len)
1240 				{
1241 					size_t num_new_bins = read_len - hist.size();
1242 					size_t new_left_bins = (size_t)floor(num_new_bins / 2.0);
1243 					size_t new_right_bins = (size_t)ceil(num_new_bins / 2.0);
1244 					hist.insert(hist.begin(), new_left_bins, 0);
1245 					hist.insert(hist.end(), new_right_bins, 0);
1246 				}
1247 
1248 				assert (r_left < hist.size());
1249 				hist[r_left]++;
1250 				//ops.push_back(AugmentedCuffOp(CUFF_INTRON, g_left, cig[i].length));
1251 				g_left += cig[i].length;
1252 				break;
1253 			}
1254 
1255 			case SOFT_CLIP:
1256 				g_left += cig[i].length;
1257 				break;
1258             case HARD_CLIP:
1259 				break;
1260             case INS:
1261                 g_left -= cig[i].length;
1262                 break;
1263             case DEL:
1264                 g_left += cig[i].length;
1265                 break;
1266 			default:
1267 				assert(false);
1268 				break;
1269 		}
1270 	}
1271 }
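// Worked example (sketch): a 100 bp read starting at genomic position 1000
// with CIGAR 50M200N50M yields one intron key (CUFF_INTRON at 1050, length
// 200).  Its r_left at the junction is 50, so with, say,
// small_anchor_fraction = 0.09 (small_anchor = 9) it does not count as a
// small-overhang ("little") read.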
1272 
1273 void minor_introns(int bundle_length,
1274 				   int bundle_left,
1275 				   const IntronCountTable& intron_counts,
1276 				   vector<AugmentedCuffOp>& bad_introns,
1277 				   double fraction)
1278 
1279 {
1280 	for(IntronCountTable::const_iterator itr = intron_counts.begin();
1281 		itr != intron_counts.end();
1282 		++itr)
1283 	{
1284 		pair<AugmentedCuffOp, IntronSpanCounter> itr_cnt_pair = *itr;
1285 		const IntronSpanCounter itr_spans = itr_cnt_pair.second;
1286 
1287 		double doc = itr_spans.total_reads;
1288 
1289 		for (IntronCountTable::const_iterator itr2 = intron_counts.begin();
1290 			 itr2 != intron_counts.end();
1291 			 ++itr2)
1292 		{
1293 			if (itr == itr2 ||
1294 				!AugmentedCuffOp::overlap_in_genome(itr->first, itr2->first))
1295 			{
1296 				continue;
1297 			}
1298 
1299 			pair<AugmentedCuffOp, IntronSpanCounter> itr2_cnt_pair = *itr2;
1300 			const IntronSpanCounter itr2_spans = itr2_cnt_pair.second;
1301 
1302 			double thresh = itr2_spans.total_reads * fraction;
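			// (Example: if the overlapping intron itr2 is spanned by 200 reads
			// and fraction = 0.1, the current intron is filtered whenever its
			// own span count falls below 20.)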
1303 			if (doc < thresh)
1304 			{
1305 				//#if verbose_msg
1306 				//							fprintf(stderr, "\t Filtering intron (due to overlap) %d - %d: %f thresh %f\n", itr->first.first, itr->first.second, doc, bundle_avg_thresh);
1307 				//#endif
1308 				bool exists = binary_search(bad_introns.begin(),
1309 											bad_introns.end(),
1310 											itr->first);
1311 				if (!exists)
1312 				{
1313 					verbose_msg("Filtering intron %d-%d spanned by %lu reads based on overlap with much more abundant intron: %d-%d spanned by %lu reads\n",
1314 							itr->first.g_left(),
1315 							itr->first.g_right(),
1316 							itr->second.total_reads,
1317 							itr2->first.g_left(),
1318 							itr2->first.g_right(),
1319 							itr2->second.total_reads);
1320 
1321 					bad_introns.push_back(itr->first);
1322 					sort(bad_introns.begin(), bad_introns.end());
1323 				}
1324 			}
1325 
1326 //            if ((itr->second.fwd_strand_frags == 0 &&
1327 //                 itr2->second.fwd_strand_frags != 0) ||
1328 //                (itr2->second.fwd_strand_frags == 0 &&
1329 //                 itr->second.fwd_strand_frags != 0))
1330 //            {
1331 //                int itr1_L = itr->first.g_left();
1332 //                int itr1_R = itr->first.g_right();
1333 //                int itr2_L = itr2->first.g_left();
1334 //                int itr2_R = itr2->first.g_right();
1335 //
1336 //                if (abs(itr1_L - itr2_L) < 25 && abs(itr1_R - itr2_R) < 25)
1337 //                {
1338 //                    int a = 3;
1339 //                }
1340 //            }
1341 		}
1342 	}
1343 }
1344 
1345 void multimapping_introns(int bundle_length,
1346 						  int bundle_left,
1347 						  const IntronCountTable& intron_counts,
1348 						  vector<AugmentedCuffOp>& bad_introns,
1349 						  double fraction)
1350 
1351 {
1352 	for(IntronCountTable::const_iterator itr = intron_counts.begin();
1353 		itr != intron_counts.end();
1354 		++itr)
1355 	{
1356 		pair<AugmentedCuffOp, IntronSpanCounter> itr_cnt_pair = *itr;
1357 		const IntronSpanCounter itr_spans = itr_cnt_pair.second;
1358 
1359 		double doc = itr_spans.total_reads;
1360 		double multi = itr_spans.multimap_reads;
1361 
1362 		double multi_fraction = multi / doc;
1363 
1364 		if (multi_fraction > fraction)
1365 		{
1366 			bool exists = binary_search(bad_introns.begin(),
1367 										bad_introns.end(),
1368 										itr->first);
1369 			if (!exists)
1370 			{
1371 				verbose_msg("Filtering intron %d-%d spanned by %lu reads because %lg percent are multireads.\n",
1372 						itr->first.g_left(),
1373 						itr->first.g_right(),
1374 						itr->second.total_reads,
1375 						multi_fraction * 100);
1376 
1377 				bad_introns.push_back(itr->first);
1378 				sort(bad_introns.begin(), bad_introns.end());
1379 			}
1380 		}
1381 	}
1382 }
1383 
1384 
1385 void identify_bad_splices(const HitBundle& bundle,
1386 						  BadIntronTable& bad_splice_ops)
1387 {
1388 	// Tracks, for each intron, how many reads span it (plus overhang and multiread counts)
1389 	IntronCountTable intron_counts;
1390 
1391 	RefID ref_id = bundle.ref_id();
1392 
1393 	pair<BadIntronTable::iterator, bool> ins_itr;
1394 	ins_itr = bad_splice_ops.insert(make_pair(ref_id, vector<AugmentedCuffOp>()));
1395 	vector<AugmentedCuffOp>& bad_introns = ins_itr.first->second;
1396 
1397 	BOOST_FOREACH (const MateHit& hit, bundle.hits())
1398 	{
1399 		if (hit.left_alignment())
1400 		{
1401 			count_introns_in_read(*hit.left_alignment(), intron_counts);
1402 		}
1403 		if (hit.right_alignment())
1404 		{
1405 			count_introns_in_read(*hit.right_alignment(), intron_counts);
1406 		}
1407 	}
1408 
1409 	minor_introns(bundle.length(), bundle.left(), intron_counts, bad_introns, min_isoform_fraction);
1410 	// [Geo] disable filtering of multi-mapped introns:
1411   // multimapping_introns(bundle.length(), bundle.left(), intron_counts, bad_introns, 0.5);
1412 	for (IntronCountTable::iterator itr = intron_counts.begin();
1413 		 itr != intron_counts.end();
1414 		 ++itr)
1415 	{
1416 		if (binary_search(bad_introns.begin(),
1417 						  bad_introns.end(),
1418 						  itr->first))
1419 		{
1420 			continue;
1421 		}
1422 		pair<AugmentedCuffOp, IntronSpanCounter> cnt_pair = *itr;
1423 		try
1424 		{
1425 			const IntronSpanCounter spans = cnt_pair.second;
1426 
1427 			//			binomial read_half_dist(spans.total_reads, success_fraction);
1428 			//			double left_side_p = cdf(read_half_dist, spans.total_reads - spans.left_reads);
1429 			//			double right_side_p = cdf(complement(read_half_dist, spans.left_reads));
1430 
1431 
1432 			double success = 2 * small_anchor_fraction;
1433 
1434 			binomial read_half_dist(spans.total_reads, success);
1435 			double right_side_p;
1436 
1437 			// right_side_p describes the chance that we'd observe at least
1438 			// this many small overhang reads by chance with an unbiased
1439 			// distribution over a normal (e.g. non-artifact) junction
1440 			if (spans.little_reads > 0)
1441 			{
1442 				right_side_p = 1.0 - cdf(read_half_dist, spans.little_reads - 1);
1443 			}
1444 			else
1445 			{
1446 				right_side_p = 1.0;
1447 			}
1448 
1449 			double left_side_p = 0;
1450 			double expected = success * spans.total_reads;
1451 
1452             //double excess = spans.little_reads - expected;
1453 
1454 			// left_side_p describes the chance that we'd observe this few or
1455 			// fewer small overhang reads by chance with an unbiased
1456 			// distribution over a normal (e.g. non-artifact) junction
1457 			if (spans.little_reads > 0)
1458 			{
1459 				left_side_p = cdf(read_half_dist, spans.little_reads);
1460 			}
1461 			else
1462 			{
1463 				left_side_p = cdf(read_half_dist, 0);
1464 			}
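			// Numeric sketch (illustrative values only): with
			// small_anchor_fraction = 0.09, success = 0.18; an intron spanned
			// by 100 reads is expected to have ~18 small-overhang reads.
			// Observing, say, 40 of them gives
			// right_side_p = 1 - cdf(Binomial(100, 0.18), 39), a vanishingly
			// small tail probability, flagging the junction as suspicious.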
1465 
1466 			//double alpha = 0.05;
1467 			//double right_side_p = 0;
1468 
1469 			// Two-tailed binomial test:
1470 //			if (left_side_p < (binomial_junc_filter_alpha / 2.0) ||
1471 //				right_side_p < (binomial_junc_filter_alpha / 2.0))
1472 			// One-tailed binomial test
1473 
1474 			bool filtered = false;
1475 
1476 			const IntronSpanCounter& counter = itr->second;
1477 
1478 			if (right_side_p < (binomial_junc_filter_alpha))
1479 			{
1480 				double overhang_ratio = counter.little_reads / (double) counter.total_reads;
1481 				if (counter.total_reads < 100 || overhang_ratio >= 0.50)
1482 				{
1483 					verbose_msg("Filtering intron %d-%d spanned by %lu reads (%lu low overhang, %lg expected) left P = %lg, right P = %lg\n",
1484 							itr->first.g_left(),
1485 							itr->first.g_right(),
1486 							itr->second.total_reads,
1487 							itr->second.little_reads,
1488 							expected,
1489 							left_side_p,
1490 							right_side_p);
1491 					filtered = true;
1492 
1493 					bool exists = binary_search(bad_introns.begin(),
1494 												bad_introns.end(),
1495 												itr->first);
1496 					if (!exists)
1497 					{
1498 						bad_introns.push_back(itr->first);
1499 						sort(bad_introns.begin(), bad_introns.end());
1500 					}
1501 				}
1502 			}
1503 
1504 			vector<size_t> hist = itr->second.hist;
1505 			if (itr->second.total_reads > 1000)
1506 			{
1507 				sort(hist.begin(), hist.end());
1508 				size_t median = (size_t)floor(hist.size() / 2);
1509 				if (median <= hist.size() && hist[median] == 0)
1510 				{
1511 					verbose_msg("Filtering intron %d-%d spanned by %lu reads (%lu low overhang, %lg expected) left P = %lg, right P = %lg\n",
1512 							itr->first.g_left(),
1513 							itr->first.g_right(),
1514 							itr->second.total_reads,
1515 							itr->second.little_reads,
1516 							expected,
1517 							left_side_p,
1518 							right_side_p);
1519 
1520 					filtered = true;
1521 
1522 					bool exists = binary_search(bad_introns.begin(),
1523 												bad_introns.end(),
1524 												itr->first);
1525 					if (!exists)
1526 					{
1527 						bad_introns.push_back(itr->first);
1528 						sort(bad_introns.begin(), bad_introns.end());
1529 					}
1530 				}
1531 			}
1532 
1533 			if (!filtered)
1534 			{
1535 				verbose_msg("Accepting intron %d-%d spanned by %lu reads (%lu low overhang, %lg expected) left P = %lg, right P = %lg\n",
1536 						itr->first.g_left(),
1537 						itr->first.g_right(),
1538 						itr->second.total_reads,
1539 						itr->second.little_reads,
1540 						expected,
1541 						left_side_p,
1542 						right_side_p);
1543 
1544 			}
1545 		}
1546 
1547 
1548 		catch(const std::exception& e)
1549 		{
1550 			//
1551 			/*`
1552 			 [#coinflip_eg_catch]
1553 			 It is always essential to include try & catch blocks because
1554 			 default policies are to throw exceptions on arguments that
1555 			 are out of domain or cause errors like numeric-overflow.
1556 
1557 			 Lacking try & catch blocks, the program will abort, whereas the
1558 			 message below from the thrown exception will give some helpful
1559 			 clues as to the cause of the problem.
1560 			 */
1561 			std::cout <<
1562 			"\n""Message from thrown exception was:\n   " << e.what() << std::endl;
1563 		}
1564 
1565 	}
1566 }
1567 
1568 bool BundleFactory::spans_bad_intron(const ReadHit& read)
1569 {
1570 
1571 	const vector<CigarOp>& cig = read.cigar();
1572 
1573 	size_t g_left = read.left();
1574 	BadIntronTable::const_iterator itr = _bad_introns.find(read.ref_id());
1575 	if (itr == _bad_introns.end())
1576 		return false;
1577 
1578 	const vector<AugmentedCuffOp>& bi = itr->second;
1579 	for (size_t i = 0; i < cig.size(); ++i)
1580 	{
1581 		assert(cig[i].length >= 0);
1582 		switch(cig[i].opcode)
1583 		{
1584 			case MATCH:
1585 				//ops.push_back(AugmentedCuffOp(CUFF_MATCH, g_left, cig[i].length));
1586 				g_left += cig[i].length;
1587 				break;
1588 
1589 			case REF_SKIP:
1590 			{
1591 				AugmentedCuffOp intron(CUFF_INTRON, g_left, cig[i].length);
1592 				if (binary_search(bi.begin(), bi.end(), intron))
1593 				{
1594 					return true;
1595 				}
1596 
1597 				//ops.push_back(AugmentedCuffOp(CUFF_INTRON, g_left, cig[i].length));
1598 				g_left += cig[i].length;
1599 				break;
1600 			}
1601 
1602 			case SOFT_CLIP:
1603 				g_left += cig[i].length;
1604 				break;
1605 
1606             case HARD_CLIP:
1607 				break;
1608             case INS:
1609                 g_left -= cig[i].length;
1610                 break;
1611             case DEL:
1612                 g_left += cig[i].length;
1613                 break;
1614 			default:
1615 				assert(false);
1616 				break;
1617 		}
1618 	}
1619 
1620 	return false;
1621 }
1622 
1623 void inspect_map(boost::shared_ptr<BundleFactory> bundle_factory,
1624                  BadIntronTable* bad_introns,
1625                  vector<LocusCount>& compatible_count_table,
1626                  vector<LocusCount>& total_count_table,
1627                  IdToLocusMap& id_to_locus_map,
1628                  bool progress_bar,
1629                  bool show_stats)
1630 {
1631 
1632 	ProgressBar p_bar;
1633 	if (progress_bar)
1634 		p_bar = ProgressBar("Inspecting reads and determining fragment length distribution.",bundle_factory->ref_table().size());
1635 	RefID last_chrom = 0;
1636 
1637 	long double map_mass = 0.0;
1638     long double norm_map_mass = 0.0;
1639 
1640 	int min_len = numeric_limits<int>::max();
1641 	int max_len = def_max_frag_len;
1642 	vector<double> frag_len_hist(def_max_frag_len+1,0);
1643 	bool has_pairs = false;
1644 
1645 	int num_bundles = 0;
1646 	size_t total_hits = 0;
1647 	size_t total_non_redundant_hits = 0;
1648 
1649 	//To be used for quartile normalization
1650 	vector<long double> mass_dist;
1651 
1652 	// Store the maximum read length for "first" and "second" reads to report to user.
1653 	int max_1 = 0;
1654 	int max_2 = 0;
1655 
1656 	boost::shared_ptr<MultiReadTable> mrt(new MultiReadTable());
1657 
1658 	while(true)
1659 	{
1660 		HitBundle* bundle_ptr = new HitBundle();
1661 
1662 		bool valid_bundle = bundle_factory->next_bundle(*bundle_ptr, false);
1663 		HitBundle& bundle = *bundle_ptr;
1664 
1665         if (use_compat_mass) //only count hits that are compatible with ref transcripts
1666         {
1667             // Take raw mass even if bundle is "empty", since we could be out of refs
1668             // with remaining hits
1669             map_mass += bundle.compatible_mass();
1670 //            if (lib_norm_method == QUARTILE && bundle.compatible_mass() > 0)
1671 //            {
1672 //                mass_dist.push_back(bundle.compatible_mass());
1673 //            }
1674         }
1675         else if (use_total_mass) //use all raw mass
1676         {
1677 
1678             // Take raw mass even if bundle is "empty", since we could be out of refs
1679             // with remaining hits
1680             map_mass += bundle.raw_mass();
1681 //            if (lib_norm_method == QUARTILE && bundle.raw_mass() > 0)
1682 //            {
1683 //                mass_dist.push_back(bundle.raw_mass());
1684 //            }
1685         }
1686         else
1687         {
1688             fprintf(stderr, "Error: hit counting scheme for normalization is not set!\n");
1689             assert(false);
1690             exit(1);
1691         }
1692 
1693 		const RefSequenceTable& rt = bundle_factory->ref_table();
1694 		const char* chrom = rt.get_name(bundle.ref_id());
1695 		char bundle_label_buf[2048];
1696         if (chrom)
1697         {
1698             sprintf(bundle_label_buf, "%s:%d-%d", chrom, bundle.left(), bundle.right());
1699             verbose_msg("Inspecting bundle %s with %lu reads\n", bundle_label_buf, bundle.hits().size());
1700 
1701             vector<string> gene_ids;
1702             vector<string> gene_short_names;
1703             BOOST_FOREACH(boost::shared_ptr<Scaffold> s, bundle.ref_scaffolds())
1704             {
1705                 if (s->annotated_gene_id() != "")
1706                     gene_ids.push_back(s->annotated_gene_id());
1707                 if (s->annotated_gene_name() != "")
1708                     gene_short_names.push_back(s->annotated_gene_name());
1709 
1710 //                if (s->annotated_gene_id() == "ENSG00000268467.1")
1711 //                {
1712 //                    int a = 4;
1713 //                }
1714 
1715             }
1716             compatible_count_table.push_back(LocusCount(bundle_label_buf, floor(bundle.compatible_mass()), bundle.ref_scaffolds().size(), gene_ids, gene_short_names));
1717             total_count_table.push_back(LocusCount(bundle_label_buf, floor(bundle.raw_mass()), bundle.ref_scaffolds().size(), gene_ids, gene_short_names));
1718 		}
1719 
1720         if (!valid_bundle)
1721 		{
1722 			delete bundle_ptr;
1723 			break;
1724 		}
1725 		num_bundles++;
1726 
1727         BOOST_FOREACH(boost::shared_ptr<Scaffold> s, bundle.ref_scaffolds()){
1728             id_to_locus_map.register_locus_to_id(s->annotated_trans_id(), bundle_label_buf);
1729             id_to_locus_map.register_locus_to_id(s->annotated_gene_id(), bundle_label_buf);
1730 
1731             if (s->annotated_tss_id().empty() == false)
1732             {
1733                 id_to_locus_map.register_locus_to_id(s->annotated_tss_id(), bundle_label_buf);
1734             }
1735 
1736             if (s->annotated_protein_id().empty() == false)
1737             {
1738                 id_to_locus_map.register_locus_to_id(s->annotated_protein_id(), bundle_label_buf);
1739             }
1740         }
1741 
1742         if (progress_bar)
1743         {
1744 			double inc_amt = last_chrom == bundle.ref_id() ? 0.0 : 1.0;
1745 			p_bar.update(bundle_label_buf, inc_amt);
1746 			last_chrom = bundle.ref_id();
1747         }
1748 
1749         if (bad_introns != NULL)
1750 		{
1751 			identify_bad_splices(bundle, *bad_introns);
1752 		}
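		// When a BadIntronTable is supplied, this bundle is scanned for suspicious
		// splice junctions, which are recorded in the shared table.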
1753 
1754 		const vector<MateHit>& hits = bundle.non_redundant_hits();
1755 		if (hits.empty())
1756 		{
1757 			delete bundle_ptr;
1758 			continue;
1759 		}
1760 
1761 		list<pair<int, int> > open_ranges;
1762 		int curr_range_start = hits[0].left();
1763 		int curr_range_end = numeric_limits<int>::max();
1764 		int next_range_start = -1;
1765 
1766 		total_non_redundant_hits += bundle.non_redundant_hits().size();
1767 		total_hits += bundle.hits().size();
1768 
1769 		// This first loop calculates the map mass and finds ranges with no introns
1770 		// Note that we are actually looking at non-redundant hits, which is why we use collapse_mass
1771 		// This loop will also add multi-reads to the MultiReads table
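		// For each hit we also track the minimum and maximum observed read lengths,
		// and either bin the implied fragment length directly (single-isoform
		// reference loci) or collect intron-free "open ranges" from which fragment
		// lengths are estimated in the second pass below.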
1772 		for (size_t i = 0; i < hits.size(); ++i)
1773 		{
1774 			assert(hits[i].left_alignment());
1775 
1776             // Add to table if multi-read
1777 			if (hits[i].is_multi())
1778 			{
1779 				mrt->add_hit(hits[i]);
1780 			}
1781 
1782 			// Find left length
1783 			int left_len = hits[i].left_alignment()->right()-hits[i].left_alignment()->left();
1784 			min_len = min(min_len, left_len);
1785 			if (!hits[i].left_alignment()->contains_splice())
1786             {
1787 				if (hits[i].left_alignment()->is_first())
1788                     max_1 = max(max_1, left_len);
1789                 else
1790                     max_2 = max(max_2, left_len);
1791             }
1792 
1793 			// Find right length
1794 			if (hits[i].right_alignment())
1795 			{
1796 				int right_len = hits[i].right_alignment()->right()-hits[i].right_alignment()->left();
1797 				min_len = min(min_len, right_len);
1798 				if (!hits[i].right_alignment()->contains_splice())
1799                 {
1800                     if (hits[i].right_alignment()->is_first())
1801                         max_1 = max(max_1, right_len);
1802                     else
1803                         max_2 = max(max_2, right_len);
1804                 }
1805                 has_pairs = true;
1806 			}
1807 
1808 			// Find fragment length
1809 			if (bundle.ref_scaffolds().size()==1 && hits[i].is_pair())
1810 			// Annotation provided and single isoform gene
1811 			{
1812 				int start, end, mate_length;
1813 				boost::shared_ptr<Scaffold> scaff = bundle.ref_scaffolds()[0];
1814 				if (scaff->map_frag(hits[i], start, end, mate_length))
1815 				{
1816 					if (mate_length >= min_len && mate_length <= max_len)
1817 						frag_len_hist[mate_length] += hits[i].collapse_mass();
1818 				}
1819 			}
1820 			else if (bundle.ref_scaffolds().empty())
1821 			// No annotation provided.  Look for ranges.
1822 			{
1823 				if (hits[i].left() > curr_range_end)
1824 				{
1825 					if (curr_range_end - curr_range_start > max_len)
1826 						open_ranges.push_back(make_pair(curr_range_start, curr_range_end));
1827 					curr_range_start = next_range_start;
1828 					curr_range_end = numeric_limits<int>::max();
1829 				}
1830 				if (hits[i].left_alignment()->contains_splice())
1831 				{
1832 					if (hits[i].left() - curr_range_start > max_len)
1833 						open_ranges.push_back(make_pair(curr_range_start, hits[i].left()-1));
1834 					curr_range_start = max(next_range_start, hits[i].left_alignment()->right());
1835 				}
1836 				if (hits[i].right_alignment() && hits[i].right_alignment()->contains_splice())
1837 				{
1838 					assert(hits[i].right_alignment()->left() >= hits[i].left());
1839 					curr_range_end = min(curr_range_end, hits[i].right_alignment()->left()-1);
1840 					next_range_start = max(next_range_start, hits[i].right());
1841 				}
1842 			}
1843 		}
1844 
1845         if (bundle.ref_scaffolds().empty() && has_pairs) // No annotation provided
1846 		{
1847 			pair<int, int> curr_range(-1,-1);
1848 
1849 			// This second loop uses the ranges found above to find the estimated frag length distribution
1850 			// It also finds the minimum read length to use in the linear interpolation
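			// Within an intron-free range, the genomic span of a proper pair
			// (right() - left()) is taken as a fragment length observation,
			// weighted by the pair's collapsed (non-redundant) mass.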
1851 			for (size_t i = 0; i < hits.size(); ++i)
1852 			{
1853 				if (hits[i].left() > curr_range.second && open_ranges.empty())
1854 					break;
1855 
1856 				if (hits[i].left() > curr_range.second)
1857 				{
1858 					curr_range = open_ranges.front();
1859 					open_ranges.pop_front();
1860 				}
1861 
1862 				if (hits[i].left() >= curr_range.first && hits[i].right() <= curr_range.second && hits[i].is_pair())
1863 				{
1864 					int mate_len = hits[i].right()-hits[i].left();
1865 					if (mate_len <= max_len)
1866 						frag_len_hist[mate_len] += hits[i].collapse_mass();
1867 				}
1868 			}
1869 		}
1870 
1871         open_ranges.clear();
1872 		delete bundle_ptr;
1873 	}
1874 
1875     norm_map_mass = map_mass;
1876 
1877 //	if (lib_norm_method == QUARTILE && mass_dist.size() > 0)
1878 //	{
1879 //		sort(mass_dist.begin(),mass_dist.end());
1880 //		int upper_quart_index = mass_dist.size() * 0.75;
1881 //		norm_map_mass = mass_dist[upper_quart_index];
1882 //	}
1883 
1884     if (bad_introns != NULL)
1885     {
1886         size_t alloced = 0;
1887         size_t used = 0;
1888         size_t num_introns = 0;
1889         for (BadIntronTable::const_iterator itr = bad_introns->begin();
1890              itr != bad_introns->end();
1891              ++itr)
1892         {
1893             alloced += itr->second.capacity() * sizeof(AugmentedCuffOp);
1894             used += itr->second.size() * sizeof(AugmentedCuffOp);
1895             num_introns += itr->second.size();
1896         }
1897 
1898         verbose_msg( "Bad intron table has %lu introns: (%lu alloc'd, %lu used)\n", num_introns, alloced, used);
1899     	verbose_msg( "Map has %lu hits, %lu are non-redundant\n", total_hits, total_non_redundant_hits);
1900     }
1901 
1902 	if (progress_bar)
1903 		p_bar.complete();
1904 
1905 	vector<double> frag_len_pdf(max_len+1, 0.0);
1906 	vector<double> frag_len_cdf(max_len+1, 0.0);
1907     long double tot_count = accumulate(frag_len_hist.begin(), frag_len_hist.end(), 0.0 );
1908     bool empirical = false;
1909 
1910 	if (user_provided_fld && has_pairs && tot_count >= 10000)
1911 	{
1912 		fprintf(stderr, "Warning: Overriding empirical fragment length distribution with user-specified parameters is not recommended.\n");
1913 	}
1914 
1915 	if (!has_pairs || tot_count < 10000)
1916 	{
1917 		if (has_pairs && !user_provided_fld)
1918 		{
1919 			fprintf(stderr, "Warning: Using default Gaussian distribution due to insufficient paired-end reads in open ranges.  It is recommended that correct parameters (--frag-len-mean and --frag-len-std-dev) be provided.\n");
1920 		}
1921 		tot_count = 0;
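		// Fall back to a truncated Gaussian: bin i receives the probability mass of
		// the unit interval around i, cdf(i+0.5) - cdf(i-0.5), for a normal with the
		// default mean and standard deviation; max_len is reset to mean + 3*sd.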
1922 		normal frag_len_norm(def_frag_len_mean, def_frag_len_std_dev);
1923 		max_len = def_frag_len_mean + 3*def_frag_len_std_dev;
1924 		for(int i = min_len; i <= max_len; i++)
1925 		{
1926 			frag_len_hist[i] = cdf(frag_len_norm, i+0.5)-cdf(frag_len_norm, i-0.5);
1927 			tot_count += frag_len_hist[i];
1928 		}
1929 	}
1930 	else
1931 	// Calculate the max frag length and interpolate all zeros between min read len and max frag len
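	// Gaps of zero-count bins are filled by linear interpolation between the two
	// flanking non-zero bins.  As an illustrative example: with frag_len_hist[100] = 10
	// and frag_len_hist[103] = 16, the slope is (16 - 10) / 3 = 2, so bins 101 and 102
	// are filled with 12 and 14.  Scanning stops once 99.99% of the observed mass is
	// covered, and that bin index becomes the effective max_len.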
1932 	{
1933 		empirical = true;
1934 		double curr_total = 0;
1935 		size_t last_nonzero = min_len-1;
1936 		for(size_t i = last_nonzero+1; i < frag_len_hist.size(); i++)
1937 		{
1938 			if (frag_len_hist[i] > 0)
1939 			{
1940 				if (last_nonzero != i-1)
1941 				{
1942 					double b = frag_len_hist[last_nonzero];
1943 					double m = (frag_len_hist[i] - b)/(i-last_nonzero);
1944 					for (size_t x = 1; x < i - last_nonzero; x++)
1945 					{
1946 						frag_len_hist[last_nonzero+x] = m * x + b;
1947 						tot_count += frag_len_hist[last_nonzero+x];
1948 						curr_total += frag_len_hist[last_nonzero+x];
1949 					}
1950 				}
1951 				last_nonzero = i;
1952 			}
1953 
1954 			curr_total += frag_len_hist[i];
1955 
1956 			if (curr_total/tot_count > 0.9999)
1957 			{
1958 				max_len = i;
1959 				tot_count = curr_total;
1960 				break;
1961 			}
1962 		}
1963 	}
1964 
1965     double mean = 0.0;
1966 
1967     if (output_fld)
1968     {
1969         FILE* fhist = fopen(string(output_dir + "/frag_len_hist.csv").c_str(),"w");
1970         fprintf(fhist, "Length,Count\n");
1971         for(size_t i = 1; i < frag_len_hist.size(); i++)
1972         {
1973             fprintf(fhist, "%zu,%f\n", i, frag_len_hist[i]);
1974         }
1975         fclose(fhist);
1976     }
1977 
1978 	// Convert histogram to pdf and cdf, calculate mean
1979 	int frag_len_mode = 0;
1980 	for(size_t i = min_len; i <= (size_t)max_len; i++)
1981 	{
1982 		frag_len_pdf[i] = frag_len_hist[i]/tot_count;
1983 		frag_len_cdf[i] = frag_len_cdf[i-1] + frag_len_pdf[i];
1984 
1985 		if (frag_len_pdf[i] > frag_len_pdf[frag_len_mode])
1986 			frag_len_mode = i;
1987         mean += frag_len_pdf[i] * i;
1988 	}
1989 
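    // Standard deviation of the fragment length distribution:
    //   sigma = sqrt( sum_i frag_len_pdf[i] * (i - mean)^2 )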
1990     double std_dev =  0.0;
1991     for(size_t i = 1; i < frag_len_hist.size(); i++)
1992     {
1993         std_dev += frag_len_pdf[i] * ((i - mean) * (i - mean));
1994     }
1995 
1996     std_dev = sqrt(std_dev);
1997 
1998 	boost::shared_ptr<ReadGroupProperties> rg_props = bundle_factory->read_group_properties();
1999 
2000     FLDSource source = DEFAULT;
2001     if (empirical)
2002     {
2003         source = LEARNED;
2004     }
2005     else if (user_provided_fld)
2006     {
2007         source = USER;
2008     }
2009 
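	// Package the distribution (empirical, user-specified, or default) into an
	// EmpDist and attach it, together with the multi-read table and the map masses,
	// to this library's read group properties.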
2010 	boost::shared_ptr<EmpDist const> fld(new EmpDist(frag_len_pdf, frag_len_cdf, frag_len_mode, mean, std_dev, min_len, max_len, source));
2011 	rg_props->multi_read_table(mrt);
2012 	rg_props->frag_len_dist(fld);
2013 	rg_props->normalized_map_mass(norm_map_mass);
2014     rg_props->total_map_mass(map_mass);
2015 
2016     if (show_stats)
2017     {
2018         fprintf(stderr, "> Map Properties:\n");
2019         //if (lib_norm_method == QUARTILE)
2020         //    fprintf(stderr, ">\tUpper Quartile: %.2Lf\n", norm_map_mass);
2021         fprintf(stderr, ">\tNormalized Map Mass: %.2Lf\n", norm_map_mass);
2022         fprintf(stderr, ">\tRaw Map Mass: %.2Lf\n", map_mass);
2023         if (corr_multi)
2024             fprintf(stderr,">\tNumber of Multi-Reads: %zu (with %zu total hits)\n", mrt->num_multireads(), mrt->num_multihits());
2025     //	if (has_pairs)
2026     //		fprintf(stderr, ">\tRead Type: %dbp x %dbp\n", max_1, max_2);
2027     //	else
2028     //		fprintf(stderr, ">\tRead Type: %dbp single-end\n", max(max_1,max_2));
2029 
2030         if (empirical)
2031         {
2032             fprintf(stderr, ">\tFragment Length Distribution: Empirical (learned)\n");
2033             fprintf(stderr, ">\t              Estimated Mean: %.2f\n", mean);
2034             fprintf(stderr, ">\t           Estimated Std Dev: %.2f\n", std_dev);
2035         }
2036         else
2037         {
2038             if (user_provided_fld)
2039             {
2040                 fprintf(stderr, ">\tFragment Length Distribution: Truncated Gaussian (user-specified)\n");
2041             }
2042             else
2043             {
2044                 fprintf(stderr, ">\tFragment Length Distribution: Truncated Gaussian (default)\n");
2045             }
2046             fprintf(stderr, ">\t              Default Mean: %d\n", def_frag_len_mean);
2047             fprintf(stderr, ">\t           Default Std Dev: %d\n", def_frag_len_std_dev);
2048         }
2049     }
2050 	bundle_factory->num_bundles(num_bundles);
2051 	bundle_factory->reset();
2052 	return;
2053 }
2054 
2055 //////////////////////
2056 
2057 
2058 bool PrecomputedExpressionBundleFactory::next_bundle(HitBundle& bundle, bool cache_bundle)
2059 {
2060 #if ENABLE_THREADS
2061     boost::mutex::scoped_lock lock(_factory_lock);
2062 #endif
2063     bool got_bundle = BundleFactory::next_bundle(bundle, cache_bundle);
2064     if (got_bundle)
2065     {
2066         RefSequenceTable& rt = ref_table();
2067 
2068         char bundle_label_buf[2048];
2069         snprintf(bundle_label_buf, sizeof(bundle_label_buf), "%s:%d-%d", rt.get_name(bundle.ref_id()), bundle.left(), bundle.right());
2070 
2071         boost::shared_ptr<const AbundanceGroup> ab = _hit_fac->next_locus(bundle.id(), cache_bundle);
2072         if (ab)
2073         {
2074             double compatible_mass = _hit_fac->get_compat_mass(bundle.id());
2075             double total_mass  = _hit_fac->get_total_mass(bundle.id());
2076 
2077             /*
2078             double compatible_mass = _hit_fac->get_compat_mass(bundle_label_buf);
2079             double total_mass  = _hit_fac->get_total_mass(bundle_label_buf);
2080             */
2081 
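            // Re-attach the raw and compatible masses recorded for this locus in the
            // precomputed expression file, so the reconstituted bundle carries the
            // same mass information as a freshly inspected one.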
2082             bundle.finalize();
2083             bundle.add_raw_mass(total_mass);
2084             bundle.compatible_mass(compatible_mass);
2085 
2086             //fprintf (stderr, "Reconstituting bundle %s (%d) with mass %lf\n", bundle_label_buf, bundle.id(), compatible_mass);
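            // Sanity check: the reconstituted locus must contain exactly as many
            // transcripts as the GTF-defined bundle; otherwise the precomputed
            // expression file and the annotation are out of sync, so abort.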
2087             if (bundle.ref_scaffolds().size() != ab->abundances().size())
2088             {
2089                 fprintf (stderr, "Error in file %s: reconstituted expression bundle %s (%lu transcripts) does not match GTF (%lu transcripts):\n", read_group_properties()->file_path().c_str(), bundle_label_buf, ab->abundances().size(), bundle.ref_scaffolds().size());
2090                 fprintf(stderr, "Reconstituted:\n");
2091                 for (size_t i = 0; i < ab->abundances().size(); ++i)
2092                 {
2093                     fprintf(stderr, "%s\n", ab->abundances()[i]->description().c_str());
2094                 }
2095                 fprintf(stderr, "GTF:\n");
2096                 for (size_t i = 0; i < bundle.ref_scaffolds().size(); ++i)
2097                 {
2098                     fprintf(stderr, "%s\n", bundle.ref_scaffolds()[i]->annotated_trans_id().c_str());
2099                 }
2100                 exit(1);
2101             }
2102         }
2103         else
2104         {
2105             fprintf (stderr, "Error: no abundance info for locus %s\n", bundle_label_buf);
2106         }
2107 
2108     }
2109     return got_bundle;
2110 }
2111 
2112 boost::shared_ptr<const AbundanceGroup> PrecomputedExpressionBundleFactory::get_abundance_for_locus(int locus_id)
2113 {
2114 #if ENABLE_THREADS
2115     boost::mutex::scoped_lock lock(_factory_lock);
2116 #endif
2117     return _hit_fac->get_abundance_for_locus(locus_id);
2118 }
2119 
2120 void PrecomputedExpressionBundleFactory::clear_abundance_for_locus(int locus_id)
2121 {
2122 #if ENABLE_THREADS
2123     boost::mutex::scoped_lock lock(_factory_lock);
2124 #endif
2125     _hit_fac->clear_abundance_for_locus(locus_id);
2126 }
2127