1 #ifndef BUNDLES_H
2 #define BUNDLES_H
3 /*
4  *  bundles.h
5  *  cufflinks
6  *
7  *  Created by Cole Trapnell on 9/6/09.
8  *  Copyright 2009 Cole Trapnell. All rights reserved.
9  *
10  */
11 
12 #ifdef HAVE_CONFIG_H
13 #include <config.h>
14 #endif
15 #include <boost/bind.hpp>
16 #include <boost/random.hpp>
17 #include <boost/unordered_map.hpp>
18 #include <vector>
19 #include <numeric>
20 #include <cmath>
21 #include "common.h"
22 #include "hits.h"
23 #include "scaffolds.h"
24 #include "gtf_tracking.h"
25 #include "progressbar.h"
26 #include "rounding.h"
27 
28 #include <boost/accumulators/accumulators.hpp>
29 #include <boost/accumulators/statistics/density.hpp>
30 #include <boost/accumulators/statistics/stats.hpp>
31 
/*
 * Simple per-bundle bookkeeping counters, all starting at zero.
 */
struct BundleStats
{
	// Zero every counter.
	BundleStats()
		: compatible(0),
		  uncollapsible(0),
		  closure_edges(0),
		  matched_edges(0)
	{}

	int compatible;     // hits counted as compatible
	int uncollapsible;  // hits that could not be collapsed
	int closure_edges;  // edges added during closure
	int matched_edges;  // edges matched against the closure
};
46 
47 typedef map<RefID, vector<AugmentedCuffOp> > BadIntronTable;
48 
49 
50 /*******************************************************************************
51  HitBundle is a set of MateHit objects that, were you to look at the interval
52  graph of their spanning intervals in genomic coordinates, you'd see a single
53  connected component. Note that bundles do not correspond to single transcripts,
54  or even single genes
55  *******************************************************************************/
class HitBundle
{
private:
	// Non-copyable: a bundle owns the raw ReadHit alignments inside its
	// MateHits (see the destructor), so member-wise copy would double-delete.
	HitBundle(const HitBundle& rhs) {}
public:
	// Construct an empty bundle with an inverted interval (INT_MAX, -1) that
	// shrinks onto the first hit/scaffold added, and take the next serial id.
	HitBundle()
		: _leftmost(INT_MAX), _rightmost(-1), _final(false), _id(++_next_id), _ref_id(0), _raw_mass(0.0), _num_replicates(1), _compatible_mass(0.0) {
		// Shrink the open mates table when it is less than 10% full and larger than 10,000 items.
		_rehash_threshold = _open_mates.max_load_factor() / 10;
		_rehash_size_threshold = 10000;
	}

    // Frees all alignments owned by this bundle, and clears per-scaffold hit
    // lists when this bundle is among the last users of the scaffolds.
    ~HitBundle()
    {
        vector<boost::shared_ptr<Scaffold> >& bundle_ref_scaffs = ref_scaffolds();
        BOOST_FOREACH(boost::shared_ptr<Scaffold>& ref_scaff, bundle_ref_scaffs)
        {
            // This bundle and the factory that actually owns the ref_mRNAs
            // are the only objects that should have access to these scaffolds
            // so if the use count is 2, we can clear these guys.
			// Updated to 3 since the bias learner now uses them.
            if (ref_scaff.use_count() <= 3)
            {
                ref_scaff->clear_hits();
            }
            else if (ref_scaff->mate_hits().size() > 0)
            {
                fprintf(stderr, "Warning: bundle %d-%d shared reference scaffolds with others.  Possible soft memory leak.\n", left(), right());
            }
        }

        // The bundle owns the raw alignment records referenced by its hits.
        BOOST_FOREACH (MateHit& hit, _hits)
		{
			delete hit.left_alignment();
			delete hit.right_alignment();
		}

        // Alignments still waiting for their mate are owned here too.
        for(OpenMates::iterator itr = _open_mates.begin(); itr != _open_mates.end(); ++itr)
		{
			delete itr->second.left_alignment();
			delete itr->second.right_alignment();
		}

    }
	// Genomic interval spanned by the bundle.
	int left()   const { return _leftmost;  }
	int right()  const { return _rightmost; }
	int length() const { return _rightmost - _leftmost; }

	// Returns true if the hit was added successfully.
	bool add_hit(const MateHit& hit);

	// This is to keep track of mass of all hits, including
	// those that are not added to any bundle
	// but are skipped during the creation of this bundle
	void add_raw_mass(double rm) { _raw_mass += rm; }
	void rem_raw_mass(double rm) { _raw_mass -= rm; }
	double raw_mass() { return _raw_mass; }

    // Mass of hits compatible with the reference; set externally via the
    // setter overload below.
    double compatible_mass() const
    {
        return _compatible_mass;
    }

    void compatible_mass(double c) { _compatible_mass = c; }

	// Drops all hit records; also clears per-scaffold hits when safe
	// (same use-count rule as the destructor).
	void clear_hits()
    {
        _hits.clear();
        _non_redundant.clear();
        vector<boost::shared_ptr<Scaffold> >& bundle_ref_scaffs = ref_scaffolds();
        BOOST_FOREACH(boost::shared_ptr<Scaffold>& ref_scaff, bundle_ref_scaffs)
        {
            if (ref_scaff.use_count() <= 3)
            {
                ref_scaff->clear_hits();
            }
            else
            {
                fprintf(stderr, "Warning: bundle %d-%d shared reference scaffolds with others.  Possible soft memory leak.\n", left(), right());
            }
        }
    }

    const std::vector<MateHit>& hits() const { return _hits; }
	const std::vector<MateHit>& non_redundant_hits() const { return _non_redundant; }

	RefID ref_id()  const {return _ref_id; }

	int id() const { return _id; }
    void id(int i) { _id = i; }

	// Registers a reference scaffold, growing the bundle's interval to cover
	// it and adopting its reference sequence id.
	void add_ref_scaffold(boost::shared_ptr<Scaffold> scaff)
	{
		if (scaff->left() < _leftmost)
			_leftmost = scaff->left();
		if (scaff->right() > _rightmost)
			_rightmost = scaff->right();
		_ref_scaffs.push_back(scaff);
        _ref_id = scaff->ref_id();
	}

	vector<boost::shared_ptr<Scaffold> >& ref_scaffolds() { return _ref_scaffs; }

	// Adds a Bowtie hit to the open hits buffer.  The Bundle will handle turning
	// the Bowtie hit into a properly mated Cufflinks hit record
	bool add_open_hit(boost::shared_ptr<ReadGroupProperties const> rg_props,
                      const ReadHit* bh,
					  bool expand_by = true);

	// Commits any mates still open as singleton hits
	void finalize_open_mates();

	// Sorts the hits, and performs other functions needed to prepare this
	// bundle for assembly and quantitation
	void finalize(bool is_combined=false);

	void remove_hitless_scaffolds();

	void collapse_hits();

    int num_replicates() const { return _num_replicates; }

	// Total collapsed mass summed over the non-redundant hits.
	double mass() const
	{
		double mass = 0;
		for(size_t i = 0; i < _non_redundant.size(); i++)
		{
			mass += _non_redundant[i].collapse_mass();
		}
		return mass;
	}

    // Combines in_bundles into out_bundle (defined in bundles.cpp).
    static void combine(const vector<HitBundle*>& in_bundles,
                        HitBundle& out_bundle);

private:
    int _leftmost;
	int _rightmost;
	std::vector<MateHit> _hits;
	std::vector<MateHit> _non_redundant;
	std::vector<boost::shared_ptr<Scaffold> > _ref_scaffs; // user-supplied reference annotations overlapping the bundle
	bool _final;
	int _id;
    RefID _ref_id;
	double _raw_mass;


	static int _next_id;

	// InsertIDs are already hashes of SAM record names; no need
	// to hash them again. Note on 32-bit this will truncate
	// the larger InsertID hash to size_t.
	struct identity_hash {
		size_t operator()(const InsertID& in) const { return (size_t)in; }
	};

	typedef boost::unordered_multimap<uint64_t, MateHit, identity_hash> OpenMates;
	OpenMates _open_mates;
    int _num_replicates;
    double _compatible_mass;
	float _rehash_threshold;               // shrink trigger: load factor below this
	OpenMates::size_type _rehash_size_threshold; // ...and table larger than this

};
220 
221 void load_ref_rnas(FILE* ref_mRNA_file,
222 				   RefSequenceTable& rt,
223 				   vector<boost::shared_ptr<Scaffold> >& ref_mRNAs,
224                    boost::crc_32_type& gtf_crc_result,
225 				   bool loadSeqs=false,
226 				   bool loadFPKM=false);
227 
228 class BundleFactory
229 {
230 public:
231 
BundleFactory(boost::shared_ptr<HitFactory> fac,BundleMode bm)232 	BundleFactory(boost::shared_ptr<HitFactory> fac, BundleMode bm)
233     : _hit_fac(fac), _bundle_mode(bm), _prev_pos(0), _prev_ref_id(0), _curr_bundle(0),  rng(boost::mt19937(random_seed)), _zeroone(rng)
234 	{
235 		_rg_props = boost::shared_ptr<ReadGroupProperties>(new ReadGroupProperties(fac->read_group_properties()));
236         next_ref_scaff = ref_mRNAs.begin();
237         next_mask_scaff = mask_gtf_recs.begin();
238 	}
239 
~BundleFactory()240     virtual ~BundleFactory() {}
hit_factory()241     boost::shared_ptr<const HitFactory> hit_factory() const { return _hit_fac; }
242 
bundles_remain()243     bool bundles_remain()
244     {
245 #if ENABLE_THREADS
246         boost::mutex::scoped_lock lock(_factory_lock);
247 #endif
248         return _curr_bundle < num_bundles();
249     }
250 
251 	virtual bool next_bundle(HitBundle& bundle_out, bool cache_bundle);
252 	bool next_bundle_hit_driven(HitBundle& bundle_out);
253 	bool next_bundle_ref_driven(HitBundle& bundle_out);
254 	bool next_bundle_ref_guided(HitBundle& bundle_out);
255 
256 
ref_table()257     RefSequenceTable& ref_table() { return _hit_fac->ref_table(); }
258 
259     // Not available until after inspect_bundle
num_bundles()260     int num_bundles() const { return _num_bundles; }
num_bundles(int n)261     void num_bundles(int n) { _num_bundles = n; }
262 
reset()263 	virtual void reset()
264 	{
265 #if ENABLE_THREADS
266         boost::mutex::scoped_lock lock(_factory_lock);
267 #endif
268         _curr_bundle = 0;
269 		//rewind(hit_file);
270 		_hit_fac->reset();
271 		next_ref_scaff = ref_mRNAs.begin();
272         next_mask_scaff = mask_gtf_recs.begin();
273 
274         BOOST_FOREACH(boost::shared_ptr<Scaffold> ref_scaff, ref_mRNAs)
275         {
276             ref_scaff->clear_hits();
277         }
278 
279         _prev_pos = 0;
280         _prev_ref_id = 0;
281 	}
282 
283     // This function NEEDS to deep copy the ref_mRNAs, otherwise cuffdiff'd
284     // samples will clobber each other. Deep copying is necessary whenever
285     // you need to set fpkm or num fragments in Scaffold objects (e.g. during bias correction or multiread correction)
286     void set_ref_rnas(const vector<boost::shared_ptr<Scaffold> >& mRNAs, bool deep_copy = true)
287     {
288 #if ENABLE_THREADS
289         boost::mutex::scoped_lock lock(_factory_lock);
290 #endif
291         ref_mRNAs.clear();
292         if (deep_copy)
293         {
294             for (vector<boost::shared_ptr<Scaffold> >::const_iterator i = mRNAs.begin(); i < mRNAs.end(); ++i)
295             {
296                 ref_mRNAs.push_back(boost::shared_ptr<Scaffold>(new Scaffold(**i)));
297             }
298         }
299         else
300         {
301             ref_mRNAs = mRNAs;
302         }
303 
304         RefID last_id = 0;
305         for (vector<boost::shared_ptr<Scaffold> >::iterator i = ref_mRNAs.begin(); i < ref_mRNAs.end(); ++i)
306         {
307             if ((*i)->ref_id() != last_id)
308             {
309                 _ref_scaff_offsets.push_back(make_pair((*i)->ref_id(), i));
310             }
311             last_id = (*i)->ref_id();
312         }
313 
314         next_ref_scaff = ref_mRNAs.begin();
315     }
316 
set_mask_rnas(const vector<boost::shared_ptr<Scaffold>> & masks)317     void set_mask_rnas(const vector<boost::shared_ptr<Scaffold> >& masks)
318     {
319 #if ENABLE_THREADS
320         boost::mutex::scoped_lock lock(_factory_lock);
321 #endif
322         mask_gtf_recs = masks;
323         RefID last_id = 0;
324         for (vector<boost::shared_ptr<Scaffold> >::iterator i = mask_gtf_recs.begin(); i < mask_gtf_recs.end(); ++i)
325         {
326             if ((*i)->ref_id() != last_id)
327             {
328                 _mask_scaff_offsets.push_back(make_pair((*i)->ref_id(), i));
329             }
330             last_id = (*i)->ref_id();
331         }
332 
333         next_mask_scaff = mask_gtf_recs.begin();
334     }
335 
bad_intron_table(const BadIntronTable & bad_introns)336 	void bad_intron_table(const BadIntronTable& bad_introns)
337 	{
338 #if ENABLE_THREADS
339         boost::mutex::scoped_lock lock(_factory_lock);
340 #endif
341 		_bad_introns = bad_introns;
342 	}
343 
read_group_properties(boost::shared_ptr<ReadGroupProperties> rg)344     void read_group_properties(boost::shared_ptr<ReadGroupProperties> rg)
345     {
346 #if ENABLE_THREADS
347         boost::mutex::scoped_lock lock(_factory_lock);
348 #endif
349         _rg_props = rg;
350     }
351 
read_group_properties()352     boost::shared_ptr<ReadGroupProperties> read_group_properties()
353     {
354         return _rg_props;
355     }
356 
357 	bool spans_bad_intron(const ReadHit& read);
358 
359 private:
360 
361 	bool _expand_by_hits(HitBundle& bundle);
362 	bool _expand_by_refs(HitBundle& bundle);
363 
364 	boost::shared_ptr<HitFactory> _hit_fac;
365 
366 	vector<boost::shared_ptr<Scaffold> > ref_mRNAs;
367 	//FILE* ref_mRNA_file;
368 	vector<pair<RefID, vector<boost::shared_ptr<Scaffold> >::iterator> > _ref_scaff_offsets;
369 	vector<boost::shared_ptr<Scaffold> >::iterator next_ref_scaff;
370 
371     vector<boost::shared_ptr<Scaffold> > mask_gtf_recs;
372 	//FILE* mask_file;
373 	vector<pair<RefID, vector<boost::shared_ptr<Scaffold> >::iterator> > _mask_scaff_offsets;
374 	vector<boost::shared_ptr<Scaffold> >::iterator next_mask_scaff;
375 
376 	BadIntronTable _bad_introns;
377 
378     boost::shared_ptr<ReadGroupProperties> _rg_props;
379 
380 	// Sets nva to point to the next valid alignment
381 	// Returns the mass of any alignments that are seen, valid or not
382     double next_valid_alignment(const ReadHit*& nva);
383 
384 	// Backs up the factory to before the last valid alignment
385 	// and returns the mass of that alignment (rh)
386 	double rewind_hit(const ReadHit* rh);
387 
388     BundleMode _bundle_mode;
389     int _prev_pos;
390     RefID _prev_ref_id;
391     int _num_bundles;
392     int _curr_bundle;
393 
394     boost::mt19937 rng;
395     boost::uniform_01<boost::mt19937> _zeroone;
396 
397 #if ENABLE_THREADS
398     boost::mutex _factory_lock;
399 #endif
400 };
401 
402 class PrecomputedExpressionBundleFactory : public BundleFactory
403 {
404 public:
PrecomputedExpressionBundleFactory(boost::shared_ptr<PrecomputedExpressionHitFactory> fac)405     PrecomputedExpressionBundleFactory(boost::shared_ptr<PrecomputedExpressionHitFactory> fac)
406 	: BundleFactory(fac, REF_DRIVEN), _hit_fac(fac)
407 	{
408 
409 	}
410 
411     bool next_bundle(HitBundle& bundle_out, bool cache_bundle);
412 
413     boost::shared_ptr<const AbundanceGroup> get_abundance_for_locus(int locus_id);
414     void clear_abundance_for_locus(int locus_id);
415 
reset()416     void reset() { BundleFactory::reset(); }
417 
418 private:
419 
420     boost::shared_ptr<PrecomputedExpressionHitFactory> _hit_fac;
421 #if ENABLE_THREADS
422     boost::mutex _factory_lock;
423 #endif
424 };
425 
426 void identify_bad_splices(const HitBundle& bundle,
427 						  BadIntronTable& bad_splice_ops);
428 
429 class IdToLocusMap
430 {
431 
432 #if ENABLE_THREADS
433 	boost::mutex _bl_lock;
434 #endif
435 
IdToLocusMap(const IdToLocusMap & rhs)436     IdToLocusMap(const IdToLocusMap& rhs) {}
437 public:
438 
IdToLocusMap(boost::shared_ptr<map<string,set<string>>> id_to_locus_map)439     IdToLocusMap(boost::shared_ptr<map<string, set<string> > > id_to_locus_map) :
440         _id_to_locus_map(id_to_locus_map) {}
441 
register_locus_to_id(const string & ID,const string & locus_desc)442     void register_locus_to_id(const string& ID, const string& locus_desc)
443     {
444 #if ENABLE_THREADS
445         boost::mutex::scoped_lock lock(_bl_lock);
446 #endif
447 
448         pair< map<string, set<string> >::iterator, bool> p = _id_to_locus_map->insert(make_pair(ID, set<string>()));
449         p.first->second.insert(locus_desc);
450     }
451 
unregister_locus_from_id(const string & ID,const string & locus_desc)452     int unregister_locus_from_id(const string& ID, const string& locus_desc)
453     {
454 #if ENABLE_THREADS
455         boost::mutex::scoped_lock lock(_bl_lock);
456 #endif
457 
458         map<string, set<string> >::iterator itr = _id_to_locus_map->find(ID);
459         if (itr != _id_to_locus_map->end()){
460             itr->second.erase(locus_desc);
461             return itr->second.size();
462         }
463         return 0;
464     }
465 
num_locus_registered(const string & ID)466     int num_locus_registered(const string& ID) {
467 #if ENABLE_THREADS
468         boost::mutex::scoped_lock lock(_bl_lock);
469 #endif
470 
471         map<string, set<string> >::const_iterator itr = _id_to_locus_map->find(ID);
472         if (itr == _id_to_locus_map->end())
473             return 0;
474         return itr->second.size();
475     }
476 private:
477     boost::shared_ptr<map<string, set<string> > > _id_to_locus_map;
478 };
479 
480 void inspect_map(boost::shared_ptr<BundleFactory> bundle_factory,
481                  BadIntronTable* bad_introns,
482                  vector<LocusCount>& compatible_count_table,
483                  vector<LocusCount>& total_count_table,
484                  IdToLocusMap& id_to_locus_map,
485                  bool progress_bar = true,
486                  bool show_stats = true);
487 
488 #endif
489