#ifndef BUNDLES_H
#define BUNDLES_H
/*
 *  bundles.h
 *  cufflinks
 *
 *  Created by Cole Trapnell on 9/6/09.
 *  Copyright 2009 Cole Trapnell. All rights reserved.
 *
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <boost/bind.hpp>
#include <boost/random.hpp>
#include <boost/unordered_map.hpp>
#include <vector>
#include <numeric>
#include <cmath>
#include "common.h"
#include "hits.h"
#include "scaffolds.h"
#include "gtf_tracking.h"
#include "progressbar.h"
#include "rounding.h"

#include <boost/accumulators/accumulators.hpp>
#include <boost/accumulators/statistics/density.hpp>
#include <boost/accumulators/statistics/stats.hpp>

// Simple counters tallied while processing the hits of a bundle.
// NOTE(review): field semantics inferred from names — confirm against the
// code that increments them before relying on these descriptions.
struct BundleStats
{
    BundleStats() :
        compatible(0),
        uncollapsible(0),
        closure_edges(0),
        matched_edges(0)
    {}

    int compatible;
    int uncollapsible;
    int closure_edges;
    int matched_edges;
};

// Per-reference-sequence table of introns flagged as suspicious.
typedef map<RefID, vector<AugmentedCuffOp> > BadIntronTable;


/*******************************************************************************
 HitBundle is a set of MateHit objects that, were you to look at the interval
 graph of their spanning intervals in genomic coordinates, you'd see a single
 connected component. Note that bundles do not correspond to single transcripts,
 or even single genes
 *******************************************************************************/
class HitBundle
{
private:
    // Copying is disabled: the destructor deletes the raw ReadHit alignments
    // owned by _hits and _open_mates, so a shallow copy would double-free.
    HitBundle(const HitBundle& rhs) {}
public:
    HitBundle()
        : _leftmost(INT_MAX), _rightmost(-1), _final(false), _id(++_next_id), _ref_id(0), _raw_mass(0.0), _num_replicates(1), _compatible_mass(0.0)
    {
        // Shrink the open mates table when it is less than 10% full and larger than 10,000 items.
        _rehash_threshold = _open_mates.max_load_factor() / 10;
        _rehash_size_threshold = 10000;
    }

    ~HitBundle()
    {
        vector<boost::shared_ptr<Scaffold> >& bundle_ref_scaffs = ref_scaffolds();
        BOOST_FOREACH(boost::shared_ptr<Scaffold>& ref_scaff, bundle_ref_scaffs)
        {
            // This bundle and the factory that actually owns the ref_mRNAs
            // are the only objects that should have access to these scaffolds
            // so if the use count is 2, we can clear these guys.
            // Updated to 3 since the bias learner now uses them.
            if (ref_scaff.use_count() <= 3)
            {
                ref_scaff->clear_hits();
            }
            else if (ref_scaff->mate_hits().size() > 0)
            {
                fprintf(stderr, "Warning: bundle %d-%d shared reference scaffolds with others. Possible soft memory leak.\n", left(), right());
            }
        }

        // The bundle owns the alignment records behind its hits; free them.
        BOOST_FOREACH (MateHit& hit, _hits)
        {
            delete hit.left_alignment();
            delete hit.right_alignment();
        }

        // Mates that never found a partner still own their alignments.
        for(OpenMates::iterator itr = _open_mates.begin(); itr != _open_mates.end(); ++itr)
        {
            delete itr->second.left_alignment();
            delete itr->second.right_alignment();
        }

    }

    // Genomic span of the bundle. _leftmost/_rightmost start at INT_MAX/-1,
    // so length() is meaningless until at least one hit or scaffold is added.
    int left() const { return _leftmost; }
    int right() const { return _rightmost; }
    int length() const { return _rightmost - _leftmost; }

    // Returns true if the hit was added successfully.
    bool add_hit(const MateHit& hit);

    // This is to keep track of mass of all hits, including
    // those that are not added to any bundle
    // but are skipped during the creation of this bundle
    void add_raw_mass(double rm) { _raw_mass += rm; }
    void rem_raw_mass(double rm) { _raw_mass -= rm; }
    double raw_mass() { return _raw_mass; }

    double compatible_mass() const
    {
        return _compatible_mass;
    }

    void compatible_mass(double c) { _compatible_mass = c; }

    // Drops all hits and asks each reference scaffold to drop its hits too
    // (same use-count guard as the destructor: see ~HitBundle()).
    void clear_hits()
    {
        _hits.clear();
        _non_redundant.clear();
        vector<boost::shared_ptr<Scaffold> >& bundle_ref_scaffs = ref_scaffolds();
        BOOST_FOREACH(boost::shared_ptr<Scaffold>& ref_scaff, bundle_ref_scaffs)
        {
            if (ref_scaff.use_count() <= 3)
            {
                ref_scaff->clear_hits();
            }
            else
            {
                fprintf(stderr, "Warning: bundle %d-%d shared reference scaffolds with others. Possible soft memory leak.\n", left(), right());
            }
        }
    }

    const std::vector<MateHit>& hits() const { return _hits; }
    const std::vector<MateHit>& non_redundant_hits() const { return _non_redundant; }

    RefID ref_id() const {return _ref_id; }

    int id() const { return _id; }
    void id(int i) { _id = i; }

    // Attaches a reference annotation to this bundle and widens the bundle's
    // genomic span to cover it. The bundle adopts the scaffold's RefID.
    void add_ref_scaffold(boost::shared_ptr<Scaffold> scaff)
    {
        if (scaff->left() < _leftmost)
            _leftmost = scaff->left();
        if (scaff->right() > _rightmost)
            _rightmost = scaff->right();
        _ref_scaffs.push_back(scaff);
        _ref_id = scaff->ref_id();
    }

    vector<boost::shared_ptr<Scaffold> >& ref_scaffolds() { return _ref_scaffs; }

    // Adds a Bowtie hit to the open hits buffer. The Bundle will handle turning
    // the Bowtie hit into a properly mated Cufflinks hit record
    bool add_open_hit(boost::shared_ptr<ReadGroupProperties const> rg_props,
                      const ReadHit* bh,
                      bool expand_by = true);

    // Commits any mates still open as singleton hits
    void finalize_open_mates();

    // Sorts the hits, and performs other functions needed to prepare this
    // bundle for assembly and quantitation
    void finalize(bool is_combined=false);

    void remove_hitless_scaffolds();

    void collapse_hits();

    int num_replicates() const { return _num_replicates; }

    // Total collapsed mass over the non-redundant hit set.
    double mass() const
    {
        double mass = 0;
        for(size_t i = 0; i < _non_redundant.size(); i++)
        {
            mass += _non_redundant[i].collapse_mass();
        }
        return mass;
    }

    // Merges several bundles into one (defined in the .cpp).
    static void combine(const vector<HitBundle*>& in_bundles,
                        HitBundle& out_bundle);

private:
    int _leftmost;
    int _rightmost;
    std::vector<MateHit> _hits;
    std::vector<MateHit> _non_redundant;
    std::vector<boost::shared_ptr<Scaffold> > _ref_scaffs; // user-supplied reference annotations overlapping the bundle
    bool _final;
    int _id;
    RefID _ref_id;
    double _raw_mass;


    static int _next_id;

    // InsertIDs are already hashes of SAM record names; no need
    // to hash them again. Note on 32-bit this will truncate
    // the larger InsertID hash to size_t.
208 struct identity_hash { operatoridentity_hash209 size_t operator()(const InsertID& in) const { return (size_t)in; } 210 }; 211 212 typedef boost::unordered_multimap<uint64_t, MateHit, identity_hash> OpenMates; 213 OpenMates _open_mates; 214 int _num_replicates; 215 double _compatible_mass; 216 float _rehash_threshold; 217 OpenMates::size_type _rehash_size_threshold; 218 219 }; 220 221 void load_ref_rnas(FILE* ref_mRNA_file, 222 RefSequenceTable& rt, 223 vector<boost::shared_ptr<Scaffold> >& ref_mRNAs, 224 boost::crc_32_type& gtf_crc_result, 225 bool loadSeqs=false, 226 bool loadFPKM=false); 227 228 class BundleFactory 229 { 230 public: 231 BundleFactory(boost::shared_ptr<HitFactory> fac,BundleMode bm)232 BundleFactory(boost::shared_ptr<HitFactory> fac, BundleMode bm) 233 : _hit_fac(fac), _bundle_mode(bm), _prev_pos(0), _prev_ref_id(0), _curr_bundle(0), rng(boost::mt19937(random_seed)), _zeroone(rng) 234 { 235 _rg_props = boost::shared_ptr<ReadGroupProperties>(new ReadGroupProperties(fac->read_group_properties())); 236 next_ref_scaff = ref_mRNAs.begin(); 237 next_mask_scaff = mask_gtf_recs.begin(); 238 } 239 ~BundleFactory()240 virtual ~BundleFactory() {} hit_factory()241 boost::shared_ptr<const HitFactory> hit_factory() const { return _hit_fac; } 242 bundles_remain()243 bool bundles_remain() 244 { 245 #if ENABLE_THREADS 246 boost::mutex::scoped_lock lock(_factory_lock); 247 #endif 248 return _curr_bundle < num_bundles(); 249 } 250 251 virtual bool next_bundle(HitBundle& bundle_out, bool cache_bundle); 252 bool next_bundle_hit_driven(HitBundle& bundle_out); 253 bool next_bundle_ref_driven(HitBundle& bundle_out); 254 bool next_bundle_ref_guided(HitBundle& bundle_out); 255 256 ref_table()257 RefSequenceTable& ref_table() { return _hit_fac->ref_table(); } 258 259 // Not available until after inspect_bundle num_bundles()260 int num_bundles() const { return _num_bundles; } num_bundles(int n)261 void num_bundles(int n) { _num_bundles = n; } 262 reset()263 virtual void 
reset() 264 { 265 #if ENABLE_THREADS 266 boost::mutex::scoped_lock lock(_factory_lock); 267 #endif 268 _curr_bundle = 0; 269 //rewind(hit_file); 270 _hit_fac->reset(); 271 next_ref_scaff = ref_mRNAs.begin(); 272 next_mask_scaff = mask_gtf_recs.begin(); 273 274 BOOST_FOREACH(boost::shared_ptr<Scaffold> ref_scaff, ref_mRNAs) 275 { 276 ref_scaff->clear_hits(); 277 } 278 279 _prev_pos = 0; 280 _prev_ref_id = 0; 281 } 282 283 // This function NEEDS to deep copy the ref_mRNAs, otherwise cuffdiff'd 284 // samples will clobber each other. Deep copying is necessary whenever 285 // you need to set fpkm or num fragments in Scaffold objects (e.g. during bias correction or multiread correction) 286 void set_ref_rnas(const vector<boost::shared_ptr<Scaffold> >& mRNAs, bool deep_copy = true) 287 { 288 #if ENABLE_THREADS 289 boost::mutex::scoped_lock lock(_factory_lock); 290 #endif 291 ref_mRNAs.clear(); 292 if (deep_copy) 293 { 294 for (vector<boost::shared_ptr<Scaffold> >::const_iterator i = mRNAs.begin(); i < mRNAs.end(); ++i) 295 { 296 ref_mRNAs.push_back(boost::shared_ptr<Scaffold>(new Scaffold(**i))); 297 } 298 } 299 else 300 { 301 ref_mRNAs = mRNAs; 302 } 303 304 RefID last_id = 0; 305 for (vector<boost::shared_ptr<Scaffold> >::iterator i = ref_mRNAs.begin(); i < ref_mRNAs.end(); ++i) 306 { 307 if ((*i)->ref_id() != last_id) 308 { 309 _ref_scaff_offsets.push_back(make_pair((*i)->ref_id(), i)); 310 } 311 last_id = (*i)->ref_id(); 312 } 313 314 next_ref_scaff = ref_mRNAs.begin(); 315 } 316 set_mask_rnas(const vector<boost::shared_ptr<Scaffold>> & masks)317 void set_mask_rnas(const vector<boost::shared_ptr<Scaffold> >& masks) 318 { 319 #if ENABLE_THREADS 320 boost::mutex::scoped_lock lock(_factory_lock); 321 #endif 322 mask_gtf_recs = masks; 323 RefID last_id = 0; 324 for (vector<boost::shared_ptr<Scaffold> >::iterator i = mask_gtf_recs.begin(); i < mask_gtf_recs.end(); ++i) 325 { 326 if ((*i)->ref_id() != last_id) 327 { 328 
_mask_scaff_offsets.push_back(make_pair((*i)->ref_id(), i)); 329 } 330 last_id = (*i)->ref_id(); 331 } 332 333 next_mask_scaff = mask_gtf_recs.begin(); 334 } 335 bad_intron_table(const BadIntronTable & bad_introns)336 void bad_intron_table(const BadIntronTable& bad_introns) 337 { 338 #if ENABLE_THREADS 339 boost::mutex::scoped_lock lock(_factory_lock); 340 #endif 341 _bad_introns = bad_introns; 342 } 343 read_group_properties(boost::shared_ptr<ReadGroupProperties> rg)344 void read_group_properties(boost::shared_ptr<ReadGroupProperties> rg) 345 { 346 #if ENABLE_THREADS 347 boost::mutex::scoped_lock lock(_factory_lock); 348 #endif 349 _rg_props = rg; 350 } 351 read_group_properties()352 boost::shared_ptr<ReadGroupProperties> read_group_properties() 353 { 354 return _rg_props; 355 } 356 357 bool spans_bad_intron(const ReadHit& read); 358 359 private: 360 361 bool _expand_by_hits(HitBundle& bundle); 362 bool _expand_by_refs(HitBundle& bundle); 363 364 boost::shared_ptr<HitFactory> _hit_fac; 365 366 vector<boost::shared_ptr<Scaffold> > ref_mRNAs; 367 //FILE* ref_mRNA_file; 368 vector<pair<RefID, vector<boost::shared_ptr<Scaffold> >::iterator> > _ref_scaff_offsets; 369 vector<boost::shared_ptr<Scaffold> >::iterator next_ref_scaff; 370 371 vector<boost::shared_ptr<Scaffold> > mask_gtf_recs; 372 //FILE* mask_file; 373 vector<pair<RefID, vector<boost::shared_ptr<Scaffold> >::iterator> > _mask_scaff_offsets; 374 vector<boost::shared_ptr<Scaffold> >::iterator next_mask_scaff; 375 376 BadIntronTable _bad_introns; 377 378 boost::shared_ptr<ReadGroupProperties> _rg_props; 379 380 // Sets nva to point to the next valid alignment 381 // Returns the mass of any alignments that are seen, valid or not 382 double next_valid_alignment(const ReadHit*& nva); 383 384 // Backs up the factory to before the last valid alignment 385 // and returns the mass of that alignment (rh) 386 double rewind_hit(const ReadHit* rh); 387 388 BundleMode _bundle_mode; 389 int _prev_pos; 390 RefID 
    _prev_ref_id;
    int _num_bundles;   // total bundle count; set via num_bundles(n) after inspection
    int _curr_bundle;   // index of the next bundle to hand out

    boost::mt19937 rng;
    boost::uniform_01<boost::mt19937> _zeroone;

#if ENABLE_THREADS
    boost::mutex _factory_lock;
#endif
};

// BundleFactory specialization that replays bundles whose abundances were
// computed in an earlier run (via a PrecomputedExpressionHitFactory).
class PrecomputedExpressionBundleFactory : public BundleFactory
{
public:
    PrecomputedExpressionBundleFactory(boost::shared_ptr<PrecomputedExpressionHitFactory> fac)
    : BundleFactory(fac, REF_DRIVEN), _hit_fac(fac)
    {

    }

    bool next_bundle(HitBundle& bundle_out, bool cache_bundle);

    boost::shared_ptr<const AbundanceGroup> get_abundance_for_locus(int locus_id);
    void clear_abundance_for_locus(int locus_id);

    void reset() { BundleFactory::reset(); }

private:

    // Shadows the base-class factory with its concrete type so the
    // precomputed-abundance API is reachable without casting.
    boost::shared_ptr<PrecomputedExpressionHitFactory> _hit_fac;
#if ENABLE_THREADS
    boost::mutex _factory_lock;
#endif
};

void identify_bad_splices(const HitBundle& bundle,
                          BadIntronTable& bad_splice_ops);

// Thread-safe map from transcript/gene ID to the set of locus descriptions
// it has been assigned to. Backed by a shared map so multiple consumers
// can observe the same registrations.
class IdToLocusMap
{

#if ENABLE_THREADS
    boost::mutex _bl_lock;
#endif

    // Non-copyable: copies would share _id_to_locus_map but not the lock.
    IdToLocusMap(const IdToLocusMap& rhs) {}
public:

    IdToLocusMap(boost::shared_ptr<map<string, set<string> > > id_to_locus_map) :
        _id_to_locus_map(id_to_locus_map) {}

    // Records that ID appears in locus_desc (idempotent per pair).
    void register_locus_to_id(const string& ID, const string& locus_desc)
    {
#if ENABLE_THREADS
        boost::mutex::scoped_lock lock(_bl_lock);
#endif

        pair< map<string, set<string> >::iterator, bool> p = _id_to_locus_map->insert(make_pair(ID, set<string>()));
        p.first->second.insert(locus_desc);
    }

    // Removes locus_desc from ID's set; returns the number of loci still
    // registered for ID (0 if ID was never registered).
    int unregister_locus_from_id(const string& ID, const string&
    locus_desc)
    {
#if ENABLE_THREADS
        boost::mutex::scoped_lock lock(_bl_lock);
#endif

        map<string, set<string> >::iterator itr = _id_to_locus_map->find(ID);
        if (itr != _id_to_locus_map->end()){
            itr->second.erase(locus_desc);
            // NOTE(review): an emptied set is left in the map; callers appear
            // to rely only on the returned count — confirm before pruning.
            return itr->second.size();
        }
        return 0;
    }

    // Number of loci currently registered for ID (0 if unknown).
    int num_locus_registered(const string& ID) {
#if ENABLE_THREADS
        boost::mutex::scoped_lock lock(_bl_lock);
#endif

        map<string, set<string> >::const_iterator itr = _id_to_locus_map->find(ID);
        if (itr == _id_to_locus_map->end())
            return 0;
        return itr->second.size();
    }
private:
    boost::shared_ptr<map<string, set<string> > > _id_to_locus_map;
};

// Scans all bundles once to collect bundle counts, per-locus fragment
// counts, and (optionally) the bad-intron table; drives the progress bar
// and summary statistics output.
void inspect_map(boost::shared_ptr<BundleFactory> bundle_factory,
                 BadIntronTable* bad_introns,
                 vector<LocusCount>& compatible_count_table,
                 vector<LocusCount>& total_count_table,
                 IdToLocusMap& id_to_locus_map,
                 bool progress_bar = true,
                 bool show_stats = true);

#endif