1 #ifndef ABUNDANCES_H 2 #define ABUNDANCES_H 3 /* 4 * abundances.h 5 * cufflinks 6 * 7 * Created by Cole Trapnell on 4/27/09. 8 * Copyright 2009 Cole Trapnell. All rights reserved. 9 * 10 */ 11 12 #ifdef HAVE_CONFIG_H 13 #include <config.h> 14 #endif 15 16 #include <stdint.h> 17 #include <vector> 18 #include <boost/numeric/ublas/matrix.hpp> 19 #include <boost/numeric/ublas/vector.hpp> 20 21 #include <Eigen/Dense> 22 23 #include "hits.h" 24 #include "scaffolds.h" 25 #include "bundles.h" 26 #include "biascorrection.h" 27 28 namespace ublas = boost::numeric::ublas; 29 30 struct ConfidenceInterval 31 { 32 ConfidenceInterval(double Low = 0.0, double High = 0.0) lowConfidenceInterval33 : low(Low), high(High) {} 34 double low; 35 double high; 36 37 private: 38 friend std::ostream & operator<<(std::ostream &os, const ConfidenceInterval &gp); 39 friend class boost::serialization::access; 40 template<class Archive> serializeConfidenceInterval41 void serialize(Archive & ar, const unsigned int /* file_version */){ 42 ar & low & high; 43 } 44 }; 45 46 enum AbundanceStatus { NUMERIC_OK, NUMERIC_FAIL, NUMERIC_LOW_DATA, NUMERIC_HI_DATA }; 47 48 typedef map<boost::shared_ptr<ReadGroupProperties const>, double> CountPerReplicateTable; 49 typedef map<boost::shared_ptr<ReadGroupProperties const>, double> FPKMPerReplicateTable; 50 typedef map<boost::shared_ptr<ReadGroupProperties const>, AbundanceStatus> StatusPerReplicateTable; 51 52 bool fit_negbin_dist(const vector<double> samples, double& r, double& p); 53 long double negbin_log_likelihood(const vector<double>& samples, long double r, long double p); 54 long double poisson_log_likelihood(const vector<double>& samples, long double lambda); 55 56 namespace bser = boost::serialization; 57 58 class Abundance 59 { 60 public: ~Abundance()61 virtual ~Abundance() {} 62 63 // Status of the numerical calculation performed on this object. Safe to 64 // do testing only if status == NUMERIC_OK 65 virtual AbundanceStatus status() const = 0; 66 virtual void status(AbundanceStatus s) = 0; 67 68 // Fragments Per Kilbase of transcript per Million fragments mapped 69 virtual double FPKM() const = 0; 70 virtual void FPKM(double fpkm) = 0; 71 virtual double FPKM_variance() const = 0; 72 virtual void FPKM_variance(double v) = 0; 73 74 virtual ConfidenceInterval FPKM_conf() const = 0; 75 virtual void FPKM_conf(const ConfidenceInterval& cf) = 0; 76 77 // gamma is a fixed property of each transcript or transcript group. It's 78 // the probability that one would draw a fragment from this object, scaled 79 // to an arbitrary locus' worth of fragments. 80 virtual double gamma() const = 0; 81 virtual void gamma(double g) = 0; 82 83 // Kappa is only really meaningful when this Abundance record is part of a 84 // group - it's the relative abundance of this object within the larger 85 // group. 86 virtual double kappa() const = 0; 87 virtual void kappa(double k) = 0; 88 89 // This tracks the final modeled variance in the assigned counts. 90 virtual double num_fragments() const = 0; 91 virtual void num_fragments(double nf) = 0; 92 93 // This tracks the final modeled variance in the assigned counts. 94 virtual double num_fragment_var() const = 0; 95 virtual void num_fragment_var(double nfv) = 0; 96 97 virtual CountPerReplicateTable num_fragments_by_replicate() const = 0; 98 virtual void num_fragments_by_replicate(CountPerReplicateTable& cpr) = 0; 99 100 virtual FPKMPerReplicateTable FPKM_by_replicate() const = 0; 101 virtual void FPKM_by_replicate(CountPerReplicateTable& cpr) = 0; 102 103 virtual StatusPerReplicateTable status_by_replicate() const = 0; 104 virtual void status_by_replicate(StatusPerReplicateTable& fpr) = 0; 105 106 // This tracks the fitted variance from the overdispersion model, 107 // and does not include the fragment assignment uncertainty. 108 virtual double mass_variance() const = 0; 109 virtual void mass_variance(double mv) = 0; 110 111 // This tracks the fragment assignment uncertainty, 112 // and does not include the overdispersion. 113 virtual double num_fragment_uncertainty_var() const = 0; 114 virtual void num_fragment_uncertainty_var(double mv) = 0; 115 116 virtual double effective_length() const= 0; 117 virtual void effective_length(double el) = 0; 118 cond_probs()119 virtual const vector<double>* cond_probs() const { return NULL; } 120 virtual void cond_probs(vector<double>* cp) = 0; 121 122 // The structural information for the object, if defined. transfrag()123 virtual boost::shared_ptr<Scaffold> transfrag() const { return boost::shared_ptr<Scaffold>(); } 124 125 virtual const vector<double>& fpkm_samples() const = 0; 126 virtual void fpkm_samples(const vector<double>& s) = 0; 127 128 129 virtual set<string> gene_id() const = 0; 130 virtual set<string> gene_name() const = 0; 131 virtual set<string> tss_id() const = 0; 132 virtual set<string> protein_id() const = 0; 133 134 virtual const string& description() const = 0; 135 virtual void description(const string& d) = 0; 136 137 virtual const string& locus_tag() const = 0; 138 virtual void locus_tag(const string& L) = 0; 139 140 virtual const string& reference_tag() const = 0; 141 virtual void reference_tag(const string& r) = 0; 142 143 template<class Archive> serialize(Archive & ar,const unsigned int file_version)144 void serialize(Archive & ar, const unsigned int file_version) 145 { 146 147 } 148 149 virtual void clear_non_serialized_data() = 0; 150 }; 151 152 BOOST_SERIALIZATION_ASSUME_ABSTRACT(Abundance); 153 154 class TranscriptAbundance : public Abundance 155 { 156 public: 157 TranscriptAbundance()158 TranscriptAbundance() : 159 _status(NUMERIC_OK), 160 _transfrag(boost::shared_ptr<Scaffold>()), 161 _FPKM(0), 162 _FPKM_variance(0), 163 _gamma(0), 164 _kappa(1.0), 165 _num_fragments(0), 166 _num_fragment_var(0), 167 _num_fragment_uncertainty_var(0), 168 _eff_len(0), 169 _cond_probs(NULL), 170 _sample_mass_variance(0.0){} 171 ~TranscriptAbundance()172 ~TranscriptAbundance() 173 { 174 if (_cond_probs != NULL) 175 { 176 delete _cond_probs; 177 _cond_probs = NULL; 178 } 179 } 180 status()181 AbundanceStatus status() const { return _status; } status(AbundanceStatus s)182 void status(AbundanceStatus s) { _status = s; } 183 FPKM()184 double FPKM() const { return _FPKM; } FPKM(double fpkm)185 void FPKM(double fpkm) 186 { 187 _FPKM = fpkm; 188 if (_transfrag) 189 _transfrag->fpkm(fpkm); 190 } FPKM_variance()191 double FPKM_variance() const { return _FPKM_variance; } 192 void FPKM_variance(double v); 193 FPKM_conf()194 ConfidenceInterval FPKM_conf() const { return _FPKM_conf; } FPKM_conf(const ConfidenceInterval & cf)195 void FPKM_conf(const ConfidenceInterval& cf) { _FPKM_conf = cf; } 196 gamma()197 double gamma() const { return _gamma; } gamma(double g)198 void gamma(double g) { assert(!isnan(g)); _gamma = g; }; 199 kappa()200 double kappa() const { return _kappa; } kappa(double k)201 void kappa(double k) { _kappa = k; } 202 203 // This returns the estimated number of fragments num_fragments()204 double num_fragments() const { return _num_fragments; } num_fragments(double nf)205 void num_fragments(double nf) 206 { 207 assert (!isnan(nf)); 208 _num_fragments = nf; 209 if (_transfrag) 210 _transfrag->num_fragments(nf); 211 } 212 213 // This tracks the final modeled variance in the assigned counts. num_fragment_var()214 double num_fragment_var() const { return _num_fragment_var; } num_fragment_var(double nfv)215 void num_fragment_var(double nfv) { assert (!isnan(nfv)); _num_fragment_var = nfv; } 216 217 // This tracks the fragment assignment uncertainty, 218 // and does not include the overdispersion. num_fragment_uncertainty_var()219 double num_fragment_uncertainty_var() const { return _num_fragment_uncertainty_var; } num_fragment_uncertainty_var(double nfv)220 void num_fragment_uncertainty_var(double nfv) { assert (!isnan(nfv)); _num_fragment_uncertainty_var = nfv; } 221 num_fragments_by_replicate()222 CountPerReplicateTable num_fragments_by_replicate() const { return _num_fragments_per_replicate; } num_fragments_by_replicate(CountPerReplicateTable & cpr)223 void num_fragments_by_replicate(CountPerReplicateTable& cpr) { _num_fragments_per_replicate = cpr; } 224 FPKM_by_replicate()225 FPKMPerReplicateTable FPKM_by_replicate() const { return _fpkm_per_replicate; } FPKM_by_replicate(FPKMPerReplicateTable & fpr)226 void FPKM_by_replicate(FPKMPerReplicateTable& fpr) { _fpkm_per_replicate = fpr; } 227 status_by_replicate()228 StatusPerReplicateTable status_by_replicate() const { return _status_per_replicate; } status_by_replicate(StatusPerReplicateTable & fpr)229 void status_by_replicate(StatusPerReplicateTable& fpr) { _status_per_replicate = fpr; } 230 mass_variance()231 double mass_variance() const { return _sample_mass_variance; } mass_variance(double mv)232 void mass_variance(double mv) { _sample_mass_variance = mv; } 233 transfrag(boost::shared_ptr<Scaffold> tf)234 void transfrag(boost::shared_ptr<Scaffold> tf) { _transfrag = tf; } transfrag()235 boost::shared_ptr<Scaffold> transfrag() const { return _transfrag; } 236 effective_length()237 double effective_length() const { return _eff_len; } effective_length(double el)238 void effective_length(double el) { _eff_len = el; } 239 cond_probs()240 const vector<double>* cond_probs() const { return _cond_probs; } cond_probs(vector<double> * cp)241 void cond_probs(vector<double>* cp) 242 { 243 if(_cond_probs != NULL) { delete _cond_probs; }; 244 _cond_probs = cp; 245 } 246 fpkm_samples()247 const vector<double>& fpkm_samples() const { return _fpkm_samples; } fpkm_samples(const vector<double> & s)248 void fpkm_samples(const vector<double>& s) { _fpkm_samples = s; } 249 gene_id()250 set<string> gene_id() const 251 { 252 if (_transfrag) 253 { 254 set<string> s; 255 s.insert(_transfrag->annotated_gene_id()); 256 return s; 257 } 258 else 259 { 260 assert (false); 261 return set<string>(); 262 } 263 } 264 gene_name()265 set<string> gene_name() const 266 { 267 if (_transfrag) 268 { 269 set<string> s; 270 s.insert(_transfrag->annotated_gene_name()); 271 return s; 272 } 273 else 274 { 275 assert (false); 276 return set<string>(); 277 } 278 } 279 tss_id()280 set<string> tss_id() const 281 { 282 if (_transfrag) 283 { 284 set<string> s; 285 s.insert(_transfrag->annotated_tss_id()); 286 return s; 287 } 288 else 289 { 290 assert (false); 291 return set<string>(); 292 } 293 } 294 protein_id()295 set<string> protein_id() const 296 { 297 if (_transfrag) 298 { 299 set<string> s; 300 s.insert(_transfrag->annotated_protein_id()); 301 return s; 302 } 303 else 304 { 305 assert (false); 306 return set<string>(); 307 } 308 } 309 description()310 virtual const string& description() const { return _description; } description(const string & d)311 virtual void description(const string& d) { _description = d; } 312 locus_tag()313 virtual const string& locus_tag() const { return _locus_tag; } locus_tag(const string & L)314 virtual void locus_tag(const string& L) { _locus_tag = L; } 315 reference_tag()316 virtual const string& reference_tag() const { return _ref_tag; } reference_tag(const string & r)317 virtual void reference_tag(const string& r) { _ref_tag = r; } 318 319 void clear_non_serialized_data(); 320 private: 321 322 friend std::ostream & operator<<(std::ostream &os, const TranscriptAbundance &gp); 323 friend class boost::serialization::access; 324 325 template<class Archive> serialize(Archive & ar,const unsigned int)326 void serialize(Archive & ar, const unsigned int /* file_version */){ 327 328 ar & _status; 329 //ar & _transfrag; 330 ar & _FPKM; 331 ar & _FPKM_conf; 332 ar & _gamma; 333 ar & _kappa; 334 ar & _num_fragments; 335 ar & _num_fragment_var; 336 ar & _num_fragment_uncertainty_var; 337 ar & _eff_len; 338 //ar & _cond_probs; 339 ar & _description; 340 ar & _locus_tag; 341 ar & _ref_tag; 342 ar & _sample_mass_variance; 343 ar & _num_fragments_per_replicate; 344 ar & _fpkm_per_replicate; 345 ar & _status_per_replicate; 346 } 347 348 void calculate_FPKM_err_bar(double variance); 349 350 AbundanceStatus _status; 351 boost::shared_ptr<Scaffold> _transfrag; 352 double _FPKM; 353 double _FPKM_variance; 354 ConfidenceInterval _FPKM_conf; 355 double _gamma; 356 double _kappa; 357 double _num_fragments; 358 double _num_fragment_var; 359 double _num_fragment_uncertainty_var; 360 double _eff_len; 361 vector<double>* _cond_probs; 362 363 vector<double> _fpkm_samples; 364 365 string _description; 366 string _locus_tag; 367 string _ref_tag; 368 369 long double _sample_mass_variance; 370 371 CountPerReplicateTable _num_fragments_per_replicate; 372 FPKMPerReplicateTable _fpkm_per_replicate; 373 StatusPerReplicateTable _status_per_replicate; 374 }; 375 376 //BOOST_CLASS_EXPORT_GUID(TranscriptAbundance, "TranscriptAbundance"); 377 //BOOST_SERIALIZATION_boost::shared_ptr(TranscriptAbundance) 378 379 class AbundanceGroup : public Abundance 380 { 381 public: AbundanceGroup()382 AbundanceGroup() : 383 _kappa(1.0), 384 _FPKM_variance(0.0), 385 _salient_frags(0.0), 386 _total_frags(0.0) {} 387 AbundanceGroup(const vector<boost::shared_ptr<Abundance>> & abundances)388 AbundanceGroup(const vector<boost::shared_ptr<Abundance> >& abundances) : 389 _abundances(abundances), 390 _iterated_exp_count_covariance(ublas::zero_matrix<double>(abundances.size(), abundances.size())), 391 _count_covariance(ublas::zero_matrix<double>(abundances.size(), abundances.size())), 392 _fpkm_covariance(ublas::zero_matrix<double>(abundances.size(), abundances.size())), 393 _gamma_covariance(ublas::zero_matrix<double>(abundances.size(), abundances.size())), 394 _kappa_covariance(ublas::zero_matrix<double>(abundances.size(), abundances.size())), 395 _kappa(1.0), 396 _FPKM_variance(0.0), 397 _salient_frags(0.0), 398 _total_frags(0.0) {} 399 400 AbundanceGroup(const vector<boost::shared_ptr<Abundance> >& abundances, 401 const ublas::matrix<double>& gamma_covariance, 402 const ublas::matrix<double>& iterated_exp_count_covariance, 403 const ublas::matrix<double>& count_covariance, 404 const ublas::matrix<double>& fpkm_covariance, 405 const std::set<boost::shared_ptr<ReadGroupProperties const > >& rg_props); 406 407 AbundanceStatus status() const; status(AbundanceStatus s)408 void status(AbundanceStatus s) { } 409 bool has_member_with_status(AbundanceStatus member_status) const; 410 411 double FPKM() const; FPKM(double fpkm)412 void FPKM(double fpkm) { } 413 FPKM_variance()414 double FPKM_variance() const { return _FPKM_variance; } FPKM_variance(double v)415 void FPKM_variance(double v) { } 416 FPKM_conf()417 ConfidenceInterval FPKM_conf() const { return _FPKM_conf; } 418 419 420 double gamma() const; gamma(double g)421 void gamma(double g) { }; 422 kappa()423 double kappa() const { return _kappa; } kappa(double k)424 void kappa(double k) { _kappa = k; } 425 426 double num_fragments() const; num_fragments(double nf)427 void num_fragments(double nf) { } 428 429 // This tracks the final modeled variance in the assigned counts. 430 double num_fragment_var() const; num_fragment_var(double nfv)431 void num_fragment_var(double nfv) { } 432 433 // This tracks the uncertainty in the assigned counts 434 double num_fragment_uncertainty_var() const; num_fragment_uncertainty_var(double nfv)435 void num_fragment_uncertainty_var(double nfv) { } 436 437 CountPerReplicateTable num_fragments_by_replicate() const; num_fragments_by_replicate(CountPerReplicateTable & cpr)438 void num_fragments_by_replicate(CountPerReplicateTable& cpr) { } 439 440 FPKMPerReplicateTable FPKM_by_replicate() const; FPKM_by_replicate(FPKMPerReplicateTable & fpr)441 void FPKM_by_replicate(FPKMPerReplicateTable& fpr) { } 442 443 StatusPerReplicateTable status_by_replicate() const; status_by_replicate(StatusPerReplicateTable & fpr)444 void status_by_replicate(StatusPerReplicateTable& fpr) { } 445 446 double mass_variance() const; mass_variance(double mf)447 void mass_variance(double mf) { } 448 449 set<string> gene_id() const; 450 set<string> gene_name() const; 451 set<string> tss_id() const; 452 set<string> protein_id() const; 453 description()454 virtual const string& description() const { return _description; } description(const string & d)455 virtual void description(const string& d) { _description = d; } 456 457 virtual const string& locus_tag() const; locus_tag(const string & L)458 virtual void locus_tag(const string& L) { } 459 460 virtual const string& reference_tag() const; reference_tag(const string & r)461 virtual void reference_tag(const string& r) { } 462 463 double effective_length() const; 464 465 //DUMMY FUNCTIONS effective_length(double ef)466 void effective_length(double ef) {} cond_probs(vector<double> * cp)467 void cond_probs(vector<double>* cp) {} 468 469 void filter_group(const vector<bool>& to_keep, 470 AbundanceGroup& filtered_group) const; 471 472 void get_transfrags(vector<boost::shared_ptr<Abundance> >& transfrags) const; 473 abundances()474 vector<boost::shared_ptr<Abundance> >& abundances() { return _abundances; } abundances()475 const vector<boost::shared_ptr<Abundance> >& abundances() const { return _abundances; } 476 gamma_cov()477 const ublas::matrix<double>& gamma_cov() const { return _gamma_covariance; } 478 iterated_count_cov()479 const ublas::matrix<double>& iterated_count_cov() const { return _iterated_exp_count_covariance; } 480 count_cov()481 const ublas::matrix<double>& count_cov() const { return _count_covariance; } 482 kappa_cov()483 const ublas::matrix<double>& kappa_cov() const { return _kappa_covariance; } 484 fpkm_cov()485 const ublas::matrix<double>& fpkm_cov() const { return _fpkm_covariance; } 486 assigned_counts()487 const vector<Eigen::VectorXd>& assigned_counts() const { return _assigned_count_samples; } 488 fpkm_samples()489 const vector<double>& fpkm_samples() const { return _fpkm_samples; } fpkm_samples(const vector<double> & s)490 void fpkm_samples(const vector<double>& s) { _fpkm_samples = s; } clear_fpkm_samples()491 void clear_fpkm_samples() { 492 _fpkm_samples.clear(); 493 std::vector<double>().swap(_fpkm_samples); 494 for (size_t i = 0; i < _member_fpkm_samples.size(); ++i) 495 { 496 _member_fpkm_samples.clear(); 497 std::vector<Eigen::VectorXd>().swap(_member_fpkm_samples); 498 } 499 } 500 member_fpkm_samples()501 const vector<Eigen::VectorXd>& member_fpkm_samples() const { return _member_fpkm_samples; } member_fpkm_samples(const vector<Eigen::VectorXd> & s)502 void member_fpkm_samples(const vector<Eigen::VectorXd>& s) { _member_fpkm_samples = s; } 503 504 void calculate_abundance(const vector<MateHit>& alignments, 505 bool perform_collapse = true, 506 bool calculate_variance = true); 507 salient_frags()508 double salient_frags() const { return _salient_frags; } salient_frags(double nf)509 void salient_frags(double nf) { _salient_frags = nf; } 510 total_frags()511 double total_frags() const { return _total_frags; } total_frags(double nf)512 void total_frags(double nf) { _total_frags = nf; } 513 rg_props()514 const std::set<boost::shared_ptr<ReadGroupProperties const> >& rg_props() const { return _read_group_props; } 515 init_rg_props(const std::set<boost::shared_ptr<ReadGroupProperties const>> & rg)516 void init_rg_props(const std::set<boost::shared_ptr<ReadGroupProperties const> >& rg) 517 { 518 _read_group_props = rg; 519 _count_per_replicate.clear(); 520 for ( std::set<boost::shared_ptr<ReadGroupProperties const> >::const_iterator itr = rg.begin(); 521 itr != rg.end(); 522 ++itr) 523 { 524 _count_per_replicate[*itr] = 0; 525 } 526 } 527 528 void fit_gamma_distributions(); 529 530 void clear_non_serialized_data(); 531 532 void aggregate_replicate_abundances(const std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >& ab_group_per_replicate); 533 534 void calculate_abundance_group_variance(const vector<boost::shared_ptr<Abundance> >& transcripts, 535 const std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >& ab_group_per_replicate); 536 collect_per_replicate_mass(std::map<boost::shared_ptr<ReadGroupProperties const>,boost::shared_ptr<const AbundanceGroup>> & ab_group_per_replicate)537 void collect_per_replicate_mass(std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >& ab_group_per_replicate) 538 { 539 _count_per_replicate.clear(); 540 for (std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<const AbundanceGroup> >::const_iterator itr = ab_group_per_replicate.begin(); 541 itr != ab_group_per_replicate.end(); ++itr) 542 { 543 _count_per_replicate[itr->first] = itr->second->num_fragments(); 544 } 545 } 546 547 static void apply_normalization_to_abundances(const map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<const AbundanceGroup> >& unnormalized_ab_group_per_replicate, 548 map<boost::shared_ptr<const ReadGroupProperties>, boost::shared_ptr<AbundanceGroup> >& normalized_ab_group_per_replicate); 549 550 private: 551 552 void calculate_abundance_for_replicate(const vector<MateHit>& alignments, bool perform_collapse); 553 554 555 FPKM_conf(const ConfidenceInterval & cf)556 void FPKM_conf(const ConfidenceInterval& cf) { _FPKM_conf = cf; } 557 558 bool calculate_gammas(const vector<MateHit>& nr_alignments, 559 const vector<double>& log_conv_factors, 560 const vector<boost::shared_ptr<Abundance> >& transcripts, 561 const vector<boost::shared_ptr<Abundance> >& mapped_transcripts); 562 void calculate_FPKM_covariance(); 563 564 void calculate_conf_intervals(); 565 void calculate_locus_scaled_mass_and_variance(const vector<boost::shared_ptr<Abundance> >& transcripts); 566 567 568 AbundanceStatus calculate_per_replicate_abundances(vector<boost::shared_ptr<Abundance> >& transcripts, 569 const std::map<boost::shared_ptr<ReadGroupProperties const >, vector<MateHit> >& alignments_per_read_group, 570 std::map<boost::shared_ptr<ReadGroupProperties const >, boost::shared_ptr<AbundanceGroup> >& ab_group_per_replicate, 571 bool perform_collapse = true); 572 573 574 void calculate_kappas(); 575 576 577 void update_multi_reads(const vector<MateHit>& alignments, vector<boost::shared_ptr<Abundance> > transcripts); 578 579 void collect_per_replicate_mass(const vector<MateHit>& alignments, 580 vector<boost::shared_ptr<Abundance> >& transcripts); 581 582 void generate_fpkm_samples(); 583 584 friend std::ostream & operator<<(std::ostream &os, const AbundanceGroup &gp); 585 friend class boost::serialization::access; 586 587 template<class Archive> save(Archive & ar,const unsigned int version)588 void save(Archive & ar, const unsigned int version) const 589 { 590 //ar & _abundances; 591 vector<TranscriptAbundance*> tmp; 592 BOOST_FOREACH(boost::shared_ptr<Abundance> ab, _abundances) 593 { 594 tmp.push_back((TranscriptAbundance*)&(*ab)); 595 } 596 ar & tmp; 597 ar & _iterated_exp_count_covariance; 598 ar & _count_covariance; 599 ar & _fpkm_covariance; 600 ar & _gamma_covariance; 601 602 //ar & _FPKM_conf; 603 //ar & _kappa_covariance; 604 //ar & _assign_probs; 605 606 ar & _kappa; 607 ar & _FPKM_variance; 608 ar & _description; 609 //ar & _salient_frags; 610 ar & _total_frags; 611 //ar & _fpkm_samples; // don't save the samples 612 ar & _read_group_props; 613 //ar & _member_fpkm_samples // don't save the member samples either 614 ar & _count_per_replicate; 615 616 } 617 template<class Archive> load(Archive & ar,const unsigned int version)618 void load(Archive & ar, const unsigned int version) 619 { 620 621 vector<TranscriptAbundance*> tmp; 622 ar & tmp; 623 BOOST_FOREACH(TranscriptAbundance* ab, tmp) 624 { 625 _abundances.push_back(boost::shared_ptr<Abundance>(ab)); 626 } 627 628 //ar & _abundances; 629 630 ar & _iterated_exp_count_covariance; 631 ar & _count_covariance; 632 ar & _fpkm_covariance; 633 ar & _gamma_covariance; 634 635 //ar & _FPKM_conf; 636 //ar & _kappa_covariance; 637 //ar & _assign_probs; 638 639 ar & _kappa; 640 ar & _FPKM_variance; 641 ar & _description; 642 //ar & _salient_frags; 643 ar & _total_frags; 644 //ar & _fpkm_samples; // don't save the samples 645 ar & _read_group_props; 646 //ar & _member_fpkm_samples // don't save the member samples either 647 ar & _count_per_replicate; 648 } 649 BOOST_SERIALIZATION_SPLIT_MEMBER() 650 651 //void collect_read_group_props(); 652 653 vector<boost::shared_ptr<Abundance> > _abundances; 654 655 // _iterated_exp_count_covariance is the ITERATED EXPECTATION count covariance matrix. It's not the 656 // estimated count covariance matrix (i.e. it doesn't include biological variability from 657 // the fitted model. 658 ublas::matrix<double> _iterated_exp_count_covariance; 659 660 // _count_covariance is the final count covariance matrix. It's includes our estimates 661 // of transcript-level biological variability on counts 662 ublas::matrix<double> _count_covariance; 663 664 ublas::matrix<double> _fpkm_covariance; 665 ublas::matrix<double> _gamma_covariance; 666 667 ConfidenceInterval _FPKM_conf; 668 669 ublas::matrix<double> _kappa_covariance; 670 ublas::matrix<double> _assign_probs; 671 672 double _kappa; 673 double _FPKM_variance; 674 string _description; 675 double _salient_frags; 676 double _total_frags; 677 678 vector<double> _fpkm_samples; 679 vector<Eigen::VectorXd> _member_fpkm_samples; 680 681 std::set<boost::shared_ptr<ReadGroupProperties const > > _read_group_props; 682 vector<Eigen::VectorXd> _assigned_count_samples; 683 684 map<boost::shared_ptr<ReadGroupProperties const>, double> _count_per_replicate; 685 //std::map<boost::shared_ptr<ReadGroupProperties const >, ublas::vector<double> > _mles_for_read_groups; 686 }; 687 688 struct SampleAbundances 689 { 690 string locus_tag; 691 AbundanceGroup transcripts; 692 vector<AbundanceGroup> primary_transcripts; 693 vector<AbundanceGroup> gene_primary_transcripts; 694 vector<AbundanceGroup> cds; 695 vector<AbundanceGroup> gene_cds; 696 vector<AbundanceGroup> genes; 697 double cluster_mass; 698 }; 699 700 void compute_cond_probs_and_effective_lengths(const vector<MateHit>& alignments, 701 vector<boost::shared_ptr<Abundance> >& transcripts, 702 vector<boost::shared_ptr<Abundance> >& mapped_transcripts); 703 704 void compute_compatibilities(const vector<boost::shared_ptr<Abundance> >& transcripts, 705 const vector<MateHit>& alignments, 706 vector<vector<char> >& compatibilities); 707 708 void get_alignments_from_scaffolds(const vector<boost::shared_ptr<Abundance> >& abundances, 709 vector<MateHit>& alignments); 710 711 AbundanceStatus empirical_replicate_gammas(vector<boost::shared_ptr<Abundance> >& transcripts, 712 const vector<MateHit>& nr_alignments, 713 const vector<double>& log_conv_factors, 714 ublas::vector<double>& gamma_map_estimate, 715 ublas::matrix<double>& gamma_map_covariance, 716 std::map<boost::shared_ptr<ReadGroupProperties const >, ublas::vector<double> >& mles_for_read_groups); 717 718 AbundanceStatus gamma_mle(const vector<boost::shared_ptr<Abundance> >& transcripts, 719 const vector<MateHit>& nr_alignments, 720 const vector<double>& log_conv_factors, 721 vector<double>& gammas, 722 bool check_identifiability = true); 723 724 double compute_doc(int bundle_origin, 725 const vector<Scaffold>& scaffolds, 726 vector<float>& depth_of_coverage, 727 map<pair<int, int>, float>& intron_depth_of_coverage, 728 bool exclude_intra_intron=false, 729 vector<float>* intronic_cov=NULL, 730 vector<int>* scaff_intronic_status=NULL); 731 732 double major_isoform_intron_doc(map<pair<int, int>, float>& intron_doc); 733 734 void record_doc_for_scaffolds(int bundle_origin, 735 const std::vector<Scaffold>& hits, 736 const std::vector<float>& depth_of_coverage, 737 const std::map<std::pair<int, int>, float>& intron_depth_of_coverage, 738 std::vector<double>& scaff_doc); 739 740 void record_doc_for_scaffolds(int bundle_origin, 741 const std::vector<Scaffold>& hits, 742 const std::vector<float>& depth_of_coverage, 743 std::vector<double>& scaff_doc); 744 745 void record_min_doc_for_scaffolds(int bundle_origin, 746 const std::vector<Scaffold>& hits, 747 const std::vector<float>& depth_of_coverage, 748 const std::map<std::pair<int, int>, float>& intron_depth_of_coverage, 749 std::vector<double>& scaff_doc); 750 751 752 double get_intron_doc(const Scaffold& s, 753 const map<pair<int, int>, float>& intron_depth_of_coverage); 754 755 double get_scaffold_doc(int bundle_origin, 756 const Scaffold& s, 757 const vector<float>& depth_of_coverage); 758 759 double get_scaffold_min_doc(int bundle_origin, 760 const Scaffold& s, 761 const vector<float>& depth_of_coverage); 762 763 AbundanceStatus calculate_inverse_fisher(const vector<boost::shared_ptr<Abundance> >& transcripts, 764 const vector<MateHit>& alignments, 765 const ublas::vector<double>& gamma_mean, 766 ublas::matrix<double>& inverse_fisher); 767 768 void calculate_assignment_probs(const Eigen::VectorXd& alignment_multiplicities, 769 const Eigen::MatrixXd& transcript_cond_probs, 770 const Eigen::VectorXd& proposed_gammas, 771 Eigen::MatrixXd& assignment_probs); 772 773 void calculate_average_assignment_probs(const Eigen::VectorXd& alignment_multiplicities, 774 const Eigen::MatrixXd& transcript_cond_probs, 775 const Eigen::VectorXd& proposed_gammas, 776 Eigen::MatrixXd& assignment_probs); 777 778 void calculate_iterated_exp_count_covariance(const vector<double>& gammas, 779 const vector<MateHit>& nr_alignments, 780 const vector<boost::shared_ptr<Abundance> >& transcripts, 781 ublas::matrix<double>& count_covariance); 782 783 bool simulate_count_covariance(const vector<double>& num_fragments, 784 const vector<double>& frag_variances, 785 const ublas::matrix<double>& iterated_exp_count_covariances, 786 const vector<boost::shared_ptr<Abundance> >& transcripts, 787 ublas::matrix<double>& count_covariances, 788 vector<Eigen::VectorXd>& assigned_count_samples, 789 vector<ublas::vector<double> >* gamma_samples); 790 791 void sample_abundance_worker(const string& locus_tag, 792 const set<boost::shared_ptr<ReadGroupProperties const> >& rg_props, 793 SampleAbundances& sample, 794 boost::shared_ptr<HitBundle> sample_bundle, 795 bool perform_cds_analysis, 796 bool perform_tss_analysis, 797 bool calculate_variance); 798 799 void merge_precomputed_expression_worker(const string& locus_tag, 800 const vector<boost::shared_ptr<PrecomputedExpressionBundleFactory> >& expression_factories, 801 SampleAbundances& sample, 802 boost::shared_ptr<HitBundle> sample_bundle, 803 bool perform_cds_analysis, 804 bool perform_tss_analysis, 805 bool calculate_variance); 806 #endif 807