1 #ifndef BWT_MAP_H 2 #define BWT_MAP_H 3 4 #ifdef HAVE_CONFIG_H 5 #include <config.h> 6 #endif 7 8 #include <iostream> 9 #include <fstream> 10 #include <stdint.h> 11 #include <stdio.h> 12 #include <stdlib.h> 13 #include <string> 14 #include <cstring> 15 #include <map> 16 #include <vector> 17 #include <cassert> 18 19 #include <boost/shared_ptr.hpp> 20 21 #ifdef HAVE_HTSLIB 22 #include <htslib/hts.h> 23 #include <htslib/hfile.h> 24 #include <htslib/cram.h> 25 #include <htslib/bgzf.h> 26 #include <htslib/sam.h> 27 #else 28 #include <bam/sam.h> 29 #endif 30 31 #include "common.h" 32 #include "multireads.h" 33 34 using namespace std; 35 36 /* 37 * hits.h 38 * Cufflinks 39 * 40 * Created by Cole Trapnell on 3/23/09. 41 * Copyright 2009 Cole Trapnell. All rights reserved. 42 * 43 */ 44 45 enum CuffStrand { CUFF_STRAND_UNKNOWN = 0, CUFF_FWD = 1, CUFF_REV = 2, CUFF_BOTH = 3 }; 46 47 48 enum CigarOpCode 49 { 50 MATCH = BAM_CMATCH, 51 INS = BAM_CINS, 52 DEL = BAM_CDEL, 53 REF_SKIP = BAM_CREF_SKIP, 54 SOFT_CLIP = BAM_CSOFT_CLIP, 55 HARD_CLIP = BAM_CHARD_CLIP, 56 PAD = BAM_CPAD 57 }; 58 59 struct CigarOp 60 { CigarOpCigarOp61 CigarOp(CigarOpCode o, uint32_t l) : opcode(o), length(l) {} 62 CigarOpCode opcode : 3; 63 uint32_t length : 29; 64 65 bool operator==(const CigarOp& rhs) const { return opcode == rhs.opcode && length == rhs.length; } 66 67 }; 68 69 typedef uint64_t InsertID; 70 typedef uint64_t RefID; 71 72 extern int num_deleted; 73 74 /* Stores the information from a single record of the bowtie map. A given read 75 may have many of these. 76 */ 77 struct ReadHit 78 { ReadHitReadHit79 ReadHit() : 80 _ref_id(0), 81 _insert_id(0), 82 _base_mass(1.0), 83 _edit_dist(0xFFFFFFFF), 84 _num_hits(1) 85 { 86 num_deleted++; 87 } 88 ReadHitReadHit89 ReadHit(RefID ref_id, 90 InsertID insert_id, 91 int left, 92 int read_len, 93 CuffStrand source_strand, 94 RefID partner_ref, 95 int partner_pos, 96 unsigned int edit_dist, 97 int num_hits, 98 float base_mass, 99 uint32_t sam_flag) : 100 _ref_id(ref_id), 101 _insert_id(insert_id), 102 _left(left), 103 _partner_ref_id(partner_ref), 104 _partner_pos(partner_pos), 105 _cigar(vector<CigarOp>(1,CigarOp(MATCH,read_len))), 106 _source_strand(source_strand), 107 _base_mass(base_mass), 108 _edit_dist(edit_dist), 109 _num_hits(num_hits), 110 _sam_flag(sam_flag) 111 { 112 assert(_cigar.capacity() == _cigar.size()); 113 _right = get_right(); 114 num_deleted++; 115 } 116 ReadHitReadHit117 ReadHit(RefID ref_id, 118 InsertID insert_id, 119 int left, 120 const vector<CigarOp>& cigar, 121 CuffStrand source_strand, 122 RefID partner_ref, 123 int partner_pos, 124 unsigned int edit_dist, 125 int num_hits, 126 float base_mass, 127 uint32_t sam_flag) : 128 _ref_id(ref_id), 129 _insert_id(insert_id), 130 _left(left), 131 _partner_ref_id(partner_ref), 132 _partner_pos(partner_pos), 133 _cigar(cigar), 134 _source_strand(source_strand), 135 _base_mass(base_mass), 136 _edit_dist(edit_dist), 137 _num_hits(num_hits), 138 _sam_flag(sam_flag) 139 { 140 assert(_cigar.capacity() == _cigar.size()); 141 _right = get_right(); 142 num_deleted++; 143 } 144 ReadHitReadHit145 ReadHit(const ReadHit& other) 146 { 147 _ref_id = other._ref_id; 148 _insert_id = other._insert_id; 149 _left = other._left; 150 _partner_ref_id = other._partner_ref_id; 151 _partner_pos = other._partner_pos; 152 _cigar = other._cigar; 153 _source_strand = other._source_strand; 154 _num_hits = other._num_hits; 155 _base_mass = other._base_mass; 156 _edit_dist = other._edit_dist; 157 _right = get_right(); 158 _sam_flag = other._sam_flag; 159 num_deleted++; 160 } 161 ~ReadHitReadHit162 ~ReadHit() 163 { 164 --num_deleted; 165 } 166 read_lenReadHit167 int read_len() const 168 { 169 int len = 0; 170 for (size_t i = 0; i < _cigar.size(); ++i) 171 { 172 const CigarOp& op = _cigar[i]; 173 switch(op.opcode) 174 { 175 case MATCH: 176 case INS: 177 case SOFT_CLIP: 178 len += op.length; 179 break; 180 default: 181 break; 182 } 183 } 184 185 return len; 186 } 187 contains_spliceReadHit188 bool contains_splice() const 189 { 190 for (size_t i = 0; i < _cigar.size(); ++i) 191 { 192 if (_cigar[i].opcode == REF_SKIP) 193 return true; 194 } 195 return false; 196 } 197 is_singletonReadHit198 bool is_singleton() const 199 { 200 return (partner_ref_id() == 0 || 201 partner_ref_id() != ref_id() || 202 abs(partner_pos() - left()) > max_partner_dist); 203 } 204 205 bool operator==(const ReadHit& rhs) const 206 { 207 return (_insert_id == rhs._insert_id && 208 _ref_id == rhs._ref_id && 209 antisense_align() == rhs.antisense_align() && 210 _left == rhs._left && 211 _source_strand == rhs._source_strand && 212 /* DO NOT USE ACCEPTED IN COMPARISON */ 213 _cigar == rhs._cigar); 214 } 215 ref_idReadHit216 RefID ref_id() const { return _ref_id; } insert_idReadHit217 InsertID insert_id() const { return _insert_id; } 218 partner_ref_idReadHit219 RefID partner_ref_id() const { return _partner_ref_id; } partner_posReadHit220 int partner_pos() const { return _partner_pos; } 221 leftReadHit222 int left() const { return _left; } rightReadHit223 int right() const { return _right; } source_strandReadHit224 CuffStrand source_strand() const { return _source_strand; } antisense_alignReadHit225 bool antisense_align() const { return _sam_flag & 0x10; } is_firstReadHit226 bool is_first() const { return _sam_flag & 0x40; } 227 228 // Number of multi-hits for this read num_hitsReadHit229 int num_hits() const { return _num_hits; } 230 231 // We are ignoring the _base_mass and re-calculating based on multi-hits massReadHit232 double mass() const 233 { 234 if (is_singleton()) 235 return 1.0/_num_hits; 236 return 0.5 / _num_hits; 237 } 238 239 // For convenience, if you just want a copy of the gap intervals 240 // for this hit. gapsReadHit241 void gaps(vector<pair<int,int> >& gaps_out) const 242 { 243 gaps_out.clear(); 244 int pos = _left; 245 for (size_t i = 0; i < _cigar.size(); ++i) 246 { 247 const CigarOp& op = _cigar[i]; 248 249 switch(op.opcode) 250 { 251 case REF_SKIP: 252 gaps_out.push_back(make_pair(pos, pos + op.length - 1)); 253 pos += op.length; 254 break; 255 case SOFT_CLIP: 256 pos += op.length; 257 break; 258 case HARD_CLIP: 259 break; 260 case MATCH: 261 pos += op.length; 262 break; 263 case INS: 264 pos -= op.length; 265 break; 266 case DEL: 267 pos += op.length; 268 break; 269 default: 270 break; 271 } 272 } 273 } 274 cigarReadHit275 const vector<CigarOp>& cigar() const { return _cigar; } 276 contiguousReadHit277 bool contiguous() const 278 { 279 return _cigar.size() == 1 && _cigar[0].opcode == MATCH; 280 } 281 edit_distReadHit282 unsigned int edit_dist() const { return _edit_dist; } 283 284 void trim(int trimmed_length); 285 286 //const string& hitfile_rec() const { return _hitfile_rec; } 287 //void hitfile_rec(const string& rec) { _hitfile_rec = rec; } 288 289 private: 290 get_rightReadHit291 int get_right() const 292 { 293 int r = _left; 294 for (size_t i = 0; i < _cigar.size(); ++i) 295 { 296 const CigarOp& op = _cigar[i]; 297 298 switch(op.opcode) 299 { 300 case MATCH: 301 case REF_SKIP: 302 case SOFT_CLIP: 303 case DEL: 304 r += op.length; 305 break; 306 case INS: 307 case HARD_CLIP: 308 default: 309 break; 310 } 311 } 312 return r; 313 } 314 315 RefID _ref_id; 316 InsertID _insert_id; // Id of the sequencing insert 317 int _left; // Position in the reference of the left side of the alignment 318 int _right; 319 320 RefID _partner_ref_id; // Reference contig on which we expect the mate 321 int _partner_pos; // Position at which we expect the mate of this hit 322 323 324 vector<CigarOp> _cigar; 325 326 CuffStrand _source_strand; // Which strand the read really came from, if known 327 float _base_mass; 328 unsigned int _edit_dist; // Number of mismatches 329 int _num_hits; // Number of multi-hits (1 by default) 330 uint32_t _sam_flag; 331 //string _hitfile_rec; // Points to the buffer for the record from which this hit came 332 }; 333 334 class ReadTable 335 { 336 public: 337 ReadTable()338 ReadTable() {} 339 340 // This function should NEVER return zero get_id(const string & name)341 InsertID get_id(const string& name) 342 { 343 uint64_t _id = hash_string(name.c_str()); 344 assert(_id); 345 return _id; 346 } 347 348 // Calculate checksum get_cs(const string & name)349 InsertID get_cs(const string& name) 350 { 351 return string_checksum(name.c_str()); 352 } 353 354 private: 355 356 // This is FNV-1, see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash hash_string(const char * __s)357 inline uint64_t hash_string(const char* __s) 358 { 359 uint64_t hash = 0xcbf29ce484222325ull; 360 for ( ; *__s; ++__s) 361 { 362 hash *= 1099511628211ull; 363 hash ^= *__s; 364 } 365 return hash; 366 } 367 368 string_checksum(const char * s)369 inline uint64_t string_checksum(const char * s) 370 { 371 uint64_t c = 0; 372 for ( ; *s; ++s) 373 { 374 c += *s; 375 } 376 return c; 377 } 378 }; 379 380 class RefSequenceTable 381 { 382 public: 383 384 typedef std::string Sequence; 385 386 struct SequenceInfo 387 { SequenceInfoSequenceInfo388 SequenceInfo(uint32_t _order, 389 char* _name, 390 Sequence* _seq) : 391 observation_order(_order), 392 name(_name), 393 seq(_seq) {} 394 uint32_t observation_order; 395 char* name; 396 Sequence* seq; 397 }; 398 399 typedef map<string, RefID> IDTable; 400 typedef map<RefID, SequenceInfo> InvertedIDTable; 401 typedef InvertedIDTable::iterator iterator; 402 typedef InvertedIDTable::const_iterator const_iterator; 403 404 RefSequenceTable(bool keep_names, bool keep_seqs = false) : 405 _next_id(1), 406 _keep_names(keep_names) {} 407 ~RefSequenceTable()408 ~RefSequenceTable() 409 { 410 for (InvertedIDTable::iterator itr = _by_id.begin(); 411 itr != _by_id.end(); 412 ++itr) 413 { 414 free(itr->second.name); 415 } 416 } 417 get_id(const string & name,Sequence * seq)418 RefID get_id(const string& name, 419 Sequence* seq) 420 { 421 if (name.empty()) 422 return 0; 423 #if ENABLE_THREADS 424 table_lock.lock(); 425 #endif 426 uint64_t _id = hash_string(name.c_str()); 427 pair<InvertedIDTable::iterator, bool> ret = 428 _by_id.insert(make_pair(_id, SequenceInfo(_next_id, NULL, NULL))); 429 if (ret.second == true) 430 { 431 char* _name = NULL; 432 if (_keep_names) 433 _name = strdup(name.c_str()); 434 ret.first->second.name = _name; 435 ret.first->second.seq = seq; 436 ++_next_id; 437 } 438 assert (_id); 439 #if ENABLE_THREADS 440 table_lock.unlock(); 441 #endif 442 return _id; 443 } 444 445 // You must call invert() before using this function get_name(RefID ID)446 const char* get_name(RefID ID) const 447 { 448 InvertedIDTable::const_iterator itr = _by_id.find(ID); 449 if (itr != _by_id.end()) 450 { 451 //const SequenceInfo& info = itr->second; 452 return itr->second.name; 453 } 454 else 455 { 456 return NULL; 457 } 458 } 459 get_seq(RefID ID)460 Sequence* get_seq(RefID ID) const 461 { 462 InvertedIDTable::const_iterator itr = _by_id.find(ID); 463 if (itr != _by_id.end()) 464 return itr->second.seq; 465 else 466 return NULL; 467 } 468 get_info(RefID ID)469 const SequenceInfo* get_info(RefID ID) const 470 { 471 472 InvertedIDTable::const_iterator itr = _by_id.find(ID); 473 if (itr != _by_id.end()) 474 { 475 return &(itr->second); 476 } 477 else 478 return NULL; 479 } 480 observation_order(RefID ID)481 int observation_order(RefID ID) const 482 { 483 InvertedIDTable::const_iterator itr = _by_id.find(ID); 484 if (itr != _by_id.end()) 485 { 486 return itr->second.observation_order; 487 } 488 else 489 return -1; 490 } 491 order_recs_lexicographically()492 void order_recs_lexicographically() 493 { 494 map<string, RefID> str_to_id; 495 496 for (InvertedIDTable::iterator i = _by_id.begin(); i != _by_id.end(); ++i) 497 { 498 str_to_id[i->second.name] = i->first; 499 //fprintf(stderr, "%d: %s\n", i->second.observation_order, i->second.name); 500 } 501 502 size_t new_order = 1; 503 for (map<string, RefID>::iterator i = str_to_id.begin(); i != str_to_id.end(); ++i, ++new_order) 504 { 505 _by_id.find(get_id(i->first, NULL))->second.observation_order = new_order; 506 verbose_msg( "%lu: %s\n", new_order, i->first.c_str()); 507 } 508 } 509 print_rec_ordering()510 void print_rec_ordering() 511 { 512 for (InvertedIDTable::iterator i = _by_id.begin(); i != _by_id.end(); ++i) 513 { 514 verbose_msg( "%lu: %s\n", i->second.observation_order, i->second.name); 515 } 516 } 517 begin()518 iterator begin() { return _by_id.begin(); } end()519 iterator end() { return _by_id.end(); } 520 begin()521 const_iterator begin() const { return _by_id.begin(); } end()522 const_iterator end() const { return _by_id.end(); } 523 size()524 size_t size() const { return _by_id.size(); } 525 clear()526 void clear() 527 { 528 //_by_name.clear(); 529 _by_id.clear(); 530 } 531 532 private: 533 534 // This is FNV-1, see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash hash_string(const char * __s)535 inline uint64_t hash_string(const char* __s) 536 { 537 uint64_t hash = 0xcbf29ce484222325ull; 538 for ( ; *__s; ++__s) 539 { 540 hash *= 1099511628211ull; 541 hash ^= *__s; 542 } 543 return hash; 544 } 545 546 //IDTable _by_name; 547 RefID _next_id; 548 bool _keep_names; 549 InvertedIDTable _by_id; 550 #if ENABLE_THREADS 551 static boost::mutex table_lock; 552 #endif 553 }; 554 555 556 bool hit_insert_id_lt(const ReadHit& h1, const ReadHit& h2); 557 558 /****************************************************************************** 559 The HitFactory abstract class is responsible for returning a single ReadHit 560 from an alignment file. 561 *******************************************************************************/ 562 class HitFactory 563 { 564 public: 565 HitFactory(ReadTable & insert_table,RefSequenceTable & reference_table)566 HitFactory(ReadTable& insert_table, 567 RefSequenceTable& reference_table) : 568 _insert_table(insert_table), 569 _ref_table(reference_table), 570 _num_seq_header_recs(0), 571 _undone_hit_present(false) {} 572 573 HitFactory& operator=(const HitFactory& rhs) 574 { 575 if (this != &rhs) 576 { 577 //_hit_file = rhs._hit_file; 578 _insert_table = rhs._insert_table; 579 _ref_table = rhs._ref_table; 580 } 581 return *this; 582 } ~HitFactory()583 virtual ~HitFactory() {} 584 585 ReadHit create_hit(const string& insert_name, 586 const string& ref_name, 587 int left, 588 const vector<CigarOp>& cigar, 589 CuffStrand source_strand, 590 const string& partner_ref, 591 int partner_pos, 592 unsigned int edit_dist, 593 int num_hits, 594 float base_mass, 595 uint32_t sam_flag); 596 597 ReadHit create_hit(const string& insert_name, 598 const string& ref_name, 599 uint32_t left, 600 uint32_t read_len, 601 CuffStrand source_strand, 602 const string& partner_ref, 603 int partner_pos, 604 unsigned int edit_dist, 605 int num_hits, 606 float base_mass, 607 uint32_t sam_flag); 608 609 virtual void reset() = 0; 610 undo_hit()611 void undo_hit() 612 { 613 if(_undone_hit_present) 614 throw "undo_hit can only back up one hit"; 615 _undone_hit_present=true; 616 } 617 next_hit(ReadHit & result)618 bool next_hit(ReadHit &result) 619 { 620 if(_undone_hit_present) 621 { 622 _undone_hit_present=false; 623 } 624 else 625 { 626 if(!read_next_hit(_last_hit)) 627 return false; 628 } 629 result=_last_hit; 630 return true; 631 } 632 633 virtual bool records_remain() const = 0; 634 ref_table()635 RefSequenceTable& ref_table() { return _ref_table; } 636 637 //FILE* hit_file() { return _hit_file; } 638 639 virtual bool inspect_header() = 0; 640 read_group_properties()641 const ReadGroupProperties& read_group_properties() 642 { 643 return _rg_props; 644 } 645 646 protected: 647 648 virtual bool read_next_hit(ReadHit &result) = 0; 649 650 bool parse_header_string(const string& header_rec, 651 ReadGroupProperties& rg_props); 652 653 void parse_header_lines(const string& headers, 654 ReadGroupProperties& rg_props); 655 656 void finalize_rg_props(); 657 658 // TODO: We want to keep a collection of these, indexed by RG ID. See #180 659 ReadGroupProperties _rg_props; 660 661 private: 662 663 664 ReadTable& _insert_table; 665 RefSequenceTable& _ref_table; 666 uint32_t _num_seq_header_recs; 667 bool _undone_hit_present; 668 ReadHit _last_hit; 669 }; 670 671 /****************************************************************************** 672 SAMHitFactory turns SAM alignments into ReadHits 673 *******************************************************************************/ 674 class SAMHitFactory : public HitFactory 675 { 676 public: SAMHitFactory(const string & hit_file_name,ReadTable & insert_table,RefSequenceTable & reference_table)677 SAMHitFactory(const string& hit_file_name, 678 ReadTable& insert_table, 679 RefSequenceTable& reference_table) : 680 HitFactory(insert_table, reference_table), 681 _line_num(0) 682 { 683 _hit_file = fopen(hit_file_name.c_str(), "r"); 684 if (_hit_file == NULL) 685 { 686 throw std::runtime_error("Error: could not open file for reading"); 687 } 688 689 if (inspect_header() == false) 690 { 691 throw std::runtime_error("Error: could not parse SAM header"); 692 } 693 694 // Override header-inferred read group properities with whatever 695 // the user supplied. 696 if (global_read_properties != NULL) 697 { 698 _rg_props = *global_read_properties; 699 } 700 } 701 ~SAMHitFactory()702 ~SAMHitFactory() 703 { 704 if (_hit_file) 705 { 706 fclose(_hit_file); 707 } 708 } 709 reset()710 void reset() { rewind(_hit_file); } 711 records_remain()712 bool records_remain() const { return !feof(_hit_file); } 713 714 bool read_next_hit(ReadHit& buf); 715 716 bool inspect_header(); 717 718 private: 719 static const size_t _hit_buf_max_sz = 10 * 1024; 720 char _hit_buf[_hit_buf_max_sz]; 721 int _line_num; 722 723 FILE* _hit_file; 724 }; 725 726 class BAMHitFactory : public HitFactory 727 { 728 public: BAMHitFactory(ReadTable & insert_table,RefSequenceTable & reference_table)729 BAMHitFactory(ReadTable& insert_table, 730 RefSequenceTable& reference_table) : 731 HitFactory(insert_table, reference_table) 732 {} 733 734 bool get_hit_from_bam1t(const bam1_t* hit_buf, 735 const bam_hdr_t* header, 736 ReadHit& bh); 737 records_remain()738 bool records_remain() const 739 { 740 return !_eof_encountered; 741 } 742 743 protected: 744 int64_t _beginning; 745 bool _eof_encountered; 746 }; 747 748 #ifndef HAVE_HTSLIB 749 750 /****************************************************************************** 751 SamtoolsHitFactory turns SAM alignments into ReadHits 752 *******************************************************************************/ 753 class SamtoolsHitFactory : public BAMHitFactory 754 { 755 public: SamtoolsHitFactory(const string & hit_file_name,ReadTable & insert_table,RefSequenceTable & reference_table)756 SamtoolsHitFactory(const string& hit_file_name, 757 ReadTable& insert_table, 758 RefSequenceTable& reference_table) : 759 BAMHitFactory(insert_table, reference_table) 760 { 761 _hit_file = samopen(hit_file_name.c_str(), "rb", 0); 762 763 memset(&_next_hit, 0, sizeof(_next_hit)); 764 765 if (_hit_file == NULL || _hit_file->header == NULL) 766 { 767 throw std::runtime_error("Fail to open BAM file"); 768 } 769 770 _beginning = bgzf_tell(_hit_file->x.bam); 771 _eof_encountered = false; 772 773 if (inspect_header() == false) 774 { 775 throw std::runtime_error("Error: could not parse BAM header"); 776 } 777 778 // Override header-inferred read group properities with whatever 779 // the user supplied. 780 if (global_read_properties != NULL) 781 { 782 _rg_props = *global_read_properties; 783 } 784 } 785 ~SamtoolsHitFactory()786 ~SamtoolsHitFactory() 787 { 788 if (_hit_file) 789 { 790 samclose(_hit_file); 791 } 792 } 793 reset()794 void reset() 795 { 796 if (_hit_file && _hit_file->x.bam) 797 { 798 bgzf_seek(_hit_file->x.bam, _beginning, SEEK_SET); 799 _eof_encountered = false; 800 } 801 } 802 803 804 bool read_next_hit(ReadHit& bh); 805 806 bool inspect_header(); 807 808 private: 809 samfile_t* _hit_file; 810 bam1_t _next_hit; 811 }; 812 813 #else 814 815 /****************************************************************************** 816 HTSHitFactory uses htslib to turn SAM/BAM/CRAM alignments into ReadHits 817 *******************************************************************************/ 818 819 class HTSHitFactory : public BAMHitFactory 820 { 821 public: HTSHitFactory(const string & hit_file_name,ReadTable & insert_table,RefSequenceTable & reference_table)822 HTSHitFactory(const string& hit_file_name, 823 ReadTable& insert_table, 824 RefSequenceTable& reference_table) : 825 BAMHitFactory(insert_table, reference_table), 826 _hit_file_name(hit_file_name) 827 { 828 _hit_file = hts_open(hit_file_name.c_str(), "r"); 829 if(!_hit_file) 830 throw std::runtime_error("Failed to open SAM/BAM/CRAM file " + hit_file_name); 831 _file_header = sam_hdr_read(_hit_file); 832 if(!_file_header) 833 { 834 hts_close(_hit_file); 835 throw std::runtime_error("Failed to read header from " + hit_file_name); 836 } 837 _next_hit = bam_init1(); 838 if(!_next_hit) 839 throw std::runtime_error("Failed to allocate SAM record"); 840 _eof_encountered = false; 841 842 if (inspect_header() == false) 843 { 844 throw std::runtime_error("Error: could not parse BAM header"); 845 } 846 847 // Override header-inferred read group properities with whatever 848 // the user supplied. 849 if (global_read_properties != NULL) 850 { 851 _rg_props = *global_read_properties; 852 } 853 } 854 ~HTSHitFactory()855 ~HTSHitFactory() 856 { 857 if(_next_hit) 858 bam_destroy1(_next_hit); 859 if(_file_header) 860 bam_hdr_destroy(_file_header); 861 if(_hit_file) 862 hts_close(_hit_file); 863 } 864 reset()865 void reset() 866 { 867 if(_hit_file) 868 { 869 hts_close(_hit_file); 870 _hit_file = hts_open(_hit_file_name.c_str(), "r"); 871 if(!_hit_file) 872 throw std::runtime_error("Failed to reopen SAM/BAM/CRAM file " + _hit_file_name); 873 bam_hdr_t* reread_header = sam_hdr_read(_hit_file); 874 if(!reread_header) 875 throw std::runtime_error("Failed to re-read header from " + _hit_file_name); 876 // Keep the old header: 877 bam_hdr_destroy(reread_header); 878 _eof_encountered = false; 879 } 880 } 881 882 bool read_next_hit(ReadHit& buf); 883 884 bool inspect_header(); 885 886 private: 887 htsFile* _hit_file; 888 bam_hdr_t* _file_header; 889 bam1_t* _next_hit; 890 std::string _hit_file_name; 891 }; 892 893 #endif 894 895 class AbundanceGroup; 896 897 /****************************************************************************** 898 BAMHitFactory turns SAM alignments into ReadHits 899 *******************************************************************************/ 900 class PrecomputedExpressionHitFactory : public HitFactory 901 { 902 public: PrecomputedExpressionHitFactory(const string & expression_file_name,ReadTable & insert_table,RefSequenceTable & reference_table)903 PrecomputedExpressionHitFactory(const string& expression_file_name, 904 ReadTable& insert_table, 905 RefSequenceTable& reference_table) : 906 HitFactory(insert_table, reference_table), _expression_file_name(expression_file_name), _ifs(expression_file_name.c_str()), 907 _ia(boost::shared_ptr<boost::archive::binary_iarchive>(new boost::archive::binary_iarchive(_ifs))) 908 { 909 load_count_tables(expression_file_name); 910 911 if (inspect_header() == false) 912 { 913 throw std::runtime_error("Error: could not parse CXB header"); 914 } 915 916 // Override header-inferred read group properities with whatever 917 // the user supplied. 918 if (global_read_properties != NULL) 919 { 920 _rg_props = *global_read_properties; 921 } 922 923 load_checked_parameters(expression_file_name); 924 925 //map<string, AbundanceGroup> single_sample_tracking; 926 927 _num_loci = 0; 928 *_ia >> _num_loci; 929 930 _curr_locus_idx = 0; 931 _last_locus_id = -1; 932 } 933 ~PrecomputedExpressionHitFactory()934 ~PrecomputedExpressionHitFactory() 935 { 936 937 } 938 records_remain()939 bool records_remain() const 940 { 941 return false; 942 } 943 reset()944 void reset() 945 { 946 _ifs.clear() ; 947 _ifs.seekg(0, ios::beg); 948 _ia = boost::shared_ptr<boost::archive::binary_iarchive>(new boost::archive::binary_iarchive(_ifs)); 949 size_t num_loci = 0; 950 *_ia >> num_loci; 951 _last_locus_id = -1; 952 _curr_locus_idx = 0; 953 _curr_ab_groups.clear(); 954 } 955 956 bool read_next_hit(ReadHit& buf); 957 958 bool inspect_header(); 959 960 boost::shared_ptr<const AbundanceGroup> next_locus(int locus_id, bool cache_locus); 961 962 boost::shared_ptr<const AbundanceGroup> get_abundance_for_locus(int locus_id); 963 void clear_abundance_for_locus(int locus_id); 964 get_compat_mass(int locus_id)965 double get_compat_mass(int locus_id) 966 { 967 map<int, double >::iterator i = compat_mass.find(locus_id); 968 if (i != compat_mass.end()) 969 { 970 return i->second; 971 } 972 else 973 { 974 return 0; 975 } 976 } 977 get_total_mass(int locus_id)978 double get_total_mass(int locus_id) 979 { 980 map<int, double >::iterator i = total_mass.find(locus_id); 981 if (i != total_mass.end()) 982 { 983 return i->second; 984 } 985 else 986 { 987 return 0; 988 } 989 } 990 991 992 private: 993 994 void load_count_tables(const string& expression_file_name); 995 void load_checked_parameters(const string& expression_file_name); 996 997 //map<int, boost::shared_ptr<const AbundanceGroup> > ab_group_table; 998 size_t _num_loci; 999 size_t _curr_locus_idx; 1000 int _last_locus_id; 1001 std::ifstream _ifs; 1002 string _expression_file_name; 1003 boost::shared_ptr<boost::archive::binary_iarchive> _ia; 1004 map<int, double> compat_mass; 1005 map<int, double> total_mass; 1006 map<int, boost::shared_ptr<const AbundanceGroup> > _curr_ab_groups; 1007 1008 boost::shared_ptr<ReadGroupProperties> _cached_rg_props; 1009 1010 #if ENABLE_THREADS 1011 boost::mutex _factory_lock; 1012 #endif 1013 }; 1014 1015 1016 // Forward declaration of BundleFactory, because MateHit will need a pointer 1017 // back to the Factory that created. Ultimately, we should replace this 1018 // with a pointer back to the ReadGroupProperty object corresponding to each 1019 // MateHit. That, however, requires that we link fragment length distributions 1020 // and bias models, etc, with each read group, and that's more than we can 1021 // afford to implement right now. 1022 1023 /******************************************************************************* 1024 MateHit is a class that encapsulates a paired-end alignment as a single object. 1025 MateHits can be "open" when one hit has been read from a stream of individual 1026 read alignments, but the other hasn't. A "closed" MateHit is one where either 1027 both read alignments have been installed in the MateHit, or one read hit has, 1028 but the other will never come (i.e. singletons) 1029 *******************************************************************************/ 1030 class MateHit 1031 { 1032 public: MateHit()1033 MateHit() : 1034 _refid(0), 1035 _left_alignment(NULL), 1036 _right_alignment(NULL), 1037 _collapse_mass(0.0), 1038 _is_mapped(false){} 1039 MateHit(boost::shared_ptr<ReadGroupProperties const> rg_props,RefID refid,const ReadHit * left_alignment,const ReadHit * right_alignment)1040 MateHit(boost::shared_ptr<ReadGroupProperties const> rg_props, 1041 RefID refid, 1042 const ReadHit* left_alignment, 1043 const ReadHit* right_alignment) : 1044 _rg_props(rg_props), 1045 _refid(refid), 1046 _left_alignment(left_alignment), 1047 _right_alignment(right_alignment), 1048 _collapse_mass(0.0), 1049 _is_mapped(false) 1050 { 1051 //_expected_inner_dist = min(genomic_inner_dist(), _expected_inner_dist); 1052 } ~MateHit()1053 ~MateHit() 1054 { 1055 //fprintf(stderr, "Killing hit %lx\n",this); 1056 } 1057 1058 //bool closed() {return _closed;} 1059 read_group_props()1060 boost::shared_ptr<ReadGroupProperties const> read_group_props() const { return _rg_props; } 1061 left_alignment()1062 const ReadHit* left_alignment() const {return _left_alignment;} left_alignment(const ReadHit * left_alignment)1063 void left_alignment(const ReadHit* left_alignment) 1064 { 1065 _left_alignment = left_alignment; 1066 } 1067 right_alignment()1068 const ReadHit* right_alignment() const {return _right_alignment;} right_alignment(const ReadHit * right_alignment)1069 void right_alignment(const ReadHit* right_alignment) 1070 { 1071 _right_alignment = right_alignment; 1072 } 1073 is_mapped()1074 bool is_mapped() const {return _is_mapped;} is_mapped(bool mapped)1075 void is_mapped(bool mapped) 1076 { 1077 _is_mapped = mapped; 1078 } 1079 num_hits()1080 int num_hits() const 1081 { 1082 assert(_left_alignment); 1083 return _left_alignment->num_hits(); 1084 } 1085 is_multi()1086 bool is_multi() const 1087 { 1088 return num_hits() > 1; 1089 } 1090 is_pair()1091 bool is_pair() const 1092 { 1093 return (_left_alignment && _right_alignment); 1094 } 1095 left()1096 int left() const 1097 { 1098 if (_right_alignment && _left_alignment) 1099 { 1100 return min(_right_alignment->left(),_left_alignment->left()); 1101 } 1102 if (_left_alignment) 1103 return _left_alignment->left(); 1104 else if (_right_alignment) 1105 return _right_alignment->left(); 1106 return -1; 1107 } 1108 right()1109 int right() const 1110 { 1111 if (_right_alignment && _left_alignment) 1112 { 1113 return max(_right_alignment->right(),_left_alignment->right()); 1114 } 1115 if (_right_alignment) 1116 return _right_alignment->right(); 1117 else if (_left_alignment) 1118 return _left_alignment->right(); 1119 return -1; 1120 } 1121 strand()1122 CuffStrand strand() const 1123 { 1124 CuffStrand left_strand = CUFF_STRAND_UNKNOWN; 1125 CuffStrand right_strand = CUFF_STRAND_UNKNOWN; 1126 if (_left_alignment) 1127 { 1128 left_strand = _left_alignment->source_strand(); 1129 } 1130 if (_right_alignment) 1131 { 1132 right_strand = _right_alignment->source_strand(); 1133 //assert ( s != CUFF_STRAND_UNKNOWN ? s == r : true); 1134 } 1135 assert (left_strand == right_strand || 1136 left_strand == CUFF_STRAND_UNKNOWN || 1137 right_strand == CUFF_STRAND_UNKNOWN); 1138 1139 return max(left_strand, right_strand); 1140 } 1141 1142 contains_splice()1143 bool contains_splice() const 1144 { 1145 if (_right_alignment) 1146 return (_left_alignment->contains_splice() || _right_alignment->contains_splice()); 1147 return (_left_alignment->contains_splice()); 1148 } 1149 insert_id()1150 InsertID insert_id() const 1151 { 1152 if (_left_alignment) return _left_alignment->insert_id(); 1153 if (_right_alignment) return _right_alignment->insert_id(); 1154 return 0; 1155 } 1156 ref_id()1157 RefID ref_id() const { return _refid; } 1158 genomic_inner_dist()1159 int genomic_inner_dist() const 1160 { 1161 if (_left_alignment && _right_alignment) 1162 { 1163 return _right_alignment->left() - _left_alignment->right(); 1164 } 1165 else 1166 { 1167 return -1; 1168 } 1169 return -1; 1170 } 1171 genomic_outer_span()1172 pair<int,int> genomic_outer_span() const 1173 { 1174 if (_left_alignment && _right_alignment) 1175 { 1176 return make_pair(left(), 1177 right() - 1); 1178 } 1179 1180 return make_pair(-1,-1); 1181 } 1182 genomic_inner_span()1183 pair<int,int> genomic_inner_span() const 1184 { 1185 if (_left_alignment && _right_alignment) 1186 { 1187 return make_pair(_left_alignment->right(), 1188 _right_alignment->left() - 1); 1189 } 1190 1191 return make_pair(-1,-1); 1192 } 1193 1194 // MRT is incorrect and not added to rg_props until after inspect_map 1195 // We are ignoring the mass reported by the ReadHits and re-calculating based on multi-hits mass()1196 double mass() const 1197 { 1198 double base_mass = 1.0; 1199 1200 if (is_multi()) 1201 { 1202 boost::shared_ptr<MultiReadTable> mrt = _rg_props->multi_read_table(); 1203 if (mrt) 1204 return mrt->get_mass(*this); 1205 else 1206 return base_mass/num_hits(); 1207 } 1208 return base_mass; 1209 } 1210 internal_scale_mass()1211 double internal_scale_mass() const 1212 { 1213 double m = mass(); 1214 m *= 1.0 / _rg_props->internal_scale_factor(); 1215 1216 return m; 1217 } 1218 edit_dist()1219 unsigned int edit_dist() const 1220 { 1221 unsigned int edits = 0; 1222 if (_left_alignment) 1223 edits += _left_alignment->edit_dist(); 1224 if (_right_alignment) 1225 edits += _right_alignment->edit_dist(); 1226 return edits; 1227 } 1228 collapse_mass()1229 double collapse_mass() const { return _collapse_mass; } collapse_mass(double m)1230 void collapse_mass(double m) { _collapse_mass = m; } incr_collapse_mass(double incr)1231 void incr_collapse_mass(double incr) { _collapse_mass += incr; } 1232 1233 private: 1234 1235 boost::shared_ptr<ReadGroupProperties const> _rg_props; 1236 RefID _refid; 1237 const ReadHit* _left_alignment; 1238 const ReadHit* _right_alignment; 1239 double _collapse_mass; 1240 bool _is_mapped; 1241 //bool _closed; 1242 }; 1243 1244 bool mate_hit_lt(const MateHit& lhs, const MateHit& rhs); 1245 1246 bool hits_eq_mod_id(const ReadHit& lhs, const ReadHit& rhs); 1247 1248 bool hits_eq_non_multi(const MateHit& lhs, const MateHit& rhs); 1249 bool hits_eq_non_multi_non_replicate(const MateHit& lhs, const MateHit& rhs); 1250 1251 bool hits_equals(const MateHit& lhs, const MateHit& rhs); 1252 1253 bool has_no_collapse_mass(const MateHit& hit); 1254 1255 // Assumes hits are sorted by mate_hit_lt 1256 void collapse_hits(const vector<MateHit>& hits, 1257 vector<MateHit>& non_redundant); 1258 1259 void normalize_counts(std::vector<boost::shared_ptr<ReadGroupProperties> > & all_read_groups); 1260 1261 HitFactory* createSamHitFactory(const string& hit_file_name, ReadTable& it, RefSequenceTable& rt); 1262 1263 #endif 1264