1 #ifndef BWT_MAP_H
2 #define BWT_MAP_H
3 
4 #ifdef HAVE_CONFIG_H
5 #include <config.h>
6 #endif
7 
8 #include <iostream>
9 #include <fstream>
10 #include <stdint.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string>
14 #include <cstring>
15 #include <map>
16 #include <vector>
17 #include <cassert>
18 
19 #include <boost/shared_ptr.hpp>
20 
21 #ifdef HAVE_HTSLIB
22 #include <htslib/hts.h>
23 #include <htslib/hfile.h>
24 #include <htslib/cram.h>
25 #include <htslib/bgzf.h>
26 #include <htslib/sam.h>
27 #else
28 #include <bam/sam.h>
29 #endif
30 
31 #include "common.h"
32 #include "multireads.h"
33 
34 using namespace std;
35 
36 /*
37  *  hits.h
38  *  Cufflinks
39  *
40  *  Created by Cole Trapnell on 3/23/09.
41  *  Copyright 2009 Cole Trapnell. All rights reserved.
42  *
43  */
44 
45 enum CuffStrand { CUFF_STRAND_UNKNOWN = 0, CUFF_FWD = 1, CUFF_REV = 2, CUFF_BOTH = 3 };
46 
47 
48 enum CigarOpCode
49 {
50 	MATCH = BAM_CMATCH,
51 	INS = BAM_CINS,
52 	DEL = BAM_CDEL,
53 	REF_SKIP = BAM_CREF_SKIP,
54 	SOFT_CLIP = BAM_CSOFT_CLIP,
55 	HARD_CLIP = BAM_CHARD_CLIP,
56 	PAD = BAM_CPAD
57 };
58 
59 struct CigarOp
60 {
CigarOpCigarOp61 	CigarOp(CigarOpCode o, uint32_t l) : opcode(o), length(l) {}
62 	CigarOpCode opcode : 3;
63 	uint32_t length : 29;
64 
65 	bool operator==(const CigarOp& rhs) const { return opcode == rhs.opcode && length == rhs.length; }
66 
67 };
68 
69 typedef uint64_t InsertID;
70 typedef uint64_t RefID;
71 
72 extern int num_deleted;
73 
74 /*  Stores the information from a single record of the bowtie map. A given read
75     may have many of these.
76 */
77 struct ReadHit
78 {
ReadHitReadHit79 	ReadHit() :
80 		_ref_id(0),
81 		_insert_id(0),
82         _base_mass(1.0),
83         _edit_dist(0xFFFFFFFF),
84 		_num_hits(1)
85     {
86         num_deleted++;
87     }
88 
ReadHitReadHit89 	ReadHit(RefID ref_id,
90 			InsertID insert_id,
91 			int left,
92 			int read_len,
93 			CuffStrand source_strand,
94 			RefID partner_ref,
95 			int partner_pos,
96 			unsigned int edit_dist,
97 			int num_hits,
98             float base_mass,
99             uint32_t sam_flag) :
100 		_ref_id(ref_id),
101 		_insert_id(insert_id),
102 		_left(left),
103 		_partner_ref_id(partner_ref),
104 		_partner_pos(partner_pos),
105 		_cigar(vector<CigarOp>(1,CigarOp(MATCH,read_len))),
106 		_source_strand(source_strand),
107         _base_mass(base_mass),
108         _edit_dist(edit_dist),
109 		_num_hits(num_hits),
110         _sam_flag(sam_flag)
111 	{
112 		assert(_cigar.capacity() == _cigar.size());
113 		_right = get_right();
114         num_deleted++;
115 	}
116 
ReadHitReadHit117 	ReadHit(RefID ref_id,
118 			InsertID insert_id,
119 			int left,
120 			const vector<CigarOp>& cigar,
121 			CuffStrand source_strand,
122 			RefID partner_ref,
123 			int partner_pos,
124 			unsigned int  edit_dist,
125 			int num_hits,
126             float base_mass,
127             uint32_t sam_flag) :
128 		_ref_id(ref_id),
129 		_insert_id(insert_id),
130 		_left(left),
131 		_partner_ref_id(partner_ref),
132 		_partner_pos(partner_pos),
133 		_cigar(cigar),
134 		_source_strand(source_strand),
135         _base_mass(base_mass),
136         _edit_dist(edit_dist),
137 		_num_hits(num_hits),
138         _sam_flag(sam_flag)
139 	{
140 		assert(_cigar.capacity() == _cigar.size());
141 		_right = get_right();
142         num_deleted++;
143 	}
144 
ReadHitReadHit145     ReadHit(const ReadHit& other)
146     {
147         _ref_id = other._ref_id;
148 		_insert_id = other._insert_id;
149 		_left = other._left;
150 		_partner_ref_id = other._partner_ref_id;
151 		_partner_pos = other._partner_pos;
152 		_cigar = other._cigar;
153 		_source_strand = other._source_strand;
154 		_num_hits = other._num_hits;
155         _base_mass = other._base_mass;
156         _edit_dist = other._edit_dist;
157         _right = get_right();
158         _sam_flag = other._sam_flag;
159         num_deleted++;
160     }
161 
~ReadHitReadHit162     ~ReadHit()
163     {
164         --num_deleted;
165     }
166 
read_lenReadHit167 	int read_len() const
168 	{
169 		int len = 0;
170 		for (size_t i = 0; i < _cigar.size(); ++i)
171 		{
172 			const CigarOp& op = _cigar[i];
173 			switch(op.opcode)
174 			{
175 				case MATCH:
176 				case INS:
177 				case SOFT_CLIP:
178 					len += op.length;
179 					break;
180 				default:
181 					break;
182 			}
183 		}
184 
185 		return len;
186 	}
187 
contains_spliceReadHit188 	bool contains_splice() const
189 	{
190 		for (size_t i = 0; i < _cigar.size(); ++i)
191 		{
192 				if (_cigar[i].opcode == REF_SKIP)
193 					return true;
194 		}
195 		return false;
196 	}
197 
is_singletonReadHit198 	bool is_singleton() const
199 	{
200 		return (partner_ref_id() == 0 ||
201 				partner_ref_id() != ref_id() ||
202 				abs(partner_pos() - left()) > max_partner_dist);
203 	}
204 
205 	bool operator==(const ReadHit& rhs) const
206 	{
207 	    return (_insert_id == rhs._insert_id &&
208 	            _ref_id == rhs._ref_id &&
209 	            antisense_align() == rhs.antisense_align() &&
210 	            _left == rhs._left &&
211 	            _source_strand == rhs._source_strand &&
212 	            /* DO NOT USE ACCEPTED IN COMPARISON */
213 	            _cigar == rhs._cigar);
214     }
215 
ref_idReadHit216 	RefID ref_id() const				{ return _ref_id;			}
insert_idReadHit217 	InsertID insert_id() const			{ return _insert_id;		}
218 
partner_ref_idReadHit219 	RefID partner_ref_id() const		{ return _partner_ref_id;	}
partner_posReadHit220 	int	  partner_pos()	const			{ return _partner_pos;		}
221 
leftReadHit222 	int left() const					{ return _left;				}
rightReadHit223 	int right() const					{ return _right;			}
source_strandReadHit224 	CuffStrand source_strand()	const	{ return _source_strand; }
antisense_alignReadHit225 	bool antisense_align() const		{ return _sam_flag & 0x10;	}
is_firstReadHit226     bool is_first() const               { return _sam_flag & 0x40;  }
227 
228 	// Number of multi-hits for this read
num_hitsReadHit229 	int num_hits() const				{ return _num_hits;			}
230 
231     // We are ignoring the _base_mass and re-calculating based on multi-hits
massReadHit232 	double mass() const
233 	{
234 		if (is_singleton())
235 			return 1.0/_num_hits;
236 		return 0.5 / _num_hits;
237 	}
238 
239 	// For convenience, if you just want a copy of the gap intervals
240 	// for this hit.
gapsReadHit241 	void gaps(vector<pair<int,int> >& gaps_out) const
242 	{
243 		gaps_out.clear();
244 		int pos = _left;
245 		for (size_t i = 0; i < _cigar.size(); ++i)
246 		{
247 			const CigarOp& op = _cigar[i];
248 
249 			switch(op.opcode)
250 			{
251 				case REF_SKIP:
252 					gaps_out.push_back(make_pair(pos, pos + op.length - 1));
253 					pos += op.length;
254 					break;
255                 case SOFT_CLIP:
256                     pos += op.length;
257                     break;
258                 case HARD_CLIP:
259                     break;
260 				case MATCH:
261 					pos += op.length;
262 					break;
263                 case INS:
264                     pos -= op.length;
265 					break;
266                 case DEL:
267                     pos += op.length;
268 					break;
269 				default:
270 					break;
271 			}
272 		}
273 	}
274 
cigarReadHit275 	const vector<CigarOp>& cigar() const { return _cigar; }
276 
contiguousReadHit277 	bool contiguous() const
278 	{
279 		return _cigar.size() == 1 && _cigar[0].opcode == MATCH;
280 	}
281 
edit_distReadHit282     unsigned int  edit_dist() const { return _edit_dist; }
283 
284     void trim(int trimmed_length);
285 
286 	//const string& hitfile_rec() const { return _hitfile_rec; }
287 	//void hitfile_rec(const string& rec) { _hitfile_rec = rec; }
288 
289 private:
290 
get_rightReadHit291 	int get_right() const
292 	{
293 		int r = _left;
294 		for (size_t i = 0; i < _cigar.size(); ++i)
295 		{
296 			const CigarOp& op = _cigar[i];
297 
298 			switch(op.opcode)
299 			{
300 				case MATCH:
301 				case REF_SKIP:
302                 case SOFT_CLIP:
303 				case DEL:
304 					r += op.length;
305 					break;
306                 case INS:
307                 case HARD_CLIP:
308 				default:
309 					break;
310 			}
311 		}
312 		return r;
313 	}
314 
315 	RefID _ref_id;
316 	InsertID _insert_id;   // Id of the sequencing insert
317 	int _left;        // Position in the reference of the left side of the alignment
318 	int _right;
319 
320 	RefID _partner_ref_id;  // Reference contig on which we expect the mate
321 	int _partner_pos;     // Position at which we expect the mate of this hit
322 
323 
324 	vector<CigarOp> _cigar;
325 
326 	CuffStrand _source_strand;    // Which strand the read really came from, if known
327     float _base_mass;
328     unsigned int  _edit_dist;            // Number of mismatches
329 	int _num_hits; // Number of multi-hits (1 by default)
330     uint32_t _sam_flag;
331 	//string _hitfile_rec; // Points to the buffer for the record from which this hit came
332 };
333 
334 class ReadTable
335 {
336 public:
337 
ReadTable()338 	ReadTable() {}
339 
340 	// This function should NEVER return zero
get_id(const string & name)341 	InsertID get_id(const string& name)
342 	{
343 		uint64_t _id = hash_string(name.c_str());
344 		assert(_id);
345 		return _id;
346 	}
347 
348     // Calculate checksum
get_cs(const string & name)349 	InsertID get_cs(const string& name)
350 	{
351         return string_checksum(name.c_str());
352     }
353 
354 private:
355 
356 	// This is FNV-1, see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
hash_string(const char * __s)357 	inline uint64_t hash_string(const char* __s)
358 	{
359 		uint64_t hash = 0xcbf29ce484222325ull;
360 		for ( ; *__s; ++__s)
361 		{
362 			hash *= 1099511628211ull;
363 			hash ^= *__s;
364 		}
365 		return hash;
366 	}
367 
368 
string_checksum(const char * s)369     inline uint64_t string_checksum(const char * s)
370     {
371         uint64_t c = 0;
372         for ( ; *s; ++s)
373         {
374             c += *s;
375         }
376         return c;
377     }
378 };
379 
380 class RefSequenceTable
381 {
382 public:
383 
384 	typedef std::string Sequence;
385 
386 	struct SequenceInfo
387 	{
SequenceInfoSequenceInfo388 		SequenceInfo(uint32_t _order,
389 					 char* _name,
390 					 Sequence* _seq) :
391 		observation_order(_order),
392 		name(_name),
393 		seq(_seq) {}
394 		uint32_t observation_order;
395 		char* name;
396 		Sequence* seq;
397 	};
398 
399 	typedef map<string, RefID> IDTable;
400 	typedef map<RefID, SequenceInfo> InvertedIDTable;
401 	typedef InvertedIDTable::iterator iterator;
402 	typedef InvertedIDTable::const_iterator const_iterator;
403 
404 	RefSequenceTable(bool keep_names, bool keep_seqs = false) :
405 	_next_id(1),
406 	_keep_names(keep_names) {}
407 
~RefSequenceTable()408 	~RefSequenceTable()
409 	{
410 		for (InvertedIDTable::iterator itr = _by_id.begin();
411 			 itr != _by_id.end();
412 			 ++itr)
413 		{
414 			free(itr->second.name);
415 		}
416 	}
417 
get_id(const string & name,Sequence * seq)418 	RefID get_id(const string& name,
419 				 Sequence* seq)
420 	{
421 		if (name.empty())
422 			return 0;
423 #if ENABLE_THREADS
424         table_lock.lock();
425 #endif
426 		uint64_t _id = hash_string(name.c_str());
427 		pair<InvertedIDTable::iterator, bool> ret =
428 		_by_id.insert(make_pair(_id, SequenceInfo(_next_id, NULL, NULL)));
429 		if (ret.second == true)
430 		{
431 			char* _name = NULL;
432 			if (_keep_names)
433 				_name = strdup(name.c_str());
434 			ret.first->second.name = _name;
435 			ret.first->second.seq	= seq;
436 			++_next_id;
437 		}
438 		assert (_id);
439 #if ENABLE_THREADS
440         table_lock.unlock();
441 #endif
442 		return _id;
443 	}
444 
445 	// You must call invert() before using this function
get_name(RefID ID)446 	const char* get_name(RefID ID) const
447 	{
448 		InvertedIDTable::const_iterator itr = _by_id.find(ID);
449 		if (itr != _by_id.end())
450         {
451             //const SequenceInfo& info = itr->second;
452 			return itr->second.name;
453         }
454 		else
455         {
456 			return NULL;
457         }
458 	}
459 
get_seq(RefID ID)460 	Sequence* get_seq(RefID ID) const
461 	{
462 		InvertedIDTable::const_iterator itr = _by_id.find(ID);
463 		if (itr != _by_id.end())
464 			return itr->second.seq;
465 		else
466 			return NULL;
467 	}
468 
get_info(RefID ID)469 	const SequenceInfo* get_info(RefID ID) const
470 	{
471 
472 		InvertedIDTable::const_iterator itr = _by_id.find(ID);
473 		if (itr != _by_id.end())
474 		{
475 			return &(itr->second);
476 		}
477 		else
478 			return NULL;
479 	}
480 
observation_order(RefID ID)481 	int observation_order(RefID ID) const
482 	{
483 		InvertedIDTable::const_iterator itr = _by_id.find(ID);
484 		if (itr != _by_id.end())
485 		{
486 			return itr->second.observation_order;
487 		}
488 		else
489 			return -1;
490 	}
491 
order_recs_lexicographically()492     void order_recs_lexicographically()
493     {
494         map<string, RefID> str_to_id;
495 
496         for (InvertedIDTable::iterator i = _by_id.begin(); i != _by_id.end(); ++i)
497         {
498             str_to_id[i->second.name] = i->first;
499             //fprintf(stderr, "%d: %s\n", i->second.observation_order, i->second.name);
500         }
501 
502         size_t new_order = 1;
503         for (map<string, RefID>::iterator i = str_to_id.begin(); i != str_to_id.end(); ++i, ++new_order)
504         {
505             _by_id.find(get_id(i->first, NULL))->second.observation_order = new_order;
506             verbose_msg( "%lu: %s\n", new_order, i->first.c_str());
507         }
508     }
509 
print_rec_ordering()510     void print_rec_ordering()
511     {
512         for (InvertedIDTable::iterator i = _by_id.begin(); i != _by_id.end(); ++i)
513         {
514             verbose_msg( "%lu: %s\n", i->second.observation_order, i->second.name);
515         }
516     }
517 
begin()518 	iterator begin() { return _by_id.begin(); }
end()519 	iterator end() { return _by_id.end(); }
520 
begin()521 	const_iterator begin() const { return _by_id.begin(); }
end()522 	const_iterator end() const { return _by_id.end(); }
523 
size()524 	size_t size() const { return _by_id.size(); }
525 
clear()526 	void clear()
527 	{
528 		//_by_name.clear();
529 		_by_id.clear();
530 	}
531 
532 private:
533 
534 	// This is FNV-1, see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
hash_string(const char * __s)535 	inline uint64_t hash_string(const char* __s)
536 	{
537 		uint64_t hash = 0xcbf29ce484222325ull;
538 		for ( ; *__s; ++__s)
539 		{
540 			hash *= 1099511628211ull;
541 			hash ^= *__s;
542 		}
543 		return hash;
544 	}
545 
546 	//IDTable _by_name;
547 	RefID _next_id;
548 	bool _keep_names;
549 	InvertedIDTable _by_id;
550 #if ENABLE_THREADS
551     static boost::mutex table_lock;
552 #endif
553 };
554 
555 
556 bool hit_insert_id_lt(const ReadHit& h1, const ReadHit& h2);
557 
558 /******************************************************************************
559  The HitFactory abstract class is responsible for returning a single ReadHit
560  from an alignment file.
561 *******************************************************************************/
562 class HitFactory
563 {
564 public:
565 
HitFactory(ReadTable & insert_table,RefSequenceTable & reference_table)566 	HitFactory(ReadTable& insert_table,
567 			   RefSequenceTable& reference_table) :
568 		_insert_table(insert_table),
569 		_ref_table(reference_table),
570 		_num_seq_header_recs(0),
571 		_undone_hit_present(false) {}
572 
573 	HitFactory& operator=(const HitFactory& rhs)
574 	{
575 		if (this != &rhs)
576 		{
577 			//_hit_file = rhs._hit_file;
578 			_insert_table = rhs._insert_table;
579 			_ref_table = rhs._ref_table;
580 		}
581 		return *this;
582 	}
~HitFactory()583 	virtual ~HitFactory() {}
584 
585 	ReadHit create_hit(const string& insert_name,
586 					   const string& ref_name,
587 					   int left,
588 					   const vector<CigarOp>& cigar,
589 					   CuffStrand source_strand,
590 					   const string& partner_ref,
591 					   int partner_pos,
592 					   unsigned int  edit_dist,
593 					   int num_hits,
594                        float base_mass,
595                        uint32_t sam_flag);
596 
597 	ReadHit create_hit(const string& insert_name,
598 					   const string& ref_name,
599 					   uint32_t left,
600 					   uint32_t read_len,
601 					   CuffStrand source_strand,
602 					   const string& partner_ref,
603 					   int partner_pos,
604 					   unsigned int  edit_dist,
605 					   int num_hits,
606                        float base_mass,
607                        uint32_t sam_flag);
608 
609 	virtual void reset() = 0;
610 
undo_hit()611 	void undo_hit()
612 	{
613 		if(_undone_hit_present)
614 			throw "undo_hit can only back up one hit";
615 		_undone_hit_present=true;
616 	}
617 
next_hit(ReadHit & result)618 	bool next_hit(ReadHit &result)
619 	{
620 		if(_undone_hit_present)
621 		{
622 			_undone_hit_present=false;
623 		}
624 		else
625 		{
626 			if(!read_next_hit(_last_hit))
627 				return false;
628 		}
629 		result=_last_hit;
630 		return true;
631 	}
632 
633 	virtual bool records_remain() const = 0;
634 
ref_table()635 	RefSequenceTable& ref_table() { return _ref_table; }
636 
637 	//FILE* hit_file() { return _hit_file; }
638 
639     virtual bool inspect_header() = 0;
640 
read_group_properties()641     const ReadGroupProperties& read_group_properties()
642     {
643         return _rg_props;
644     }
645 
646 protected:
647 
648 	virtual bool read_next_hit(ReadHit &result) = 0;
649 
650     bool parse_header_string(const string& header_rec,
651                              ReadGroupProperties& rg_props);
652 
653 	void parse_header_lines(const string& headers,
654 							ReadGroupProperties& rg_props);
655 
656     void finalize_rg_props();
657 
658     // TODO: We want to keep a collection of these, indexed by RG ID.  See #180
659     ReadGroupProperties _rg_props;
660 
661 private:
662 
663 
664 	ReadTable& _insert_table;
665 	RefSequenceTable& _ref_table;
666     uint32_t _num_seq_header_recs;
667 	bool _undone_hit_present;
668 	ReadHit _last_hit;
669 };
670 
671 /******************************************************************************
672  SAMHitFactory turns SAM alignments into ReadHits
673 *******************************************************************************/
674 class SAMHitFactory : public HitFactory
675 {
676 public:
SAMHitFactory(const string & hit_file_name,ReadTable & insert_table,RefSequenceTable & reference_table)677 	SAMHitFactory(const string& hit_file_name,
678 				  ReadTable& insert_table,
679 				  RefSequenceTable& reference_table) :
680 		HitFactory(insert_table, reference_table),
681 		_line_num(0)
682 	{
683 		_hit_file = fopen(hit_file_name.c_str(), "r");
684 		if (_hit_file == NULL)
685 		{
686 			throw std::runtime_error("Error: could not open file for reading");
687 		}
688 
689         if (inspect_header() == false)
690         {
691             throw std::runtime_error("Error: could not parse SAM header");
692         }
693 
694         // Override header-inferred read group properities with whatever
695         // the user supplied.
696         if (global_read_properties != NULL)
697         {
698             _rg_props = *global_read_properties;
699         }
700 	}
701 
~SAMHitFactory()702 	~SAMHitFactory()
703 	{
704 		if (_hit_file)
705 		{
706 			fclose(_hit_file);
707 		}
708 	}
709 
reset()710 	void reset() { rewind(_hit_file); }
711 
records_remain()712 	bool records_remain() const { return !feof(_hit_file); }
713 
714 	bool read_next_hit(ReadHit& buf);
715 
716     bool inspect_header();
717 
718 private:
719 	static const size_t _hit_buf_max_sz = 10 * 1024;
720 	char _hit_buf[_hit_buf_max_sz];
721 	int _line_num;
722 
723 	FILE* _hit_file;
724 };
725 
726 class BAMHitFactory : public HitFactory
727 {
728 public:
BAMHitFactory(ReadTable & insert_table,RefSequenceTable & reference_table)729 	BAMHitFactory(ReadTable& insert_table,
730 				  RefSequenceTable& reference_table) :
731 	HitFactory(insert_table, reference_table)
732 	{}
733 
734 	bool get_hit_from_bam1t(const bam1_t* hit_buf,
735 							const bam_hdr_t* header,
736 							ReadHit& bh);
737 
records_remain()738 	bool records_remain() const
739 	{
740 		return !_eof_encountered;
741 	}
742 
743  protected:
744 	int64_t _beginning;
745     bool _eof_encountered;
746 };
747 
748 #ifndef HAVE_HTSLIB
749 
750 /******************************************************************************
751  SamtoolsHitFactory turns SAM alignments into ReadHits
752  *******************************************************************************/
753 class SamtoolsHitFactory : public BAMHitFactory
754 {
755 public:
SamtoolsHitFactory(const string & hit_file_name,ReadTable & insert_table,RefSequenceTable & reference_table)756     SamtoolsHitFactory(const string& hit_file_name,
757 				  ReadTable& insert_table,
758 				  RefSequenceTable& reference_table) :
759 	BAMHitFactory(insert_table, reference_table)
760 	{
761 		_hit_file = samopen(hit_file_name.c_str(), "rb", 0);
762 
763         memset(&_next_hit, 0, sizeof(_next_hit));
764 
765 		if (_hit_file == NULL || _hit_file->header == NULL)
766 		{
767 			throw std::runtime_error("Fail to open BAM file");
768 		}
769 
770 		_beginning = bgzf_tell(_hit_file->x.bam);
771         _eof_encountered = false;
772 
773         if (inspect_header() == false)
774         {
775             throw std::runtime_error("Error: could not parse BAM header");
776         }
777 
778         // Override header-inferred read group properities with whatever
779         // the user supplied.
780         if (global_read_properties != NULL)
781         {
782             _rg_props = *global_read_properties;
783         }
784 	}
785 
~SamtoolsHitFactory()786 	~SamtoolsHitFactory()
787 	{
788 		if (_hit_file)
789 		{
790 			samclose(_hit_file);
791 		}
792 	}
793 
reset()794 	void reset()
795 	{
796 		if (_hit_file && _hit_file->x.bam)
797 		{
798 			bgzf_seek(_hit_file->x.bam, _beginning, SEEK_SET);
799             _eof_encountered = false;
800 		}
801 	}
802 
803 
804 	bool read_next_hit(ReadHit& bh);
805 
806     bool inspect_header();
807 
808 private:
809 	samfile_t* _hit_file;
810 	bam1_t _next_hit;
811 };
812 
813 #else
814 
815 /******************************************************************************
816  HTSHitFactory uses htslib to turn SAM/BAM/CRAM alignments into ReadHits
817  *******************************************************************************/
818 
819 class HTSHitFactory : public BAMHitFactory
820 {
821 public:
HTSHitFactory(const string & hit_file_name,ReadTable & insert_table,RefSequenceTable & reference_table)822     HTSHitFactory(const string& hit_file_name,
823 				  ReadTable& insert_table,
824 				  RefSequenceTable& reference_table) :
825 	BAMHitFactory(insert_table, reference_table),
826 	_hit_file_name(hit_file_name)
827 	{
828 		_hit_file = hts_open(hit_file_name.c_str(), "r");
829 		if(!_hit_file)
830 			throw std::runtime_error("Failed to open SAM/BAM/CRAM file " + hit_file_name);
831 		_file_header = sam_hdr_read(_hit_file);
832 		if(!_file_header)
833 		{
834 			hts_close(_hit_file);
835 			throw std::runtime_error("Failed to read header from " + hit_file_name);
836 		}
837 		_next_hit = bam_init1();
838 		if(!_next_hit)
839 			throw std::runtime_error("Failed to allocate SAM record");
840 		_eof_encountered = false;
841 
842 		if (inspect_header() == false)
843         {
844             throw std::runtime_error("Error: could not parse BAM header");
845         }
846 
847         // Override header-inferred read group properities with whatever
848         // the user supplied.
849         if (global_read_properties != NULL)
850         {
851             _rg_props = *global_read_properties;
852         }
853 	}
854 
~HTSHitFactory()855 	~HTSHitFactory()
856 	{
857 		if(_next_hit)
858 			bam_destroy1(_next_hit);
859 		if(_file_header)
860 			bam_hdr_destroy(_file_header);
861 		if(_hit_file)
862 			hts_close(_hit_file);
863 	}
864 
reset()865 	void reset()
866 	{
867 		if(_hit_file)
868 		{
869 			hts_close(_hit_file);
870 			_hit_file = hts_open(_hit_file_name.c_str(), "r");
871 			if(!_hit_file)
872 				throw std::runtime_error("Failed to reopen SAM/BAM/CRAM file " + _hit_file_name);
873 			bam_hdr_t* reread_header = sam_hdr_read(_hit_file);
874 			if(!reread_header)
875 				throw std::runtime_error("Failed to re-read header from " + _hit_file_name);
876 			// Keep the old header:
877 			bam_hdr_destroy(reread_header);
878 			_eof_encountered = false;
879 		}
880 	}
881 
882 	bool read_next_hit(ReadHit& buf);
883 
884 	bool inspect_header();
885 
886  private:
887 	htsFile* _hit_file;
888 	bam_hdr_t* _file_header;
889 	bam1_t* _next_hit;
890 	std::string _hit_file_name;
891 };
892 
893 #endif
894 
895 class AbundanceGroup;
896 
897 /******************************************************************************
898  BAMHitFactory turns SAM alignments into ReadHits
899  *******************************************************************************/
900 class PrecomputedExpressionHitFactory : public HitFactory
901 {
902 public:
PrecomputedExpressionHitFactory(const string & expression_file_name,ReadTable & insert_table,RefSequenceTable & reference_table)903     PrecomputedExpressionHitFactory(const string& expression_file_name,
904                                     ReadTable& insert_table,
905                                     RefSequenceTable& reference_table) :
906     HitFactory(insert_table, reference_table), _expression_file_name(expression_file_name), _ifs(expression_file_name.c_str()),
907     _ia(boost::shared_ptr<boost::archive::binary_iarchive>(new boost::archive::binary_iarchive(_ifs)))
908     {
909         load_count_tables(expression_file_name);
910 
911         if (inspect_header() == false)
912         {
913             throw std::runtime_error("Error: could not parse CXB header");
914         }
915 
916         // Override header-inferred read group properities with whatever
917         // the user supplied.
918         if (global_read_properties != NULL)
919         {
920             _rg_props = *global_read_properties;
921         }
922 
923         load_checked_parameters(expression_file_name);
924 
925         //map<string, AbundanceGroup> single_sample_tracking;
926 
927         _num_loci = 0;
928         *_ia >> _num_loci;
929 
930         _curr_locus_idx = 0;
931         _last_locus_id = -1;
932     }
933 
~PrecomputedExpressionHitFactory()934     ~PrecomputedExpressionHitFactory()
935     {
936 
937     }
938 
records_remain()939     bool records_remain() const
940     {
941         return false;
942     }
943 
reset()944     void reset()
945     {
946         _ifs.clear() ;
947         _ifs.seekg(0, ios::beg);
948         _ia = boost::shared_ptr<boost::archive::binary_iarchive>(new boost::archive::binary_iarchive(_ifs));
949         size_t num_loci = 0;
950         *_ia >> num_loci;
951         _last_locus_id = -1;
952         _curr_locus_idx = 0;
953         _curr_ab_groups.clear();
954     }
955 
956 	bool read_next_hit(ReadHit& buf);
957 
958     bool inspect_header();
959 
960     boost::shared_ptr<const AbundanceGroup> next_locus(int locus_id, bool cache_locus);
961 
962     boost::shared_ptr<const AbundanceGroup> get_abundance_for_locus(int locus_id);
963     void clear_abundance_for_locus(int locus_id);
964 
get_compat_mass(int locus_id)965     double get_compat_mass(int locus_id)
966     {
967        map<int, double >::iterator i = compat_mass.find(locus_id);
968        if (i != compat_mass.end())
969        {
970            return i->second;
971        }
972        else
973        {
974            return 0;
975        }
976     }
977 
get_total_mass(int locus_id)978     double get_total_mass(int locus_id)
979     {
980         map<int, double >::iterator i = total_mass.find(locus_id);
981         if (i != total_mass.end())
982         {
983             return i->second;
984         }
985         else
986         {
987             return 0;
988         }
989     }
990 
991 
992 private:
993 
994     void load_count_tables(const string& expression_file_name);
995     void load_checked_parameters(const string& expression_file_name);
996 
997     //map<int, boost::shared_ptr<const AbundanceGroup> > ab_group_table;
998     size_t _num_loci;
999     size_t _curr_locus_idx;
1000     int _last_locus_id;
1001     std::ifstream _ifs;
1002     string _expression_file_name;
1003     boost::shared_ptr<boost::archive::binary_iarchive> _ia;
1004     map<int, double> compat_mass;
1005     map<int, double> total_mass;
1006     map<int, boost::shared_ptr<const AbundanceGroup> > _curr_ab_groups;
1007 
1008     boost::shared_ptr<ReadGroupProperties> _cached_rg_props;
1009 
1010 #if ENABLE_THREADS
1011     boost::mutex _factory_lock;
1012 #endif
1013 };
1014 
1015 
1016 // Forward declaration of BundleFactory, because MateHit will need a pointer
1017 // back to the Factory that created.  Ultimately, we should replace this
1018 // with a pointer back to the ReadGroupProperty object corresponding to each
1019 // MateHit.  That, however, requires that we link fragment length distributions
1020 // and bias models, etc, with each read group, and that's more than we can
1021 // afford to implement right now.
1022 
1023 /*******************************************************************************
1024  MateHit is a class that encapsulates a paired-end alignment as a single object.
1025  MateHits can be "open" when one hit has been read from a stream of individual
1026  read alignments, but the other hasn't.  A "closed" MateHit is one where either
1027  both read alignments have been installed in the MateHit, or one read hit has,
1028  but the other will never come (i.e. singletons)
1029 *******************************************************************************/
1030 class MateHit
1031 {
1032 public:
MateHit()1033     MateHit() :
1034     _refid(0),
1035     _left_alignment(NULL),
1036     _right_alignment(NULL),
1037     _collapse_mass(0.0),
1038     _is_mapped(false){}
1039 
MateHit(boost::shared_ptr<ReadGroupProperties const> rg_props,RefID refid,const ReadHit * left_alignment,const ReadHit * right_alignment)1040 	MateHit(boost::shared_ptr<ReadGroupProperties const> rg_props,
1041             RefID refid,
1042 			const ReadHit* left_alignment,
1043 			const ReadHit* right_alignment) :
1044     _rg_props(rg_props),
1045 	_refid(refid),
1046 	_left_alignment(left_alignment),
1047 	_right_alignment(right_alignment),
1048 	_collapse_mass(0.0),
1049 	_is_mapped(false)
1050 	{
1051 		//_expected_inner_dist = min(genomic_inner_dist(), _expected_inner_dist);
1052 	}
~MateHit()1053 	~MateHit()
1054 	{
1055 		//fprintf(stderr, "Killing hit %lx\n",this);
1056 	}
1057 
1058 	//bool closed() {return _closed;}
1059 
read_group_props()1060     boost::shared_ptr<ReadGroupProperties const> read_group_props() const { return _rg_props; }
1061 
left_alignment()1062 	const ReadHit* left_alignment() const {return _left_alignment;}
left_alignment(const ReadHit * left_alignment)1063 	void left_alignment(const ReadHit* left_alignment)
1064 	{
1065 		_left_alignment = left_alignment;
1066 	}
1067 
right_alignment()1068 	const ReadHit* right_alignment() const {return _right_alignment;}
right_alignment(const ReadHit * right_alignment)1069 	void right_alignment(const ReadHit* right_alignment)
1070 	{
1071 		_right_alignment = right_alignment;
1072 	}
1073 
is_mapped()1074 	bool is_mapped() const {return _is_mapped;}
is_mapped(bool mapped)1075 	void is_mapped(bool mapped)
1076 	{
1077 		_is_mapped = mapped;
1078 	}
1079 
num_hits()1080 	int num_hits() const
1081 	{
1082 		assert(_left_alignment);
1083 		return _left_alignment->num_hits();
1084 	}
1085 
is_multi()1086 	bool is_multi() const
1087 	{
1088 		return num_hits() > 1;
1089 	}
1090 
is_pair()1091 	bool is_pair() const
1092 	{
1093 		return (_left_alignment && _right_alignment);
1094 	}
1095 
left()1096 	int left() const
1097 	{
1098 		if (_right_alignment && _left_alignment)
1099 		{
1100 			return min(_right_alignment->left(),_left_alignment->left());
1101 		}
1102 		if (_left_alignment)
1103 			return _left_alignment->left();
1104 		else if (_right_alignment)
1105 			return _right_alignment->left();
1106 		return -1;
1107 	}
1108 
right()1109 	int right() const
1110 	{
1111 		if (_right_alignment && _left_alignment)
1112 		{
1113 			return max(_right_alignment->right(),_left_alignment->right());
1114 		}
1115 		if (_right_alignment)
1116 			return _right_alignment->right();
1117 		else if (_left_alignment)
1118 			return _left_alignment->right();
1119 		return -1;
1120 	}
1121 
strand()1122 	CuffStrand strand() const
1123 	{
1124 		CuffStrand left_strand = CUFF_STRAND_UNKNOWN;
1125 		CuffStrand right_strand = CUFF_STRAND_UNKNOWN;
1126 		if (_left_alignment)
1127 		{
1128 			left_strand = _left_alignment->source_strand();
1129 		}
1130 		if (_right_alignment)
1131 		{
1132 			right_strand = _right_alignment->source_strand();
1133 			//assert ( s != CUFF_STRAND_UNKNOWN ? s == r : true);
1134 		}
1135 		assert (left_strand == right_strand ||
1136 				left_strand == CUFF_STRAND_UNKNOWN ||
1137 				right_strand == CUFF_STRAND_UNKNOWN);
1138 
1139 		return max(left_strand, right_strand);
1140 	}
1141 
1142 
contains_splice()1143 	bool contains_splice() const
1144 	{
1145 		if (_right_alignment)
1146 			return (_left_alignment->contains_splice() || _right_alignment->contains_splice());
1147 		return (_left_alignment->contains_splice());
1148 	}
1149 
insert_id()1150 	InsertID insert_id() const
1151 	{
1152 		if (_left_alignment) return _left_alignment->insert_id();
1153 		if (_right_alignment) return _right_alignment->insert_id();
1154 		return 0;
1155 	}
1156 
ref_id()1157 	RefID ref_id() const { return _refid; }
1158 
genomic_inner_dist()1159 	int genomic_inner_dist() const
1160 	{
1161 		if (_left_alignment && _right_alignment)
1162 		{
1163 			return _right_alignment->left() - _left_alignment->right();
1164 		}
1165 		else
1166 		{
1167 			return -1;
1168 		}
1169 		return -1;
1170 	}
1171 
genomic_outer_span()1172 	pair<int,int> genomic_outer_span() const
1173 	{
1174 		if (_left_alignment && _right_alignment)
1175 		{
1176 			return make_pair(left(),
1177 							 right() - 1);
1178 		}
1179 
1180 		return make_pair(-1,-1);
1181 	}
1182 
genomic_inner_span()1183 	pair<int,int> genomic_inner_span() const
1184 	{
1185 		if (_left_alignment && _right_alignment)
1186 		{
1187 			return make_pair(_left_alignment->right(),
1188 							 _right_alignment->left() - 1);
1189 		}
1190 
1191 		return make_pair(-1,-1);
1192 	}
1193 
1194 	// MRT is incorrect and not added to rg_props until after inspect_map
1195     // We are ignoring the mass reported by the ReadHits and re-calculating based on multi-hits
mass()1196 	double mass() const
1197 	{
1198         double base_mass = 1.0;
1199 
1200         if (is_multi())
1201 		{
1202 			boost::shared_ptr<MultiReadTable> mrt = _rg_props->multi_read_table();
1203 			if (mrt)
1204 				return mrt->get_mass(*this);
1205 			else
1206 				return base_mass/num_hits();
1207 		}
1208 		return base_mass;
1209 	}
1210 
internal_scale_mass()1211 	double internal_scale_mass() const
1212 	{
1213        	double m = mass();
1214         m *= 1.0 / _rg_props->internal_scale_factor();
1215 
1216         return m;
1217 	}
1218 
edit_dist()1219 	unsigned int  edit_dist() const
1220 	{
1221 		unsigned int edits = 0;
1222 		if (_left_alignment)
1223 			edits += _left_alignment->edit_dist();
1224 		if (_right_alignment)
1225 			edits += _right_alignment->edit_dist();
1226 		return edits;
1227 	}
1228 
collapse_mass()1229 	double collapse_mass() const { return _collapse_mass; }
collapse_mass(double m)1230 	void collapse_mass(double m) { _collapse_mass = m; }
incr_collapse_mass(double incr)1231 	void incr_collapse_mass(double incr) { _collapse_mass += incr; }
1232 
1233 private:
1234 
1235     boost::shared_ptr<ReadGroupProperties const> _rg_props;
1236 	RefID _refid;
1237 	const ReadHit* _left_alignment;
1238 	const ReadHit* _right_alignment;
1239 	double _collapse_mass;
1240 	bool _is_mapped;
1241 	//bool _closed;
1242 };
1243 
1244 bool mate_hit_lt(const MateHit& lhs, const MateHit& rhs);
1245 
1246 bool hits_eq_mod_id(const ReadHit& lhs, const ReadHit& rhs);
1247 
1248 bool hits_eq_non_multi(const MateHit& lhs, const MateHit& rhs);
1249 bool hits_eq_non_multi_non_replicate(const MateHit& lhs, const MateHit& rhs);
1250 
1251 bool hits_equals(const MateHit& lhs, const MateHit& rhs);
1252 
1253 bool has_no_collapse_mass(const MateHit& hit);
1254 
1255 // Assumes hits are sorted by mate_hit_lt
1256 void collapse_hits(const vector<MateHit>& hits,
1257 				   vector<MateHit>& non_redundant);
1258 
1259 void normalize_counts(std::vector<boost::shared_ptr<ReadGroupProperties> > & all_read_groups);
1260 
1261 HitFactory* createSamHitFactory(const string& hit_file_name, ReadTable& it, RefSequenceTable& rt);
1262 
1263 #endif
1264