1 #ifndef OLIGOFAR_CSAMALIGNMENTS__HPP
2 #define OLIGOFAR_CSAMALIGNMENTS__HPP
3
4 #include "ialignmentmap.hpp"
5 #include "cseqcoding.hpp"
6 #include "csam.hpp"
7
8 BEGIN_OLIGOFAR_SCOPES
9
10 class CSRead : public IShortRead
11 {
12 public:
~CSRead()13 ~CSRead() { delete[] m_data; --s_count; }
14
GetPairType() const15 virtual EPairType GetPairType() const { return x_GetPairType(); }
GetSequenceCoding() const16 virtual CSeqCoding::ECoding GetSequenceCoding() const { return x_GetSequenceCoding(); }
GetSequenceLength() const17 virtual int GetSequenceLength() const { return x_GetSequenceLength(); }
GetMate() const18 const IShortRead * GetMate() const { return x_GetPairType() ? x_GetMate() : 0; }
GetSequenceData() const19 const char * GetSequenceData() const { return m_data + x_GetSequenceOffset(); }
GetId() const20 const char * GetId() const { return x_GetPairType() == eRead_second ? x_GetMate()->GetId() : m_data + x_GetIdOffset(); }
21
22 CSRead( const string& id, CSeqCoding::ECoding coding, const char * sequence, int length, EPairType intended = eRead_single );
23 CSRead( const string& id, const string& sequence, const string& quality = "", EPairType intended = eRead_single );
24 CSRead( const CSRead& r );
25 CSRead& operator = ( const CSRead& r );
26
27 void SetMate( CSRead * other ); // other becomes second sequence
28
GetIntendedPairType() const29 EPairType GetIntendedPairType() const { return EPairType((m_data[2] >> 2)&3); }
30
31 string GetIupacna( CSeqCoding::EStrand = CSeqCoding::eStrand_pos ) const;
32
GetCount()33 static int GetCount() { return s_count; }
34
35 protected:
x_GetIdLength() const36 Uint2 x_GetIdLength() const { return Uint1( m_data[0] ); }
x_GetSequenceLength() const37 Uint2 x_GetSequenceLength() const { return Uint1( m_data[1] ); }
x_GetPairType() const38 EPairType x_GetPairType() const { return EPairType( m_data[2] & 3 ); }
x_GetSequenceCoding() const39 CSeqCoding::ECoding x_GetSequenceCoding() const { return CSeqCoding::ECoding( Uint1( m_data[3] ) ); }
40
x_GetHeaderSize() const41 Uint1 x_GetHeaderSize() const { return 4 + ( x_GetPairType() ? sizeof( void* ) : 0 ); }
x_GetIdOffset() const42 Uint2 x_GetIdOffset() const { return x_GetHeaderSize(); }
x_GetSequenceOffset() const43 Uint2 x_GetSequenceOffset() const { return x_GetIdOffset() + x_GetIdLength() + 1; }
x_GetDataSize() const44 Uint2 x_GetDataSize() const { return x_GetSequenceOffset() + x_GetSequenceLength() + 1; }
x_GetDataSize(int idlen,int seqlen,EPairType type) const45 Uint2 x_GetDataSize( int idlen, int seqlen, EPairType type ) const {
46 switch( type ) {
47 case eRead_single: return 6 + idlen + seqlen;
48 case eRead_first: return 6 + sizeof( void * ) + idlen + seqlen;
49 case eRead_second: return 5 + sizeof( void * ) + seqlen;
50 }
51 THROW( logic_error, "Unknown read pair type" );
52 }
x_GetMate() const53 const CSRead * x_GetMate() const { return *(CSRead**)(m_data + 4); }
x_SetMate()54 CSRead * & x_SetMate() { return *(CSRead**)(m_data + 4); }
55 protected:
56 char * m_data;
57 static int s_count;
58 };
59
60 class CContig : public INucSeq
61 {
62 public:
~CContig()63 ~CContig() { if( m_owns ) delete[] m_data; --s_count; }
64 CContig( const string& name );
65 CContig( const CContig& ctg );
SetSequenceData(const char * data,int length,int offset=0,CSeqCoding::ECoding coding=CSeqCoding::eCoding_ncbi8na,bool owns=false)66 void SetSequenceData( const char * data, int length, int offset = 0, CSeqCoding::ECoding coding = CSeqCoding::eCoding_ncbi8na, bool owns = false ) {
67 if( m_owns ) delete[] m_data;
68 m_data = const_cast<char*>( data ); // if we own it - we can do anything, otherwise we don't touch
69 m_size = length; m_coding = coding; m_owns = owns;
70 m_offset = offset;
71 }
GetAvailableRange() const72 CRange<int> GetAvailableRange() const { return CRange<int>( m_offset, m_size - 1 ); }
GetId() const73 virtual const char * GetId() const { return m_id.c_str(); }
GetSequenceData() const74 virtual const char * GetSequenceData() const { return m_data - m_offset; } //&m_data[0]; }
GetSequenceLength() const75 virtual int GetSequenceLength() const { return m_size; } // m_data.size(); }
GetSequenceOffset() const76 virtual int GetSequenceOffset() const { return m_offset; }
GetSequenceCoding() const77 virtual CSeqCoding::ECoding GetSequenceCoding() const { return m_coding; } //CSeqCoding::eCoding_ncbi8na; }
78
GetCount()79 static int GetCount() { return s_count; }
80 private:
81 CContig& operator = ( const CContig& );
82 protected:
83 string m_id;
84 char * m_data;
85 int m_size;
86 int m_offset;
87 CSeqCoding::ECoding m_coding;
88 bool m_owns;
89
90 static int s_count;
91 };
92
93 class IAligner;
94 class CMappedAlignment : public IMappedAlignment
95 {
96 public:
97 typedef map<string,string> TTags;
98 enum EFlags { fOwnQuery = 0x01, fOwnSubject = 0x02, fOwnSeqs = fOwnQuery|fOwnSubject };
99 enum ETagType { eType_int = 'i', eType_float = 'f', eType_string = 'Z', eType_NONE = 0 };
GetId() const100 virtual const char * GetId() const { return 0; }
GetQuery() const101 virtual const IShortRead * GetQuery() const { return m_query; }
GetSubject() const102 virtual const INucSeq * GetSubject() const { return m_subject; }
GetSubjectBounding() const103 virtual TRange GetSubjectBounding() const { return TRange( m_sstart, m_sstart + m_slength - 1 ); }
GetQueryBounding() const104 virtual TRange GetQueryBounding() const { return m_qlength > 0 ? TRange( m_qstart, m_qstart + m_qlength - 1 ) : TRange( m_qstart + m_qlength + 1, m_qstart ); }
GetCigar() const105 virtual const TTrSequence GetCigar() const { return m_cigar; }
IsReverseComplement() const106 virtual bool IsReverseComplement() const { return m_qlength < 0; }
GetMate() const107 virtual const IMappedAlignment * GetMate() const { return m_mate; }
SetQuery()108 CSRead * SetQuery() { return m_query; }
SetSubject()109 CContig * SetSubject() { return m_subject; }
GetStrand() const110 CSeqCoding::EStrand GetStrand() const { return IsReverseComplement() ? CSeqCoding::eStrand_neg : CSeqCoding::eStrand_pos; }
111 void SetMate( CMappedAlignment * other );
112 void WriteAsSam( ostream& out ) const;
SetFlags(int flags,bool on=true)113 void SetFlags( int flags, bool on = true ) { if( on ) m_flags |= flags; else m_flags &= ~flags; }
114 void AdjustQueryFromBy( int );
115 void AdjustQueryToBy( int );
116 void AdjustSubjectFromBy( int );
117 void AdjustSubjectToBy( int );
118 void AdjustQueryFromTo( int );
119 void AdjustQueryToTo( int );
120 void AdjustSubjectFromTo( int );
121 void AdjustSubjectToTo( int );
122 void SetTagS( const string& name, const string& val );
123 void SetTagI( const string& name, int val );
124 void SetTagF( const string& name, float val );
125 char GetTagType( const string& name ) const;
126 string GetTagS( const string& name, char type = 'Z' ) const;
127 int GetTagI( const string& name ) const;
128 float GetTagF( const string& name ) const;
129 void RemoveTag( const string& name );
130 void Assign( const CMappedAlignment * other );
SetTags(TTags * tags)131 void SetTags( TTags * tags ) { delete m_tags; m_tags = tags; }
GetTags() const132 const TTags& GetTags() const { static TTags notags; if( m_tags ) return *m_tags; else return notags; }
133 CMappedAlignment * MakeExtended( IAligner * aligner ) const;
Clone() const134 CMappedAlignment * Clone() const { return new CMappedAlignment( *this ); }
135 CMappedAlignment( CSRead * query, CContig * subject, int from, const TTrVector& cigar, CSeqCoding::EStrand strand, int flags = 0, TTags * tags = 0 );
~CMappedAlignment()136 ~CMappedAlignment() { if( m_flags & fOwnQuery ) delete m_query; if( m_flags & fOwnSubject ) delete m_subject; delete m_tags; --s_count; }
PrintDebug(ostream & out) const137 void PrintDebug( ostream& out ) const {
138 out << m_cigar.ToString() << DISPLAY( m_sstart ) << DISPLAY( m_slength ) << DISPLAY( m_qstart ) << DISPLAY( m_qlength ) << hex << DISPLAY( m_flags ) << dec;
139 }
140 bool ValidateConsistency( const string& context = "" ) const;
operator ==(const CMappedAlignment & other) const141 bool operator == ( const CMappedAlignment& other ) const {
142 return
143 strcmp( m_query->GetId(), other.m_query->GetId() ) == 0 &&
144 strcmp( m_subject->GetId(), other.m_subject->GetId() ) == 0 &&
145 m_sstart == other.m_sstart && m_slength == other.m_slength &&
146 m_qstart == other.m_qstart && m_qlength == other.m_qlength &&
147 m_cigar == other.m_cigar;
148 }
operator !=(const CMappedAlignment & other) const149 bool operator != ( const CMappedAlignment& other ) const { return !operator == ( other ); }
150
GetCount()151 static int GetCount() { return s_count; }
152
153 double ScoreAlignment( double id = 1, double mm = -1, double go = -3, double ge = -1.5 ) const;
154 static double ScoreAlignment( const TTrSequence& cigar, double id = 1, double mm = -1, double go = -3, double ge = -1.5 );
155
156 private:
157 CMappedAlignment& operator = ( const CMappedAlignment& );
158 enum EAdjustMode { eAdjust_query, eAdjust_subj };
159 void AdjustAlignment( EAdjustMode, int fromBy, int toBy );
160 protected:
161 //int x_AdjustCigar( int advanceFrontBy, EAdjustMode );
162 CMappedAlignment( const CMappedAlignment& a );
163 protected:
164 CSRead * m_query;
165 CContig * m_subject;
166 const CMappedAlignment * m_mate;
167 Int4 m_sstart;
168 Int2 m_slength;
169 Int2 m_qstart;
170 Int2 m_qlength;
171 Int2 m_flags;
172 TTrSequence m_cigar;
173 TTags * m_tags;
174 static int s_count;
175 public:
176 static bool s_validate_consistency;
177 };
178
179 class CSamSource : public IAlignmentMap, public CSamBase
180 {
181 public:
182 typedef vector<CMappedAlignment*> TAlignmentList;
183 typedef vector<Uint8> TFileOffsets;
184 typedef map<string,CSRead*> TReads;
185 typedef map<string,CContig*> TContigs;
186 typedef map<string,TAlignmentList> TAlignmentMap;
187 typedef map<string,TFileOffsets> TAlignmentOffsetMap;
188 typedef map<pair<string, pair<string,int> >, CMappedAlignment* > TPendingHits;
189
190 enum ERegisterFlags { fRegisterQuery = 0x01, fRegisterSubject = 0x02, fRegisterAlignmentByQuery = 0x04, fRegisterAlignmentBySubject = 0x08 };
191
192 ~CSamSource();
193 CMappedAlignment * ParseSamLine( const vector<string>& samLine, int flags = 0 );
194 CMappedAlignment * ParseSamLine( const string& samLine, int flags = 0 );
195
196 void IndexFile( const string& samFile, unsigned start, unsigned limit );
197
198 static TTrVector GetFullCigar( const TTrVector& cigar, const string& mismatches );
199
GetAlignmentsForQueryId(ICallback *,const char * id,int mates=3)200 virtual void GetAlignmentsForQueryId( ICallback *, const char * id, int mates = 3 ) {}
GetAlignmentsForSubjectId(ICallback *,const char * id,int strands=3,int from=kSequenceBegin,int to=kSequenceEnd)201 virtual void GetAlignmentsForSubjectId( ICallback *, const char * id, int strands = 3, int from = kSequenceBegin, int to = kSequenceEnd ) {}
GetAllAlignments(ICallback *)202 virtual void GetAllAlignments( ICallback * ) {}
GetAllQueries(ICallback *)203 virtual void GetAllQueries( ICallback * ) {}
GetAllSubjects(ICallback *)204 virtual void GetAllSubjects( ICallback * ) {}
205
AddAlignment(IMappedAlignment * ma)206 virtual void AddAlignment( IMappedAlignment * ma ) {}
AddSubject(INucSeq * ma)207 virtual void AddSubject( INucSeq * ma ) {}
AddQuery(IShortRead * ma)208 virtual void AddQuery( IShortRead * ma ) {}
209
GetAlignmentOffsetMap() const210 const TAlignmentOffsetMap& GetAlignmentOffsetMap() const { return m_alignmentOffsets; }
GetAlignmentOffsets(const string & ctg) const211 const TFileOffsets& GetAlignmentOffsets( const string& ctg ) const {
212 TAlignmentOffsetMap::const_iterator x = m_alignmentOffsets.find( ctg );
213 if( x == m_alignmentOffsets.end() ) {
214 static TFileOffsets null;
215 return null;
216 }
217 else return x->second;
218 }
219 protected:
220 TContigs m_contigs;
221 TReads m_reads;
222 TAlignmentMap m_alignmentsByContig;
223 TAlignmentMap m_alignmentsByRead;
224 TAlignmentOffsetMap m_alignmentOffsets;
225 TPendingHits m_pendingHits;
226 auto_ptr<ifstream> m_samFile;
227 };
228
AdjustQueryFromBy(int by)229 inline void CMappedAlignment::AdjustQueryFromBy( int by )
230 {
231 AdjustAlignment( eAdjust_query, by, 0 );
232 }
233
AdjustQueryToBy(int by)234 inline void CMappedAlignment::AdjustQueryToBy( int by )
235 {
236 AdjustAlignment( eAdjust_query, 0, by );
237 }
238
AdjustSubjectFromBy(int by)239 inline void CMappedAlignment::AdjustSubjectFromBy( int by )
240 {
241 AdjustAlignment( eAdjust_subj, by, 0 );
242 }
243
AdjustSubjectToBy(int by)244 inline void CMappedAlignment::AdjustSubjectToBy( int by )
245 {
246 AdjustAlignment( eAdjust_subj, 0, by );
247 }
248
AdjustQueryFromTo(int newFrom)249 inline void CMappedAlignment::AdjustQueryFromTo( int newFrom )
250 {
251 AdjustQueryFromBy( newFrom - GetQueryBounding().GetFrom() );
252 }
253
AdjustQueryToTo(int newTo)254 inline void CMappedAlignment::AdjustQueryToTo( int newTo )
255 {
256 AdjustQueryToBy( newTo - GetQueryBounding().GetTo() );
257 }
258
AdjustSubjectFromTo(int newFrom)259 inline void CMappedAlignment::AdjustSubjectFromTo( int newFrom )
260 {
261 AdjustSubjectFromBy( newFrom - GetSubjectBounding().GetFrom() );
262 }
263
AdjustSubjectToTo(int newTo)264 inline void CMappedAlignment::AdjustSubjectToTo( int newTo )
265 {
266 AdjustSubjectFromBy( newTo - GetSubjectBounding().GetTo() );
267 }
268
269 END_OLIGOFAR_SCOPES
270
271 #endif
272