1 #ifndef OLIGOFAR_CSAMALIGNMENTS__HPP
2 #define OLIGOFAR_CSAMALIGNMENTS__HPP
3 
4 #include "ialignmentmap.hpp"
5 #include "cseqcoding.hpp"
6 #include "csam.hpp"
7 
8 BEGIN_OLIGOFAR_SCOPES
9 
10 class CSRead : public IShortRead
11 {
12 public:
~CSRead()13     ~CSRead() { delete[] m_data; --s_count; }
14 
GetPairType() const15     virtual EPairType GetPairType() const { return x_GetPairType(); }
GetSequenceCoding() const16     virtual CSeqCoding::ECoding GetSequenceCoding() const { return x_GetSequenceCoding(); }
GetSequenceLength() const17     virtual int GetSequenceLength() const { return x_GetSequenceLength(); }
GetMate() const18     const IShortRead * GetMate() const { return x_GetPairType() ? x_GetMate() : 0; }
GetSequenceData() const19     const char * GetSequenceData() const { return m_data + x_GetSequenceOffset(); }
GetId() const20     const char * GetId() const { return x_GetPairType() == eRead_second ? x_GetMate()->GetId() : m_data + x_GetIdOffset(); }
21 
22     CSRead( const string& id, CSeqCoding::ECoding coding, const char * sequence, int length, EPairType intended = eRead_single );
23     CSRead( const string& id, const string& sequence, const string& quality = "", EPairType intended = eRead_single );
24     CSRead( const CSRead& r );
25     CSRead& operator = ( const CSRead& r );
26 
27     void SetMate( CSRead * other ); // other becomes second sequence
28 
GetIntendedPairType() const29     EPairType GetIntendedPairType() const { return EPairType((m_data[2] >> 2)&3); }
30 
31     string GetIupacna( CSeqCoding::EStrand = CSeqCoding::eStrand_pos ) const;
32 
GetCount()33     static int GetCount() { return s_count; }
34 
35 protected:
x_GetIdLength() const36     Uint2 x_GetIdLength() const { return Uint1( m_data[0] ); }
x_GetSequenceLength() const37     Uint2 x_GetSequenceLength() const { return Uint1( m_data[1] ); }
x_GetPairType() const38     EPairType x_GetPairType() const { return EPairType( m_data[2] & 3 ); }
x_GetSequenceCoding() const39     CSeqCoding::ECoding x_GetSequenceCoding() const { return CSeqCoding::ECoding( Uint1( m_data[3] ) ); }
40 
x_GetHeaderSize() const41     Uint1 x_GetHeaderSize() const { return 4 + ( x_GetPairType() ? sizeof( void* ) : 0 ); }
x_GetIdOffset() const42     Uint2 x_GetIdOffset() const { return x_GetHeaderSize(); }
x_GetSequenceOffset() const43     Uint2 x_GetSequenceOffset() const { return x_GetIdOffset() + x_GetIdLength() + 1; }
x_GetDataSize() const44     Uint2 x_GetDataSize() const { return x_GetSequenceOffset() + x_GetSequenceLength() + 1; }
x_GetDataSize(int idlen,int seqlen,EPairType type) const45     Uint2 x_GetDataSize( int idlen, int seqlen, EPairType type ) const {
46         switch( type ) {
47             case eRead_single: return 6 + idlen + seqlen;
48             case eRead_first: return 6 + sizeof( void * ) + idlen + seqlen;
49             case eRead_second: return 5 + sizeof( void * ) + seqlen;
50         }
51         THROW( logic_error, "Unknown read pair type" );
52     }
x_GetMate() const53     const CSRead * x_GetMate() const { return *(CSRead**)(m_data + 4); }
x_SetMate()54     CSRead * & x_SetMate() { return *(CSRead**)(m_data + 4); }
55 protected:
56     char * m_data;
57     static int s_count;
58 };
59 
60 class CContig : public INucSeq
61 {
62 public:
~CContig()63     ~CContig() { if( m_owns ) delete[] m_data; --s_count; }
64     CContig( const string& name );
65     CContig( const CContig& ctg );
SetSequenceData(const char * data,int length,int offset=0,CSeqCoding::ECoding coding=CSeqCoding::eCoding_ncbi8na,bool owns=false)66     void SetSequenceData( const char * data, int length, int offset = 0, CSeqCoding::ECoding coding = CSeqCoding::eCoding_ncbi8na, bool owns = false ) {
67         if( m_owns ) delete[] m_data;
68         m_data = const_cast<char*>( data );  // if we own it - we can do anything, otherwise we don't touch
69         m_size = length; m_coding = coding; m_owns = owns;
70         m_offset = offset;
71     }
GetAvailableRange() const72     CRange<int> GetAvailableRange() const { return CRange<int>( m_offset, m_size - 1 ); }
GetId() const73     virtual const char * GetId() const { return m_id.c_str(); }
GetSequenceData() const74     virtual const char * GetSequenceData() const { return m_data - m_offset; } //&m_data[0]; }
GetSequenceLength() const75     virtual int GetSequenceLength() const { return m_size; } // m_data.size(); }
GetSequenceOffset() const76     virtual int GetSequenceOffset() const { return m_offset; }
GetSequenceCoding() const77     virtual CSeqCoding::ECoding GetSequenceCoding() const { return m_coding; } //CSeqCoding::eCoding_ncbi8na; }
78 
GetCount()79     static int GetCount() { return s_count; }
80 private:
81     CContig& operator = ( const CContig& );
82 protected:
83     string m_id;
84     char * m_data;
85     int m_size;
86     int m_offset;
87     CSeqCoding::ECoding m_coding;
88     bool m_owns;
89 
90     static int s_count;
91 };
92 
93 class IAligner;
94 class CMappedAlignment : public IMappedAlignment
95 {
96 public:
97     typedef map<string,string> TTags;
98     enum EFlags { fOwnQuery = 0x01, fOwnSubject = 0x02, fOwnSeqs = fOwnQuery|fOwnSubject };
99     enum ETagType { eType_int = 'i', eType_float = 'f', eType_string = 'Z', eType_NONE = 0 };
GetId() const100     virtual const char * GetId() const { return 0; }
GetQuery() const101     virtual const IShortRead * GetQuery() const { return m_query; }
GetSubject() const102     virtual const INucSeq * GetSubject() const { return m_subject; }
GetSubjectBounding() const103     virtual TRange GetSubjectBounding() const { return TRange( m_sstart, m_sstart + m_slength - 1 ); }
GetQueryBounding() const104     virtual TRange GetQueryBounding() const { return m_qlength > 0 ? TRange( m_qstart, m_qstart + m_qlength - 1 ) : TRange( m_qstart + m_qlength + 1, m_qstart ); }
GetCigar() const105     virtual const TTrSequence GetCigar() const { return m_cigar; }
IsReverseComplement() const106     virtual bool IsReverseComplement() const { return m_qlength < 0; }
GetMate() const107     virtual const IMappedAlignment * GetMate() const { return m_mate; }
SetQuery()108     CSRead * SetQuery() { return m_query; }
SetSubject()109     CContig * SetSubject() { return m_subject; }
GetStrand() const110     CSeqCoding::EStrand GetStrand() const { return IsReverseComplement() ? CSeqCoding::eStrand_neg : CSeqCoding::eStrand_pos; }
111     void SetMate( CMappedAlignment * other );
112     void WriteAsSam( ostream& out ) const;
SetFlags(int flags,bool on=true)113     void SetFlags( int flags, bool on = true ) { if( on ) m_flags |= flags; else m_flags &= ~flags; }
114     void AdjustQueryFromBy( int );
115     void AdjustQueryToBy( int  );
116     void AdjustSubjectFromBy( int  );
117     void AdjustSubjectToBy( int  );
118     void AdjustQueryFromTo( int );
119     void AdjustQueryToTo( int  );
120     void AdjustSubjectFromTo( int  );
121     void AdjustSubjectToTo( int  );
122     void SetTagS( const string& name, const string& val );
123     void SetTagI( const string& name, int val );
124     void SetTagF( const string& name, float val );
125     char GetTagType( const string& name ) const;
126     string GetTagS( const string& name, char type = 'Z' ) const;
127     int GetTagI( const string& name ) const;
128     float GetTagF( const string& name ) const;
129     void RemoveTag( const string& name );
130     void Assign( const CMappedAlignment * other );
SetTags(TTags * tags)131     void SetTags( TTags * tags ) { delete m_tags; m_tags = tags; }
GetTags() const132     const TTags& GetTags() const { static TTags notags; if( m_tags ) return *m_tags; else return notags; }
133     CMappedAlignment * MakeExtended( IAligner * aligner ) const;
Clone() const134     CMappedAlignment * Clone() const { return new CMappedAlignment( *this ); }
135     CMappedAlignment( CSRead * query, CContig * subject, int from, const TTrVector& cigar, CSeqCoding::EStrand strand, int flags = 0, TTags * tags = 0 );
~CMappedAlignment()136     ~CMappedAlignment() { if( m_flags & fOwnQuery ) delete m_query; if( m_flags & fOwnSubject ) delete m_subject; delete m_tags; --s_count; }
PrintDebug(ostream & out) const137     void PrintDebug( ostream& out ) const {
138         out << m_cigar.ToString() << DISPLAY( m_sstart ) << DISPLAY( m_slength ) << DISPLAY( m_qstart ) << DISPLAY( m_qlength ) << hex << DISPLAY( m_flags ) << dec;
139     }
140     bool ValidateConsistency( const string& context = "" ) const;
operator ==(const CMappedAlignment & other) const141     bool operator == ( const CMappedAlignment& other ) const {
142         return
143             strcmp( m_query->GetId(), other.m_query->GetId() ) == 0 &&
144             strcmp( m_subject->GetId(), other.m_subject->GetId() ) == 0 &&
145             m_sstart == other.m_sstart && m_slength == other.m_slength &&
146             m_qstart == other.m_qstart && m_qlength == other.m_qlength &&
147             m_cigar == other.m_cigar;
148     }
operator !=(const CMappedAlignment & other) const149     bool operator != ( const CMappedAlignment& other ) const { return !operator == ( other ); }
150 
GetCount()151     static int GetCount() { return s_count; }
152 
153     double ScoreAlignment( double id = 1, double mm = -1, double go = -3, double ge = -1.5 ) const;
154     static double ScoreAlignment( const TTrSequence& cigar, double id = 1, double mm = -1, double go = -3, double ge = -1.5 );
155 
156 private:
157     CMappedAlignment& operator = ( const CMappedAlignment& );
158     enum EAdjustMode { eAdjust_query, eAdjust_subj };
159     void AdjustAlignment( EAdjustMode, int fromBy, int toBy  );
160 protected:
161     //int x_AdjustCigar( int advanceFrontBy, EAdjustMode );
162     CMappedAlignment( const CMappedAlignment& a );
163 protected:
164     CSRead * m_query;
165     CContig * m_subject;
166     const CMappedAlignment * m_mate;
167     Int4 m_sstart;
168     Int2 m_slength;
169     Int2 m_qstart;
170     Int2 m_qlength;
171     Int2 m_flags;
172     TTrSequence m_cigar;
173     TTags * m_tags;
174     static int s_count;
175 public:
176     static bool s_validate_consistency;
177 };
178 
179 class CSamSource : public IAlignmentMap, public CSamBase
180 {
181 public:
182     typedef vector<CMappedAlignment*> TAlignmentList;
183     typedef vector<Uint8> TFileOffsets;
184     typedef map<string,CSRead*> TReads;
185     typedef map<string,CContig*> TContigs;
186     typedef map<string,TAlignmentList> TAlignmentMap;
187     typedef map<string,TFileOffsets> TAlignmentOffsetMap;
188     typedef map<pair<string, pair<string,int> >, CMappedAlignment* > TPendingHits;
189 
190     enum ERegisterFlags { fRegisterQuery = 0x01, fRegisterSubject = 0x02, fRegisterAlignmentByQuery = 0x04, fRegisterAlignmentBySubject = 0x08 };
191 
192     ~CSamSource();
193     CMappedAlignment * ParseSamLine( const vector<string>& samLine, int flags = 0 );
194     CMappedAlignment * ParseSamLine( const string& samLine, int flags = 0 );
195 
196     void IndexFile( const string& samFile, unsigned start, unsigned limit );
197 
198     static TTrVector GetFullCigar( const TTrVector& cigar, const string& mismatches );
199 
GetAlignmentsForQueryId(ICallback *,const char * id,int mates=3)200     virtual void GetAlignmentsForQueryId( ICallback *, const char * id, int mates = 3 ) {}
GetAlignmentsForSubjectId(ICallback *,const char * id,int strands=3,int from=kSequenceBegin,int to=kSequenceEnd)201     virtual void GetAlignmentsForSubjectId( ICallback *, const char * id, int strands = 3, int from = kSequenceBegin, int to = kSequenceEnd ) {}
GetAllAlignments(ICallback *)202     virtual void GetAllAlignments( ICallback * ) {}
GetAllQueries(ICallback *)203     virtual void GetAllQueries( ICallback * ) {}
GetAllSubjects(ICallback *)204     virtual void GetAllSubjects( ICallback * ) {}
205 
AddAlignment(IMappedAlignment * ma)206     virtual void AddAlignment( IMappedAlignment * ma ) {}
AddSubject(INucSeq * ma)207     virtual void AddSubject( INucSeq * ma ) {}
AddQuery(IShortRead * ma)208     virtual void AddQuery( IShortRead * ma ) {}
209 
GetAlignmentOffsetMap() const210     const TAlignmentOffsetMap& GetAlignmentOffsetMap() const { return m_alignmentOffsets; }
GetAlignmentOffsets(const string & ctg) const211     const TFileOffsets& GetAlignmentOffsets( const string& ctg ) const {
212         TAlignmentOffsetMap::const_iterator x = m_alignmentOffsets.find( ctg );
213         if( x == m_alignmentOffsets.end() ) {
214             static TFileOffsets null;
215             return null;
216         }
217         else return x->second;
218     }
219 protected:
220     TContigs m_contigs;
221     TReads m_reads;
222     TAlignmentMap m_alignmentsByContig;
223     TAlignmentMap m_alignmentsByRead;
224     TAlignmentOffsetMap m_alignmentOffsets;
225     TPendingHits m_pendingHits;
226     auto_ptr<ifstream> m_samFile;
227 };
228 
AdjustQueryFromBy(int by)229 inline void CMappedAlignment::AdjustQueryFromBy( int by )
230 {
231     AdjustAlignment( eAdjust_query, by, 0 );
232 }
233 
AdjustQueryToBy(int by)234 inline void CMappedAlignment::AdjustQueryToBy( int by )
235 {
236     AdjustAlignment( eAdjust_query, 0, by );
237 }
238 
AdjustSubjectFromBy(int by)239 inline void CMappedAlignment::AdjustSubjectFromBy( int by )
240 {
241     AdjustAlignment( eAdjust_subj, by, 0 );
242 }
243 
AdjustSubjectToBy(int by)244 inline void CMappedAlignment::AdjustSubjectToBy( int by )
245 {
246     AdjustAlignment( eAdjust_subj, 0, by );
247 }
248 
AdjustQueryFromTo(int newFrom)249 inline void CMappedAlignment::AdjustQueryFromTo( int newFrom )
250 {
251     AdjustQueryFromBy( newFrom - GetQueryBounding().GetFrom() );
252 }
253 
AdjustQueryToTo(int newTo)254 inline void CMappedAlignment::AdjustQueryToTo( int newTo )
255 {
256     AdjustQueryToBy( newTo - GetQueryBounding().GetTo() );
257 }
258 
AdjustSubjectFromTo(int newFrom)259 inline void CMappedAlignment::AdjustSubjectFromTo( int newFrom )
260 {
261     AdjustSubjectFromBy( newFrom - GetSubjectBounding().GetFrom() );
262 }
263 
AdjustSubjectToTo(int newTo)264 inline void CMappedAlignment::AdjustSubjectToTo( int newTo )
265 {
266     AdjustSubjectFromBy( newTo - GetSubjectBounding().GetTo() );
267 }
268 
269 END_OLIGOFAR_SCOPES
270 
271 #endif
272