1 #ifndef OLIGOFAR__CALIGNER__HPP
2 #define OLIGOFAR__CALIGNER__HPP
3
4 #include "iscore.hpp"
5 #include "ialigner.hpp"
6 #include "calignparam.hpp"
7 #include "cintron.hpp"
8 #include "ctranscript.hpp"
9
10 #include <iostream>
11
12 BEGIN_OLIGOFAR_SCOPES
13
14 class IScore;
15 class CScoreParam;
16 class CScoringFactory;
17 class CAligner : public IAligner, public CTrBase
18 {
19 public:
CAligner(CScoringFactory * sf)20 CAligner( CScoringFactory * sf ) :
21 m_scoringFactory( sf ),
22 m_scoreParam( 0 ),
23 m_alignParam( 0 ),
24 m_score( 0 ),
25 m_extentionPenaltyDropoff( -1000000 ),
26 m_matrixSize( 0 ),
27 m_matrixOffset( -1 ),
28 m_queryBases( 0 ),
29 m_queryBasesChecked( 0 ),
30 m_scoreCalls( 0 ),
31 m_totalCalls( 0 ),
32 m_successAligns( 0 ),
33 m_algoX0all( 0 ),
34 m_algoX0end( 0 ),
35 m_algoX0win( 0 ),
36 m_algoSWend( 0 ),
37 m_algoSWwin( 0 ),
38 m_wordDistance( 0 )
39 {}
40
41 ~CAligner();
42
SetWordDistance(int k)43 virtual void SetWordDistance( int k ) { m_wordDistance = k; }
44 virtual void SetQueryCoding( CSeqCoding::ECoding );
45 virtual void SetSubjectCoding( CSeqCoding::ECoding );
46 virtual void SetQueryStrand( CSeqCoding::EStrand );
47 virtual void SetSubjectStrand( CSeqCoding::EStrand );
48
49 virtual void SetQuery( const char * begin, int len );
50 virtual void SetSubject( const char * begin, int len );
51 virtual void SetSubjectAnchor( int first, int last ); // last included
52 virtual void SetQueryAnchor( int first, int last ); // last included
53
54 virtual void SetSubjectGuideTranscript( const TTranscript& tr );
55
56 virtual void SetPenaltyLimit( double limit );
57 virtual void SetExtentionPenaltyDropoff( double limit );
58
59 virtual bool Align();
60
GetPenalty() const61 virtual double GetPenalty() const { return m_penalty; }
GetSubjectFrom() const62 virtual int GetSubjectFrom() const { return m_posS1; }
GetSubjectTo() const63 virtual int GetSubjectTo() const { return m_posS2; }
GetQueryFrom() const64 virtual int GetQueryFrom() const { return m_posQ1; }
GetQueryTo() const65 virtual int GetQueryTo() const { return m_posQ2; }
66
GetSubjectTranscript() const67 virtual const TTranscript& GetSubjectTranscript() const { return m_transcript; }
68
GetScoringFactory() const69 virtual CScoringFactory * GetScoringFactory() const { return m_scoringFactory; }
70
GetSpliceSet() const71 const CAlignParam::TSpliceSet& GetSpliceSet() const { return m_alignParam->GetSpliceSet(); }
GetSpliceCount() const72 int GetSpliceCount() const { return GetSpliceSet().size(); }
NoSplicesDefined() const73 bool NoSplicesDefined() const { return GetSpliceSet().empty(); }
74
75 class CCell : public TTrItem
76 {
77 public:
CCell(double score=0,EEvent ev=eEvent_NULL,unsigned cnt=1)78 CCell( double score = 0, EEvent ev = eEvent_NULL, unsigned cnt = 1 ) :
79 TTrItem( ev, cnt ), m_score( score ) {}
GetScore() const80 double GetScore() const { return m_score; }
81 protected:
82 double m_score;
83 };
84 void PrintStatistics( ostream& out = cerr );
85
86 protected:
87
88 class CMaxGapLengths
89 {
90 public:
91 CMaxGapLengths( double maxpenalty, const CScoreParam * sc, const CAlignParam * ap );
AreIndelsAllowed() const92 bool AreIndelsAllowed() const { return m_maxInsLen|m_maxDelLen; }
GetMaxDelLength() const93 int GetMaxDelLength() const { return m_maxDelLen; }
GetMaxInsLength() const94 int GetMaxInsLength() const { return m_maxInsLen; }
95 protected:
96 int m_maxDelLen;
97 int m_maxInsLen;
98 };
99
100 class CIndelLimits : public CMaxGapLengths
101 {
102 public:
103 CIndelLimits( double pm, const CScoreParam * sc, const CAlignParam * ap );
AreIndelsAllowed() const104 bool AreIndelsAllowed() const { return m_delBand|m_insBand; }
GetDelBand() const105 int GetDelBand() const { return m_delBand; }
GetInsBand() const106 int GetInsBand() const { return m_insBand; }
107 protected:
108 int m_delBand;
109 int m_insBand;
110 };
111
112 class CDiagonalScores
113 {
114 public:
CDiagonalScores(double id)115 CDiagonalScores( double id ) : m_aq( 0 ), m_wq( 0 ), m_vq( 0 ), m_id( id ) {}
Add(double score)116 CDiagonalScores& Add( double score ) { m_aq += m_id; m_wq += (m_vq = score); return *this; }
GetLastScore() const117 double GetLastScore() const { return m_vq; }
GetLastPenalty() const118 double GetLastPenalty() const { return m_id - m_vq; }
GetAccumulatedBestScore() const119 double GetAccumulatedBestScore() const { return m_aq; }
GetAccumulatedPenalty() const120 double GetAccumulatedPenalty() const { return m_aq - m_wq; }
GetAccumulatedScore() const121 double GetAccumulatedScore() const { return m_wq; }
ExceedsPenalty(double abslimit) const122 bool ExceedsPenalty( double abslimit ) const { return GetAccumulatedPenalty() > abslimit; }
LastIsMatch() const123 bool LastIsMatch() const { return m_vq > 0; }
GetLastEvent() const124 EEvent GetLastEvent() const { return LastIsMatch() ? eEvent_Match : eEvent_Replaced; }
Print(ostream & o) const125 ostream& Print( ostream& o ) const {
126 return o << "CDiagonalScores"
127 << DISPLAY( m_aq ) << DISPLAY( m_wq ) << DISPLAY( m_vq ) << DISPLAY( m_aq - m_wq ) << DISPLAY( GetLastEvent() );
128 }
129 protected:
130 double m_aq;
131 double m_wq;
132 double m_vq;
133 const double m_id;
134 };
135
136 enum EAlignModeFlags {
137 eAlign_startAfterSplice = 0x01,
138 eAlign_endOnSplice = 0x02
139 };
140
141 void MakeLocalSpliceSet( CIntron::TSet&, int qdir, int qpos ) const;
142 double Score( const char * q, const char * s ) const;
143
144 // AlignWindow() may adjust alignment positions - which is useful especially in the case of indel of
145 // length 2 the window boundary (otherwise neither AlignWindow nor AlignDiag then can see the all indel,
146 // which leads to double accounting for gap base score)
147 bool AlignWindow( const char * & q, const char * & Q, const char * & s, const char * & S, TTranscript& t );
148 bool AlignWinTwoDiag( const char * & q, const char * & Q, const char * & s, const char * & S, int splice, int smin, int smax, TTranscript& t );
149 bool AlignDiag( const char * q, const char * Q, const char * s, const char * S, bool reverseStrand );
150 bool AlignTail( int qlen, int qdir, const char * q, const char * s, int& qe, int& se, TTranscript& t, int qpos );
151 bool AlignTailDiag( int qlen, int qdir, double pmax, const char * qseq, const char * sseq, int& ge, int& se, TTranscript& t, int qpos );
152
153 void AdjustMatrixSize();
154 void AdjustWindowBoundary( const char * & q, const char * & Q, const char * & s, const char * & S, TTranscript& tw );
155 const CCell& GetMatrix( int q, int s ) const;
156 const CCell& SetMatrix( int q, int s, double v, CCell::EEvent e, int c = 1 );
157
158 template <typename Iterator>
AppendTranscript(Iterator b,Iterator e)159 void AppendTranscript( Iterator b, Iterator e ) { while( b != e ) AppendTranscriptItem( *b++ ); }
160
161 void AppendTranscriptItem( const TTrItem& u ); // { ASSERT( u.GetEvent() != eEvent_NULL ); m_transcript.AppendItem( u ); }
AppendTranscriptItem(EEvent ev,unsigned cnt=1)162 void AppendTranscriptItem( EEvent ev, unsigned cnt = 1 ) { AppendTranscriptItem( TTrItem( ev, cnt ) ); }
163
PrintQ(ostream & o,const char * q,const char * Q) const164 ostream& PrintQ( ostream& o, const char * q, const char * Q ) const {
165 for( Q += m_qryInc ; q != Q ; q += m_qryInc ) o << CIupacnaBase( CNcbi8naBase( *q ) );
166 return o;
167 }
PrintS(ostream & o,const char * s,const char * S) const168 ostream& PrintS( ostream& o, const char * s, const char * S ) const {
169 for( S += m_sbjInc ; s != S ; s += m_sbjInc ) o << CIupacnaBase( CNcbi8naBase( *s ) );
170 return o;
171 }
172
173 void x_ColorspaceUpdatePenalty();
174
175 protected:
176 CScoringFactory * m_scoringFactory;
177 const CScoreParam * m_scoreParam;
178 const CAlignParam * m_alignParam;
179 IScore * m_score;
180
181 CSeqCoding::ECoding m_qryCoding;
182 CSeqCoding::ECoding m_sbjCoding;
183 CSeqCoding::EStrand m_qryStrand;
184 CSeqCoding::EStrand m_sbjStrand;
185 int m_anchorQ1;
186 int m_anchorQ2;
187 int m_anchorS1;
188 int m_anchorS2;
189 int m_qryInc;
190 int m_sbjInc;
191 const char * m_qry;
192 const char * m_sbj;
193 int m_qryLength;
194 int m_sbjLength;
195 double m_extentionPenaltyDropoff;
196 double m_penaltyLimit;
197 double m_penalty;
198
199 int m_posQ1;
200 int m_posQ2;
201 int m_posS1;
202 int m_posS2;
203
204 vector<CCell> m_matrix;
205 int m_matrixSize;
206 int m_matrixOffset;
207
208 Uint8 m_queryBases;
209 Uint8 m_queryBasesChecked;
210 mutable Uint8 m_scoreCalls;
211 Uint8 m_totalCalls;
212 Uint8 m_successAligns;
213 Uint4 m_algoX0all;
214 Uint4 m_algoX0end;
215 Uint4 m_algoX0win;
216 Uint4 m_algoSWend;
217 Uint4 m_algoSWwin;
218
219 int m_wordDistance;
220
221 TTranscript m_transcript;
222 TTranscript m_guideTranscript;
223 static map<double,Uint8> CAligner::s_maxPenalty;
224 };
225
AppendTranscriptItem(const TTrItem & u)226 inline void CAligner::AppendTranscriptItem( const TTrItem& u )
227 {
228 ASSERT( u.GetEvent() != eEvent_NULL );
229 if( m_transcript.size() ) {
230 if( m_transcript.back().GetEvent() == eEvent_SoftMask && u.GetEvent() == eEvent_Insertion ) {
231 m_penalty += m_scoreParam->GetGapBasePenalty() + ( m_scoreParam->GetGapExtentionPenalty() + m_scoreParam->GetIdentityScore() - m_scoreParam->GetMismatchPenalty() ) * u.GetCount();
232 m_transcript.AppendItem( TTrItem( eEvent_SoftMask, u.GetCount() ) );
233 return;
234 } else if( u.GetEvent() == eEvent_SoftMask && m_transcript.back().GetEvent() == eEvent_Insertion ) {
235 m_penalty += m_scoreParam->GetGapBasePenalty() + ( m_scoreParam->GetGapExtentionPenalty() + m_scoreParam->GetIdentityScore() - m_scoreParam->GetMismatchPenalty() ) * m_transcript.back().GetCount();
236 m_transcript.back() = TTrItem( eEvent_SoftMask, m_transcript.back().GetCount() );
237 }
238 }
239 m_transcript.AppendItem( u );
240 }
241
242
SetPenaltyLimit(double l)243 inline void CAligner::SetPenaltyLimit( double l ) { m_penaltyLimit = l; }
SetExtentionPenaltyDropoff(double x)244 inline void CAligner::SetExtentionPenaltyDropoff( double x ) { m_extentionPenaltyDropoff = abs( x ); }
SetQueryStrand(CSeqCoding::EStrand s)245 inline void CAligner::SetQueryStrand( CSeqCoding::EStrand s ) { m_qryStrand = s; }
SetQueryCoding(CSeqCoding::ECoding c)246 inline void CAligner::SetQueryCoding( CSeqCoding::ECoding c ) { m_qryCoding = c; }
SetSubjectStrand(CSeqCoding::EStrand s)247 inline void CAligner::SetSubjectStrand( CSeqCoding::EStrand s ) { m_sbjStrand = s; }
SetSubjectCoding(CSeqCoding::ECoding c)248 inline void CAligner::SetSubjectCoding( CSeqCoding::ECoding c ) { m_sbjCoding = c; }
SetQuery(const char * b,int l)249 inline void CAligner::SetQuery( const char * b, int l ) { m_qry = b; m_qryLength = l; }
SetQueryAnchor(int f,int l)250 inline void CAligner::SetQueryAnchor( int f, int l ) { m_anchorQ1 = f; m_anchorQ2 = l; m_guideTranscript.clear(); }
SetSubject(const char * b,int l)251 inline void CAligner::SetSubject( const char * b, int l ) { m_sbj = b; m_sbjLength = l; }
SetSubjectAnchor(int f,int l)252 inline void CAligner::SetSubjectAnchor( int f, int l ) { m_anchorS1 = f; m_anchorS2 = l; m_guideTranscript.clear(); }
SetSubjectGuideTranscript(const TTranscript & tr)253 inline void CAligner::SetSubjectGuideTranscript( const TTranscript& tr )
254 {
255 pair<int,int> lengths = tr.ComputeLengths();
256 if( lengths.second == abs(m_anchorS2 - m_anchorS1 + 1) && lengths.first == abs(m_anchorS2 - m_anchorQ1 + 1) ) {
257 // clip masked regions
258 ITERATE( TTranscript, t, tr ) {
259 switch( t->GetEvent() ) {
260 case eEvent_SoftMask:
261 case eEvent_HardMask:
262 case eEvent_NULL: break;
263 default: m_guideTranscript.AppendItem( *t ); break;
264 }
265 }
266 }
267 }
268
GetMatrix(int q,int s) const269 inline const CAligner::CCell& CAligner::GetMatrix( int q, int s ) const
270 {
271 return m_matrix[ (q - m_matrixOffset)*m_matrixSize + ++s ];
272 }
273
SetMatrix(int q,int s,double v,CCell::EEvent e,int cnt)274 inline const CAligner::CCell& CAligner::SetMatrix( int q, int s, double v, CCell::EEvent e, int cnt )
275 {
276 //#define OLIGOFAR_ALIGN_CHECK_MATRIX_BOUNDARIES 0
277 #if !defined(OLIGOFAR_ALIGN_CHECK_MATRIX_BOUNDARIES)
278 #define OLIGOFAR_ALIGN_CHECK_MATRIX_BOUNDARIES 0
279 #endif
280 #if OLIGOFAR_ALIGN_CHECK_MATRIX_BOUNDARIES
281 ASSERT( q >= -1 );
282 ASSERT( s >= -1 );
283 ASSERT( q < m_matrixSize - 1 );
284 ASSERT( s < m_matrixSize - 1 );
285 #endif
286 return m_matrix[ (q - m_matrixOffset)*m_matrixSize + ++s ] = CCell( v, e, cnt );
287 }
288
Score(const char * q,const char * s) const289 inline double CAligner::Score( const char * q, const char * s ) const
290 {
291 ++m_scoreCalls;
292 return m_score->Score( q, s );
293 }
294
295 /* ========================================================================
296 * There are two parameters which control how many values of matrix should
297 * be computed and how meny should be queried.
298 *
299 * Score of the single gap is smaller or equal to the score of multiple
300 * gaps. But if we have limit on xdropof, the limit may be still controlled
301 * by how many gaps of the longest width may be introduced.
302 * ======================================================================== */
303
CMaxGapLengths(double pm,const CScoreParam * sc,const CAlignParam * ap)304 inline CAligner::CMaxGapLengths::CMaxGapLengths( double pm, const CScoreParam * sc, const CAlignParam * ap )
305 {
306 double id = sc->GetIdentityScore();
307 double go = sc->GetGapBasePenalty();
308 double ge = sc->GetGapExtentionPenalty();
309 int maxDel = int( (pm - go)/ge );
310 int maxIns = int((pm + id - go)/(id + ge));
311 m_maxDelLen = max( 0, min( ap->GetMaxDeletionLength(), maxDel ) );
312 m_maxInsLen = max( 0, min( ap->GetMaxInsertionLength(), min( maxIns, maxDel ) ) ); // m_maxDelLen can't be longer then xd
313 }
314
CIndelLimits(double pm,const CScoreParam * sc,const CAlignParam * ap)315 inline CAligner::CIndelLimits::CIndelLimits( double pm, const CScoreParam * sc, const CAlignParam * ap ) : CMaxGapLengths( pm, sc, ap )
316 {
317 double id = sc->GetIdentityScore();
318 double go = sc->GetGapBasePenalty();
319 double ge = sc->GetGapExtentionPenalty();
320
321 int maxDelCnt = int( pm / (go + m_maxDelLen*ge) );
322 int maxInsCnt = int( pm / (go + m_maxInsLen*ge + id) );
323
324 int extraDelLen = max( 0, int( (pm - maxDelCnt*(go + m_maxDelLen*ge) - go)/ge ) );
325 int extraInsLen = max( 0, int( (pm - maxInsCnt*(go + m_maxInsLen*ge) - go - id)/(ge + id) ) );
326
327 m_delBand = min( ap->GetMaxDeletionsCount(), max( m_maxDelLen, maxDelCnt * m_maxDelLen + extraDelLen ) );
328 m_insBand = min( ap->GetMaxInsertionsCount(), max( m_maxInsLen, maxInsCnt * m_maxInsLen + extraInsLen ) );
329 }
330
operator <<(std::ostream & o,const CAligner::CCell & cell)331 inline std::ostream& operator << ( std::ostream& o, const CAligner::CCell& cell )
332 {
333 return o << "[" << cell.GetCount() << cell.GetCigar() << ";" << cell.GetScore() << "]";
334 }
335
336 END_OLIGOFAR_SCOPES
337
338 #endif
339