1 #ifndef OLIGOFAR__CALIGNER__HPP
2 #define OLIGOFAR__CALIGNER__HPP
3 
4 #include "iscore.hpp"
5 #include "ialigner.hpp"
6 #include "calignparam.hpp"
7 #include "cintron.hpp"
8 #include "ctranscript.hpp"
9 
10 #include <iostream>
11 
12 BEGIN_OLIGOFAR_SCOPES
13 
14 class IScore;
15 class CScoreParam;
16 class CScoringFactory;
17 class CAligner : public IAligner, public CTrBase
18 {
19 public:
CAligner(CScoringFactory * sf)20     CAligner( CScoringFactory * sf ) :
21         m_scoringFactory( sf ),
22         m_scoreParam( 0 ),
23         m_alignParam( 0 ),
24         m_score( 0 ),
25         m_extentionPenaltyDropoff( -1000000 ),
26         m_matrixSize( 0 ),
27         m_matrixOffset( -1 ),
28         m_queryBases( 0 ),
29         m_queryBasesChecked( 0 ),
30         m_scoreCalls( 0 ),
31         m_totalCalls( 0 ),
32         m_successAligns( 0 ),
33         m_algoX0all( 0 ),
34         m_algoX0end( 0 ),
35         m_algoX0win( 0 ),
36         m_algoSWend( 0 ),
37         m_algoSWwin( 0 ),
38         m_wordDistance( 0 )
39         {}
40 
41     ~CAligner();
42 
SetWordDistance(int k)43     virtual void SetWordDistance( int k ) { m_wordDistance = k; }
44     virtual void SetQueryCoding( CSeqCoding::ECoding );
45     virtual void SetSubjectCoding( CSeqCoding::ECoding );
46     virtual void SetQueryStrand( CSeqCoding::EStrand );
47     virtual void SetSubjectStrand( CSeqCoding::EStrand );
48 
49     virtual void SetQuery( const char * begin, int len );
50     virtual void SetSubject( const char * begin, int len );
51     virtual void SetSubjectAnchor( int first, int last ); // last included
52     virtual void SetQueryAnchor( int first, int last ); // last included
53 
54     virtual void SetSubjectGuideTranscript( const TTranscript& tr );
55 
56     virtual void SetPenaltyLimit( double limit );
57     virtual void SetExtentionPenaltyDropoff( double limit );
58 
59     virtual bool Align();
60 
GetPenalty() const61     virtual double GetPenalty() const { return m_penalty; }
GetSubjectFrom() const62     virtual int GetSubjectFrom() const { return m_posS1; }
GetSubjectTo() const63     virtual int GetSubjectTo() const { return m_posS2; }
GetQueryFrom() const64     virtual int GetQueryFrom() const { return m_posQ1; }
GetQueryTo() const65     virtual int GetQueryTo() const { return m_posQ2; }
66 
GetSubjectTranscript() const67     virtual const TTranscript& GetSubjectTranscript() const { return m_transcript; }
68 
GetScoringFactory() const69     virtual CScoringFactory * GetScoringFactory() const { return m_scoringFactory; }
70 
GetSpliceSet() const71     const CAlignParam::TSpliceSet& GetSpliceSet() const { return m_alignParam->GetSpliceSet(); }
GetSpliceCount() const72     int   GetSpliceCount() const { return GetSpliceSet().size(); }
NoSplicesDefined() const73     bool  NoSplicesDefined() const { return GetSpliceSet().empty(); }
74 
75     class CCell : public TTrItem
76     {
77     public:
CCell(double score=0,EEvent ev=eEvent_NULL,unsigned cnt=1)78         CCell( double score = 0, EEvent ev = eEvent_NULL, unsigned cnt = 1 ) :
79             TTrItem( ev, cnt ), m_score( score ) {}
GetScore() const80         double GetScore() const { return m_score; }
81     protected:
82         double m_score;
83     };
84     void PrintStatistics( ostream& out = cerr );
85 
86 protected:
87 
88     class CMaxGapLengths
89     {
90     public:
91         CMaxGapLengths( double maxpenalty, const CScoreParam * sc, const CAlignParam * ap );
AreIndelsAllowed() const92         bool AreIndelsAllowed() const { return m_maxInsLen|m_maxDelLen; }
GetMaxDelLength() const93         int GetMaxDelLength() const { return m_maxDelLen; }
GetMaxInsLength() const94         int GetMaxInsLength() const { return m_maxInsLen; }
95     protected:
96         int m_maxDelLen;
97         int m_maxInsLen;
98     };
99 
100     class CIndelLimits : public CMaxGapLengths
101     {
102     public:
103         CIndelLimits( double pm, const CScoreParam * sc, const CAlignParam * ap );
AreIndelsAllowed() const104         bool AreIndelsAllowed() const { return m_delBand|m_insBand; }
GetDelBand() const105         int GetDelBand() const { return m_delBand; }
GetInsBand() const106         int GetInsBand() const { return m_insBand; }
107     protected:
108         int m_delBand;
109         int m_insBand;
110     };
111 
112     class CDiagonalScores
113     {
114     public:
CDiagonalScores(double id)115         CDiagonalScores( double id ) : m_aq( 0 ), m_wq( 0 ), m_vq( 0 ), m_id( id ) {}
Add(double score)116         CDiagonalScores& Add( double score ) { m_aq += m_id; m_wq += (m_vq = score); return *this; }
GetLastScore() const117         double GetLastScore() const { return m_vq; }
GetLastPenalty() const118         double GetLastPenalty() const { return m_id - m_vq; }
GetAccumulatedBestScore() const119         double GetAccumulatedBestScore() const { return m_aq; }
GetAccumulatedPenalty() const120         double GetAccumulatedPenalty() const { return m_aq - m_wq; }
GetAccumulatedScore() const121         double GetAccumulatedScore() const { return m_wq; }
ExceedsPenalty(double abslimit) const122         bool ExceedsPenalty( double abslimit ) const { return GetAccumulatedPenalty() > abslimit; }
LastIsMatch() const123         bool LastIsMatch() const { return m_vq > 0; }
GetLastEvent() const124         EEvent GetLastEvent() const { return LastIsMatch() ? eEvent_Match : eEvent_Replaced; }
Print(ostream & o) const125         ostream& Print( ostream& o ) const {
126             return o << "CDiagonalScores"
127                 << DISPLAY( m_aq ) << DISPLAY( m_wq ) << DISPLAY( m_vq ) << DISPLAY( m_aq - m_wq ) << DISPLAY( GetLastEvent() );
128         }
129     protected:
130         double m_aq;
131         double m_wq;
132         double m_vq;
133         const double m_id;
134     };
135 
136     enum EAlignModeFlags {
137         eAlign_startAfterSplice = 0x01,
138         eAlign_endOnSplice = 0x02
139     };
140 
141     void MakeLocalSpliceSet( CIntron::TSet&, int qdir, int qpos ) const;
142     double Score( const char * q, const char * s ) const;
143 
144     // AlignWindow() may adjust alignment positions - which is useful especially in the case of indel of
145     // length 2 the window boundary (otherwise neither AlignWindow nor AlignDiag then can see the all indel,
146     // which leads to double accounting for gap base score)
147     bool AlignWindow( const char * & q, const char * & Q, const char * & s, const char * & S, TTranscript& t );
148     bool AlignWinTwoDiag( const char * & q, const char * & Q, const char * & s, const char * & S, int splice, int smin, int smax, TTranscript& t );
149     bool AlignDiag( const char * q, const char * Q, const char * s, const char * S, bool reverseStrand );
150     bool AlignTail( int qlen, int qdir, const char * q, const char * s, int& qe, int& se, TTranscript& t, int qpos );
151     bool AlignTailDiag( int qlen, int qdir, double pmax, const char * qseq, const char * sseq, int& ge, int& se, TTranscript& t, int qpos );
152 
153     void AdjustMatrixSize();
154     void AdjustWindowBoundary( const char * & q, const char * & Q, const char * & s, const char * & S, TTranscript& tw );
155     const CCell& GetMatrix( int q, int s ) const;
156     const CCell& SetMatrix( int q, int s, double v, CCell::EEvent e, int c = 1 );
157 
158     template <typename Iterator>
AppendTranscript(Iterator b,Iterator e)159     void  AppendTranscript( Iterator b, Iterator e ) { while( b != e ) AppendTranscriptItem( *b++ ); }
160 
161     void  AppendTranscriptItem( const TTrItem& u ); // { ASSERT( u.GetEvent() != eEvent_NULL ); m_transcript.AppendItem( u ); }
AppendTranscriptItem(EEvent ev,unsigned cnt=1)162     void  AppendTranscriptItem( EEvent ev, unsigned cnt = 1 ) { AppendTranscriptItem( TTrItem( ev, cnt ) ); }
163 
PrintQ(ostream & o,const char * q,const char * Q) const164     ostream& PrintQ( ostream& o, const char * q, const char * Q ) const {
165         for( Q += m_qryInc ; q != Q ; q += m_qryInc ) o << CIupacnaBase( CNcbi8naBase( *q ) );
166         return o;
167     }
PrintS(ostream & o,const char * s,const char * S) const168     ostream& PrintS( ostream& o, const char * s, const char * S ) const {
169         for( S += m_sbjInc ; s != S ; s += m_sbjInc ) o << CIupacnaBase( CNcbi8naBase( *s ) );
170         return o;
171     }
172 
173     void x_ColorspaceUpdatePenalty();
174 
175 protected:
176     CScoringFactory * m_scoringFactory;
177     const CScoreParam * m_scoreParam;
178     const CAlignParam * m_alignParam;
179     IScore * m_score;
180 
181     CSeqCoding::ECoding m_qryCoding;
182     CSeqCoding::ECoding m_sbjCoding;
183     CSeqCoding::EStrand m_qryStrand;
184     CSeqCoding::EStrand m_sbjStrand;
185     int m_anchorQ1;
186     int m_anchorQ2;
187     int m_anchorS1;
188     int m_anchorS2;
189     int m_qryInc;
190     int m_sbjInc;
191     const char * m_qry;
192     const char * m_sbj;
193     int m_qryLength;
194     int m_sbjLength;
195     double m_extentionPenaltyDropoff;
196     double m_penaltyLimit;
197     double m_penalty;
198 
199     int m_posQ1;
200     int m_posQ2;
201     int m_posS1;
202     int m_posS2;
203 
204     vector<CCell> m_matrix;
205     int m_matrixSize;
206     int m_matrixOffset;
207 
208     Uint8 m_queryBases;
209     Uint8 m_queryBasesChecked;
210     mutable Uint8 m_scoreCalls;
211     Uint8 m_totalCalls;
212     Uint8 m_successAligns;
213     Uint4 m_algoX0all;
214     Uint4 m_algoX0end;
215     Uint4 m_algoX0win;
216     Uint4 m_algoSWend;
217     Uint4 m_algoSWwin;
218 
219     int m_wordDistance;
220 
221     TTranscript m_transcript;
222     TTranscript m_guideTranscript;
223     static map<double,Uint8> CAligner::s_maxPenalty;
224 };
225 
AppendTranscriptItem(const TTrItem & u)226 inline void CAligner::AppendTranscriptItem( const TTrItem& u )
227 {
228     ASSERT( u.GetEvent() != eEvent_NULL );
229     if( m_transcript.size() ) {
230         if( m_transcript.back().GetEvent() == eEvent_SoftMask && u.GetEvent() == eEvent_Insertion ) {
231             m_penalty += m_scoreParam->GetGapBasePenalty() + ( m_scoreParam->GetGapExtentionPenalty() + m_scoreParam->GetIdentityScore() - m_scoreParam->GetMismatchPenalty() ) * u.GetCount();
232             m_transcript.AppendItem( TTrItem( eEvent_SoftMask, u.GetCount() ) );
233             return;
234         } else if( u.GetEvent() == eEvent_SoftMask && m_transcript.back().GetEvent() == eEvent_Insertion ) {
235             m_penalty += m_scoreParam->GetGapBasePenalty() + ( m_scoreParam->GetGapExtentionPenalty() + m_scoreParam->GetIdentityScore() - m_scoreParam->GetMismatchPenalty() ) * m_transcript.back().GetCount();
236             m_transcript.back() = TTrItem( eEvent_SoftMask, m_transcript.back().GetCount() );
237         }
238     }
239     m_transcript.AppendItem( u );
240 }
241 
242 
SetPenaltyLimit(double l)243 inline void CAligner::SetPenaltyLimit( double l ) { m_penaltyLimit = l; }
SetExtentionPenaltyDropoff(double x)244 inline void CAligner::SetExtentionPenaltyDropoff( double x ) { m_extentionPenaltyDropoff = abs( x ); }
SetQueryStrand(CSeqCoding::EStrand s)245 inline void CAligner::SetQueryStrand( CSeqCoding::EStrand s ) { m_qryStrand = s; }
SetQueryCoding(CSeqCoding::ECoding c)246 inline void CAligner::SetQueryCoding( CSeqCoding::ECoding c ) { m_qryCoding = c; }
SetSubjectStrand(CSeqCoding::EStrand s)247 inline void CAligner::SetSubjectStrand( CSeqCoding::EStrand s ) { m_sbjStrand = s; }
SetSubjectCoding(CSeqCoding::ECoding c)248 inline void CAligner::SetSubjectCoding( CSeqCoding::ECoding c ) { m_sbjCoding = c; }
SetQuery(const char * b,int l)249 inline void CAligner::SetQuery( const char * b, int l ) { m_qry = b; m_qryLength = l; }
SetQueryAnchor(int f,int l)250 inline void CAligner::SetQueryAnchor( int f, int l ) { m_anchorQ1 = f; m_anchorQ2 = l; m_guideTranscript.clear(); }
SetSubject(const char * b,int l)251 inline void CAligner::SetSubject( const char * b, int l ) { m_sbj = b; m_sbjLength = l; }
SetSubjectAnchor(int f,int l)252 inline void CAligner::SetSubjectAnchor( int f, int l ) { m_anchorS1 = f; m_anchorS2 = l; m_guideTranscript.clear(); }
SetSubjectGuideTranscript(const TTranscript & tr)253 inline void CAligner::SetSubjectGuideTranscript( const TTranscript& tr )
254 {
255     pair<int,int> lengths = tr.ComputeLengths();
256     if( lengths.second == abs(m_anchorS2 - m_anchorS1 + 1) && lengths.first == abs(m_anchorS2 - m_anchorQ1 + 1) ) {
257         // clip masked regions
258         ITERATE( TTranscript, t, tr ) {
259             switch( t->GetEvent() ) {
260                 case eEvent_SoftMask:
261                 case eEvent_HardMask:
262                 case eEvent_NULL: break;
263                 default: m_guideTranscript.AppendItem( *t ); break;
264             }
265         }
266     }
267 }
268 
GetMatrix(int q,int s) const269 inline const CAligner::CCell& CAligner::GetMatrix( int q, int s ) const
270 {
271     return m_matrix[ (q - m_matrixOffset)*m_matrixSize + ++s ];
272 }
273 
SetMatrix(int q,int s,double v,CCell::EEvent e,int cnt)274 inline const CAligner::CCell& CAligner::SetMatrix( int q, int s, double v, CCell::EEvent e, int cnt )
275 {
276 //#define OLIGOFAR_ALIGN_CHECK_MATRIX_BOUNDARIES 0
277 #if !defined(OLIGOFAR_ALIGN_CHECK_MATRIX_BOUNDARIES)
278 #define OLIGOFAR_ALIGN_CHECK_MATRIX_BOUNDARIES 0
279 #endif
280 #if OLIGOFAR_ALIGN_CHECK_MATRIX_BOUNDARIES
281     ASSERT( q >= -1 );
282     ASSERT( s >= -1 );
283     ASSERT( q < m_matrixSize - 1 );
284     ASSERT( s < m_matrixSize - 1 );
285 #endif
286     return m_matrix[ (q - m_matrixOffset)*m_matrixSize + ++s ] = CCell( v, e, cnt );
287 }
288 
Score(const char * q,const char * s) const289 inline double CAligner::Score( const char * q, const char * s ) const
290 {
291     ++m_scoreCalls;
292     return m_score->Score( q, s );
293 }
294 
295 /* ========================================================================
296  * There are two parameters which control how many values of matrix should
297  * be computed and how meny should be queried.
298  *
299  * Score of the single gap is smaller or equal to the score of multiple
300  * gaps. But if we have limit on xdropof, the limit may be still controlled
301  * by how many gaps of the longest width may be introduced.
302  * ======================================================================== */
303 
CMaxGapLengths(double pm,const CScoreParam * sc,const CAlignParam * ap)304 inline CAligner::CMaxGapLengths::CMaxGapLengths( double pm, const CScoreParam * sc, const CAlignParam * ap )
305 {
306     double id = sc->GetIdentityScore();
307     double go = sc->GetGapBasePenalty();
308     double ge = sc->GetGapExtentionPenalty();
309     int maxDel = int( (pm - go)/ge );
310     int maxIns = int((pm + id - go)/(id + ge));
311     m_maxDelLen = max( 0, min( ap->GetMaxDeletionLength(), maxDel ) );
312     m_maxInsLen = max( 0, min( ap->GetMaxInsertionLength(), min( maxIns, maxDel ) ) ); // m_maxDelLen can't be longer then xd
313 }
314 
CIndelLimits(double pm,const CScoreParam * sc,const CAlignParam * ap)315 inline CAligner::CIndelLimits::CIndelLimits( double pm, const CScoreParam * sc, const CAlignParam * ap ) : CMaxGapLengths( pm, sc, ap )
316 {
317     double id = sc->GetIdentityScore();
318     double go = sc->GetGapBasePenalty();
319     double ge = sc->GetGapExtentionPenalty();
320 
321     int maxDelCnt = int( pm / (go + m_maxDelLen*ge) );
322     int maxInsCnt = int( pm / (go + m_maxInsLen*ge + id) );
323 
324     int extraDelLen = max( 0, int( (pm - maxDelCnt*(go + m_maxDelLen*ge) - go)/ge ) );
325     int extraInsLen = max( 0, int( (pm - maxInsCnt*(go + m_maxInsLen*ge) - go - id)/(ge + id) ) );
326 
327     m_delBand = min( ap->GetMaxDeletionsCount(), max( m_maxDelLen, maxDelCnt * m_maxDelLen + extraDelLen ) );
328     m_insBand = min( ap->GetMaxInsertionsCount(), max( m_maxInsLen, maxInsCnt * m_maxInsLen + extraInsLen ) );
329 }
330 
operator <<(std::ostream & o,const CAligner::CCell & cell)331 inline std::ostream& operator << ( std::ostream& o, const CAligner::CCell& cell )
332 {
333     return o << "[" << cell.GetCount() << cell.GetCigar() << ";" << cell.GetScore() << "]";
334 }
335 
336 END_OLIGOFAR_SCOPES
337 
338 #endif
339