1 /* $Id: nw_band_aligner.cpp 198851 2010-07-29 15:16:34Z kiryutin $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Yuri Kapustin
27  *
28  * File Description:  CBandAligner implementation
29  *
30  * ===========================================================================
31  *
32  */
33 
34 
35 #include <ncbi_pch.hpp>
36 
37 #include "messages.hpp"
38 
39 #include <algo/align/nw/align_exception.hpp>
40 #include <algo/align/nw/nw_band_aligner.hpp>
41 
42 BEGIN_NCBI_SCOPE
43 
44 static const size_t kMax_size_t = numeric_limits<size_t>::max();
45 
CBandAligner(const char * seq1,size_t len1,const char * seq2,size_t len2,const SNCBIPackedScoreMatrix * scoremat,size_t band)46 CBandAligner::CBandAligner( const char* seq1, size_t len1,
47                             const char* seq2, size_t len2,
48                             const SNCBIPackedScoreMatrix* scoremat,
49                             size_t band):
50     CNWAligner(seq1, len1, seq2, len2, scoremat),
51     m_band(band),
52     m_Shift(0)
53 {
54 }
55 
56 
CBandAligner(const string & seq1,const string & seq2,const SNCBIPackedScoreMatrix * scoremat,size_t band)57 CBandAligner::CBandAligner(const string& seq1,
58                            const string& seq2,
59                            const SNCBIPackedScoreMatrix* scoremat,
60                            size_t band):
61     CNWAligner(seq1, seq2, scoremat),
62     m_band(band),
63     m_Shift(0)
64 {
65 }
66 
67 
SetShift(Uint1 where,size_t offset)68 void CBandAligner::SetShift(Uint1 where, size_t offset)
69 {
70     switch(where) {
71     case 0: m_Shift = offset;  break;
72     case 1: m_Shift = -(long)offset; break;
73     default:
74         NCBI_THROW(CAlgoAlignException, eBadParameter,
75                    "CBandAligner::SetShift(): Incorrect sequence index specified");
76     }
77 }
78 
79 
GetShift(void) const80 pair<Uint1,size_t> CBandAligner::GetShift(void) const
81 {
82     Uint1 where;
83     size_t offset;
84     if(m_Shift < 0) {
85         where = 1;
86         offset = size_t(-m_Shift);
87     }
88     else {
89         where = 0;
90         offset = size_t(m_Shift);
91     }
92     return pair<Uint1,size_t>(where, offset);
93 }
94 
95 
96 // evaluate score for each possible alignment;
97 // fill out backtrace matrix limited to the band
98 // bit coding (four bits per value): D E Ec Fc
99 // D:  1 if diagonal; 0 - otherwise
100 // E:  1 if space in 1st sequence; 0 if space in 2nd sequence
101 // Ec: 1 if gap in 1st sequence was extended; 0 if it is was opened
102 // Fc: 1 if gap in 2nd sequence was extended; 0 if it is was opened
103 //
104 
105 const Uint1 kMaskFc (0x01);
106 const Uint1 kMaskEc (0x02);
107 const Uint1 kMaskE  (0x04);
108 const Uint1 kMaskD  (0x08);
109 
x_Align(SAlignInOut * data)110 CNWAligner::TScore CBandAligner::x_Align(SAlignInOut* data)
111 {
112     x_CheckParameters(data);
113 
114     const size_t N1 (data->m_len1);
115     const size_t N2 (data->m_len2);
116     const size_t fullrow (2*m_band + 1);
117     vector<TScore> stl_rowV (N2), stl_rowF (N2);
118     TScore  * rowV (&stl_rowV.front());
119     TScore  * rowF (&stl_rowF.front());
120     TScore* pV (rowV - 1);
121     const char* seq1 (m_Seq1 + data->m_offset1);
122     const char* seq2 (m_Seq2 + data->m_offset2);
123     const TNCBIScore (*sm) [NCBI_FSM_DIM] = m_ScoreMatrix.s;
124 
125     m_terminate = false;
126 
127     if(m_prg_callback) {
128         m_prg_info.m_iter_total = N1*fullrow;
129         m_prg_info.m_iter_done = 0;
130         if( (m_terminate = m_prg_callback(&m_prg_info)) ) {
131             return 0;
132         }
133     }
134 
135     CBacktraceMatrix4 backtrace_matrix (N1*fullrow);
136 
137     TScore V (0);
138     TScore E (kInfMinus), V2 (kInfMinus), G, n0;
139     Uint1 tracer (0);
140 
141     const size_t ibeg ((m_Shift >= 0 && m_Shift > long(m_band))?
142                        (m_Shift - m_band): 0);
143 
144     const TScore wgleft1 (data->m_esf_L1? 0: m_Wg);
145     const TScore wsleft1 (data->m_esf_L1? 0: m_Ws);
146     const TScore wgleft2 (data->m_esf_L2? 0: m_Wg);
147     const TScore wsleft2 (data->m_esf_L2? 0: m_Ws);
148 
149     TScore wg1 (m_Wg), ws1 (m_Ws);
150     TScore V1 (wgleft2 + wsleft2 * ibeg);
151 
152     const long d1 (N2 + m_band + m_Shift);
153     const size_t iend (d1 <= 0? 0: (d1 < long(N1)? d1: N1));
154 
155     m_LastCoordSeq1 = m_LastCoordSeq2 = m_TermK = kMax_size_t;
156 
157     size_t jendcnt (0);
158     for(size_t i (ibeg); i < iend && !m_terminate; ++i) {
159 
160         TScore wg2 (m_Wg), ws2 (m_Ws);
161 
162         const long   d2   (i - m_Shift - m_band);
163         const size_t jbeg (d2 < 0? 0: d2);
164         const long   d3   (i - m_Shift + m_band + 1);
165         const size_t jend (d3 < long(N2)? d3: N2);
166         if(jend == N2) ++jendcnt;
167 
168         const Uint1 ci (seq1[i]);
169 
170         if(i == 0) {
171             V2 = wgleft1 + wsleft1 * jbeg;
172             TScore s (wgleft1);
173             for(size_t j (0); j < N2; ++j) {
174                 s += wsleft1;
175                 rowV[j] = s;
176             }
177         }
178 
179         if(i + 1 == N1 && data->m_esf_R1) {
180             wg1 = ws1 = 0;
181         }
182 
183         size_t j;
184         size_t k (fullrow * (i - ibeg));
185         const long jbeg0 (i - m_Shift - m_band);
186         if(jbeg0 < 0) {
187             k += jbeg - jbeg0;
188         }
189         if(size_t(k - 1) > m_TermK && (k & 1)) {
190             backtrace_matrix.SetAt(k - 1, 0);
191         }
192 
193         for(j = jbeg; j < jend; ++j) {
194 
195             if(j > 0) {
196                 G = sm[ci][Uint1(seq2[j])] + pV[j];
197                 pV[j] = V;
198             }
199             else {
200                 G = sm[ci][(Uint1)seq2[j]];
201                 if(i > 0) G += V1;
202             }
203 
204             if(j > jbeg) {
205                 n0 = V + wg1;
206                 if(E >= n0) {
207                     E += ws1;      // gap extension
208                     tracer = kMaskEc;
209                 }
210                 else {
211                     E = n0 + ws1;  // gap open
212                     tracer = 0;
213                 }
214             }
215             else if(j == 0 && i < m_Shift + m_band) {
216                 V1 += wsleft2;
217                 E = V1 + wg1 + ws1;
218                 tracer = 0;
219             }
220             else {
221                 E = kInfMinus;
222                 tracer = 0; // to fire during the backtrace
223             }
224 
225             if(j + 1 == N2 && data->m_esf_R2) {
226                 wg2 = ws2 = 0;
227             }
228 
229             if(i == 0) {
230                 if(j + 1 < jend) {
231                     V2 += wsleft1;
232                     rowF[j] = V2 + wg2 + ws2;
233                 }
234                 else {
235                     rowF[j] = kInfMinus;
236                 }
237             }
238             else if(j + 1 < jend || jendcnt > 1) {
239                 n0 = rowV[j] + wg2;
240                 if(rowF[j] >= n0) {
241                     rowF[j] += ws2;
242                     tracer |= kMaskFc;
243                 }
244                 else {
245                     rowF[j] = n0 + ws2;
246                 }
247             }
248             else {
249                 rowF[j] = kInfMinus;
250             }
251 
252             if (E >= rowF[j]) {
253                 if(E >= G) {
254                     V = E;
255                     tracer |= kMaskE;
256                 }
257                 else {
258                     V = G;
259                     tracer |= kMaskD;
260                 }
261             }
262             else {
263                 if(rowF[j] >= G) {
264                     V = rowF[j];
265                 }
266                 else {
267                     V = G;
268                     tracer |= kMaskD;
269                 }
270             }
271             backtrace_matrix.SetAt(k++,tracer);
272         }
273 
274         backtrace_matrix.Purge(k);
275 
276         pV[j] = V;
277 
278         m_TermK = k - 1;
279         m_LastCoordSeq2 = jend - 1;
280 
281         if(m_prg_callback) {
282             m_prg_info.m_iter_done = k;
283             m_terminate = m_prg_callback(&m_prg_info);
284         }
285     }
286 
287     m_LastCoordSeq1 = iend - 1;
288 
289     const bool uninit (m_TermK == kMax_size_t
290                        || m_LastCoordSeq1 == kMax_size_t
291                        || m_LastCoordSeq2 == kMax_size_t);
292 
293     if(uninit && !m_terminate) {
294         NCBI_THROW(CAlgoAlignException, eInternal, g_msg_UnexpectedTermIndex);
295     }
296 
297     if(!m_terminate) {
298         x_DoBackTrace(backtrace_matrix, data);
299     }
300 
301     return V;
302 }
303 
304 
x_CheckParameters(const SAlignInOut * data) const305 void CBandAligner::x_CheckParameters(const SAlignInOut* data) const
306 {
307     if(data->m_len1 < 2 || data->m_len2 < 2) {
308         NCBI_THROW(CAlgoAlignException, eBadParameter,
309                    "Input sequence interval too small.");
310     }
311 
312     if(m_Shift > 0 && m_Shift > long(data->m_len1 + m_band)) {
313         NCBI_THROW(CAlgoAlignException, eBadParameter,
314                    "Shift is greater than the first sequence's length.");
315     }
316 
317     if(m_Shift < 0 && -m_Shift > long(data->m_len2 + m_band)) {
318         NCBI_THROW(CAlgoAlignException, eBadParameter,
319                    "Shift is greater than the second sequence's length.");
320     }
321 
322     // Each of the following checks will verify if a corresponding end
323     // is within the band and, if it is not, whether the end-space free
324     // mode was specified for that end.
325     // When doing restrained alignment, the exception may be caused by improper
326     // adjustment of the band offset.
327 
328     string msg_head;
329 
330     // seq 1, left
331     if(m_Shift >= 0 && m_band < size_t(m_Shift) && !data->m_esf_L2) {
332         msg_head = "Left end of first sequence ";
333     }
334 
335     // seq 1,2, right
336     if(m_Shift + long(data->m_len2 + m_band) < long(data->m_len1) && !data->m_esf_R2)
337     {
338         msg_head = "Right end of first sequence ";
339     }
340     else if(long(data->m_len1 + m_band) - m_Shift < long(data->m_len2)
341             && !data->m_esf_R1)
342     {
343         msg_head = "Right end of second sequence ";
344     }
345 
346     // seq 2, left
347     if(m_Shift < 0 && m_band < size_t(-m_Shift) && !data->m_esf_L1) {
348         msg_head = "Left end of second sequence ";
349     }
350 
351     if(msg_head.size() > 0) {
352         const string msg_tail ("out of band and end-space free flag not set.");
353         const string msg = msg_head + msg_tail;
354         NCBI_THROW(CAlgoAlignException, eBadParameter, msg.c_str());
355     }
356 }
357 
358 
x_DoBackTrace(const CBacktraceMatrix4 & backtrace,CNWAligner::SAlignInOut * data)359 void CBandAligner::x_DoBackTrace(const CBacktraceMatrix4 & backtrace,
360                                  CNWAligner::SAlignInOut* data)
361 {
362     const size_t N1 (data->m_len1);
363     const size_t N2 (2*m_band + 1);
364 
365     data->m_transcript.clear();
366     data->m_transcript.reserve(N1 + N2);
367 
368     size_t k (m_TermK);
369 
370     size_t i1 (m_LastCoordSeq1);
371     size_t i2 (m_LastCoordSeq2);
372 
373     if(m_LastCoordSeq1 + 1 < data->m_len1 && data->m_esf_R2) {
374         data->m_transcript.insert(data->m_transcript.begin(),
375                                   data->m_len1 - m_LastCoordSeq1 - 1,
376 				  eTS_Delete);
377     }
378 
379     if(m_LastCoordSeq2 + 1 < data->m_len2 && data->m_esf_R1) {
380         data->m_transcript.insert(data->m_transcript.begin(),
381                                   data->m_len2 - m_LastCoordSeq2 - 1,
382 				  eTS_Insert);
383     }
384 
385     while (true) {
386 
387         const size_t kOverflow (kMax_size_t - 256), kMax (kMax_size_t);
388 
389         const bool invalid_backtrace_data (
390                                            ( (i1 > kOverflow && i1 != kMax) || (i2 > kOverflow && i2 != kMax) ) ||
391             ((size_t)abs(m_Shift - long(i1) + long(i2))) > m_band );
392 
393         if(invalid_backtrace_data) {
394             NCBI_THROW(CAlgoAlignException, eInternal, g_msg_InvalidBacktraceData);
395         }
396 
397         if(i1 == kMax) {
398             if(i2 == kMax) break;
399             do {
400                 data->m_transcript.push_back(eTS_Insert);
401                 --i2;
402             } while (i2 != kMax);
403             break;
404         }
405 
406         if(i2 == kMax) {
407             do {
408                 data->m_transcript.push_back(eTS_Delete);
409                 --i1;
410             } while (i1 != kMax);
411             break;
412         }
413 
414         Uint1 Key (backtrace[k]);
415 
416         if (Key & kMaskD) {
417             data->m_transcript.push_back(
418                x_GetDiagTS(data->m_offset1 + i1--, data->m_offset2 + i2--));
419             k -= N2;
420         }
421         else if (Key & kMaskE) {
422             data->m_transcript.push_back(eTS_Insert);
423             --k;
424             --i2;
425             while(i2 != kMax && (Key & kMaskEc)) {
426                 data->m_transcript.push_back(eTS_Insert);
427                 Key = backtrace[k--];
428                 --i2;
429             }
430         }
431         else {
432             data->m_transcript.push_back(eTS_Delete);
433             k -= (N2-1);
434             --i1;
435             while(i1 != kMax && (Key & kMaskFc)) {
436                 data->m_transcript.push_back(eTS_Delete);
437                 Key = backtrace[k];
438                 k -= (N2-1);
439                 --i1;
440             }
441         }
442     }
443 }
444 
445 
x_CheckMemoryLimit()446 bool CBandAligner::x_CheckMemoryLimit()
447 {
448     const size_t elem_size (GetElemSize());
449     const size_t gdim (m_guides.size());
450 
451     if(gdim) {
452 
453         size_t dim1 = m_guides[0], dim2 = m_guides[2];
454         double mem = double(max(dim1, dim2))*m_band*elem_size;
455         if(mem >= m_MaxMem) {
456             return false;
457         }
458         for(size_t i = 4; i < gdim; i += 4) {
459             dim1 = m_guides[i] - m_guides[i-3] + 1;
460             dim2 = m_guides[i + 2] - m_guides[i-1] + 1;
461             mem = double(max(dim1, dim2))*m_band*elem_size;
462             if(mem >= m_MaxMem) {
463                 return false;
464             }
465         }
466         dim1 = m_SeqLen1 - m_guides[gdim-3];
467         dim2 = m_SeqLen2 - m_guides[gdim-1];
468         mem = double(max(dim1, dim2))*m_band*elem_size;
469         if(mem >= m_MaxMem) {
470             return false;
471         }
472 
473         return true;
474     }
475     else {
476 
477         size_t max_len = max(m_SeqLen1, m_SeqLen2);
478         double mem = double(max_len)*m_band*elem_size;
479         return mem < m_MaxMem;
480     }
481 }
482 
483 
484 END_NCBI_SCOPE
485