1 /* $Id: nw_band_aligner.cpp 198851 2010-07-29 15:16:34Z kiryutin $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Yuri Kapustin
27 *
28 * File Description: CBandAligner implementation
29 *
30 * ===========================================================================
31 *
32 */
33
34
35 #include <ncbi_pch.hpp>
36
37 #include "messages.hpp"
38
39 #include <algo/align/nw/align_exception.hpp>
40 #include <algo/align/nw/nw_band_aligner.hpp>
41
42 BEGIN_NCBI_SCOPE
43
44 static const size_t kMax_size_t = numeric_limits<size_t>::max();
45
CBandAligner(const char * seq1,size_t len1,const char * seq2,size_t len2,const SNCBIPackedScoreMatrix * scoremat,size_t band)46 CBandAligner::CBandAligner( const char* seq1, size_t len1,
47 const char* seq2, size_t len2,
48 const SNCBIPackedScoreMatrix* scoremat,
49 size_t band):
50 CNWAligner(seq1, len1, seq2, len2, scoremat),
51 m_band(band),
52 m_Shift(0)
53 {
54 }
55
56
CBandAligner(const string & seq1,const string & seq2,const SNCBIPackedScoreMatrix * scoremat,size_t band)57 CBandAligner::CBandAligner(const string& seq1,
58 const string& seq2,
59 const SNCBIPackedScoreMatrix* scoremat,
60 size_t band):
61 CNWAligner(seq1, seq2, scoremat),
62 m_band(band),
63 m_Shift(0)
64 {
65 }
66
67
SetShift(Uint1 where,size_t offset)68 void CBandAligner::SetShift(Uint1 where, size_t offset)
69 {
70 switch(where) {
71 case 0: m_Shift = offset; break;
72 case 1: m_Shift = -(long)offset; break;
73 default:
74 NCBI_THROW(CAlgoAlignException, eBadParameter,
75 "CBandAligner::SetShift(): Incorrect sequence index specified");
76 }
77 }
78
79
GetShift(void) const80 pair<Uint1,size_t> CBandAligner::GetShift(void) const
81 {
82 Uint1 where;
83 size_t offset;
84 if(m_Shift < 0) {
85 where = 1;
86 offset = size_t(-m_Shift);
87 }
88 else {
89 where = 0;
90 offset = size_t(m_Shift);
91 }
92 return pair<Uint1,size_t>(where, offset);
93 }
94
95
96 // evaluate score for each possible alignment;
97 // fill out backtrace matrix limited to the band
98 // bit coding (four bits per value): D E Ec Fc
99 // D: 1 if diagonal; 0 - otherwise
100 // E: 1 if space in 1st sequence; 0 if space in 2nd sequence
101 // Ec: 1 if gap in 1st sequence was extended; 0 if it is was opened
102 // Fc: 1 if gap in 2nd sequence was extended; 0 if it is was opened
103 //
104
105 const Uint1 kMaskFc (0x01);
106 const Uint1 kMaskEc (0x02);
107 const Uint1 kMaskE (0x04);
108 const Uint1 kMaskD (0x08);
109
x_Align(SAlignInOut * data)110 CNWAligner::TScore CBandAligner::x_Align(SAlignInOut* data)
111 {
112 x_CheckParameters(data);
113
114 const size_t N1 (data->m_len1);
115 const size_t N2 (data->m_len2);
116 const size_t fullrow (2*m_band + 1);
117 vector<TScore> stl_rowV (N2), stl_rowF (N2);
118 TScore * rowV (&stl_rowV.front());
119 TScore * rowF (&stl_rowF.front());
120 TScore* pV (rowV - 1);
121 const char* seq1 (m_Seq1 + data->m_offset1);
122 const char* seq2 (m_Seq2 + data->m_offset2);
123 const TNCBIScore (*sm) [NCBI_FSM_DIM] = m_ScoreMatrix.s;
124
125 m_terminate = false;
126
127 if(m_prg_callback) {
128 m_prg_info.m_iter_total = N1*fullrow;
129 m_prg_info.m_iter_done = 0;
130 if( (m_terminate = m_prg_callback(&m_prg_info)) ) {
131 return 0;
132 }
133 }
134
135 CBacktraceMatrix4 backtrace_matrix (N1*fullrow);
136
137 TScore V (0);
138 TScore E (kInfMinus), V2 (kInfMinus), G, n0;
139 Uint1 tracer (0);
140
141 const size_t ibeg ((m_Shift >= 0 && m_Shift > long(m_band))?
142 (m_Shift - m_band): 0);
143
144 const TScore wgleft1 (data->m_esf_L1? 0: m_Wg);
145 const TScore wsleft1 (data->m_esf_L1? 0: m_Ws);
146 const TScore wgleft2 (data->m_esf_L2? 0: m_Wg);
147 const TScore wsleft2 (data->m_esf_L2? 0: m_Ws);
148
149 TScore wg1 (m_Wg), ws1 (m_Ws);
150 TScore V1 (wgleft2 + wsleft2 * ibeg);
151
152 const long d1 (N2 + m_band + m_Shift);
153 const size_t iend (d1 <= 0? 0: (d1 < long(N1)? d1: N1));
154
155 m_LastCoordSeq1 = m_LastCoordSeq2 = m_TermK = kMax_size_t;
156
157 size_t jendcnt (0);
158 for(size_t i (ibeg); i < iend && !m_terminate; ++i) {
159
160 TScore wg2 (m_Wg), ws2 (m_Ws);
161
162 const long d2 (i - m_Shift - m_band);
163 const size_t jbeg (d2 < 0? 0: d2);
164 const long d3 (i - m_Shift + m_band + 1);
165 const size_t jend (d3 < long(N2)? d3: N2);
166 if(jend == N2) ++jendcnt;
167
168 const Uint1 ci (seq1[i]);
169
170 if(i == 0) {
171 V2 = wgleft1 + wsleft1 * jbeg;
172 TScore s (wgleft1);
173 for(size_t j (0); j < N2; ++j) {
174 s += wsleft1;
175 rowV[j] = s;
176 }
177 }
178
179 if(i + 1 == N1 && data->m_esf_R1) {
180 wg1 = ws1 = 0;
181 }
182
183 size_t j;
184 size_t k (fullrow * (i - ibeg));
185 const long jbeg0 (i - m_Shift - m_band);
186 if(jbeg0 < 0) {
187 k += jbeg - jbeg0;
188 }
189 if(size_t(k - 1) > m_TermK && (k & 1)) {
190 backtrace_matrix.SetAt(k - 1, 0);
191 }
192
193 for(j = jbeg; j < jend; ++j) {
194
195 if(j > 0) {
196 G = sm[ci][Uint1(seq2[j])] + pV[j];
197 pV[j] = V;
198 }
199 else {
200 G = sm[ci][(Uint1)seq2[j]];
201 if(i > 0) G += V1;
202 }
203
204 if(j > jbeg) {
205 n0 = V + wg1;
206 if(E >= n0) {
207 E += ws1; // gap extension
208 tracer = kMaskEc;
209 }
210 else {
211 E = n0 + ws1; // gap open
212 tracer = 0;
213 }
214 }
215 else if(j == 0 && i < m_Shift + m_band) {
216 V1 += wsleft2;
217 E = V1 + wg1 + ws1;
218 tracer = 0;
219 }
220 else {
221 E = kInfMinus;
222 tracer = 0; // to fire during the backtrace
223 }
224
225 if(j + 1 == N2 && data->m_esf_R2) {
226 wg2 = ws2 = 0;
227 }
228
229 if(i == 0) {
230 if(j + 1 < jend) {
231 V2 += wsleft1;
232 rowF[j] = V2 + wg2 + ws2;
233 }
234 else {
235 rowF[j] = kInfMinus;
236 }
237 }
238 else if(j + 1 < jend || jendcnt > 1) {
239 n0 = rowV[j] + wg2;
240 if(rowF[j] >= n0) {
241 rowF[j] += ws2;
242 tracer |= kMaskFc;
243 }
244 else {
245 rowF[j] = n0 + ws2;
246 }
247 }
248 else {
249 rowF[j] = kInfMinus;
250 }
251
252 if (E >= rowF[j]) {
253 if(E >= G) {
254 V = E;
255 tracer |= kMaskE;
256 }
257 else {
258 V = G;
259 tracer |= kMaskD;
260 }
261 }
262 else {
263 if(rowF[j] >= G) {
264 V = rowF[j];
265 }
266 else {
267 V = G;
268 tracer |= kMaskD;
269 }
270 }
271 backtrace_matrix.SetAt(k++,tracer);
272 }
273
274 backtrace_matrix.Purge(k);
275
276 pV[j] = V;
277
278 m_TermK = k - 1;
279 m_LastCoordSeq2 = jend - 1;
280
281 if(m_prg_callback) {
282 m_prg_info.m_iter_done = k;
283 m_terminate = m_prg_callback(&m_prg_info);
284 }
285 }
286
287 m_LastCoordSeq1 = iend - 1;
288
289 const bool uninit (m_TermK == kMax_size_t
290 || m_LastCoordSeq1 == kMax_size_t
291 || m_LastCoordSeq2 == kMax_size_t);
292
293 if(uninit && !m_terminate) {
294 NCBI_THROW(CAlgoAlignException, eInternal, g_msg_UnexpectedTermIndex);
295 }
296
297 if(!m_terminate) {
298 x_DoBackTrace(backtrace_matrix, data);
299 }
300
301 return V;
302 }
303
304
x_CheckParameters(const SAlignInOut * data) const305 void CBandAligner::x_CheckParameters(const SAlignInOut* data) const
306 {
307 if(data->m_len1 < 2 || data->m_len2 < 2) {
308 NCBI_THROW(CAlgoAlignException, eBadParameter,
309 "Input sequence interval too small.");
310 }
311
312 if(m_Shift > 0 && m_Shift > long(data->m_len1 + m_band)) {
313 NCBI_THROW(CAlgoAlignException, eBadParameter,
314 "Shift is greater than the first sequence's length.");
315 }
316
317 if(m_Shift < 0 && -m_Shift > long(data->m_len2 + m_band)) {
318 NCBI_THROW(CAlgoAlignException, eBadParameter,
319 "Shift is greater than the second sequence's length.");
320 }
321
322 // Each of the following checks will verify if a corresponding end
323 // is within the band and, if it is not, whether the end-space free
324 // mode was specified for that end.
325 // When doing restrained alignment, the exception may be caused by improper
326 // adjustment of the band offset.
327
328 string msg_head;
329
330 // seq 1, left
331 if(m_Shift >= 0 && m_band < size_t(m_Shift) && !data->m_esf_L2) {
332 msg_head = "Left end of first sequence ";
333 }
334
335 // seq 1,2, right
336 if(m_Shift + long(data->m_len2 + m_band) < long(data->m_len1) && !data->m_esf_R2)
337 {
338 msg_head = "Right end of first sequence ";
339 }
340 else if(long(data->m_len1 + m_band) - m_Shift < long(data->m_len2)
341 && !data->m_esf_R1)
342 {
343 msg_head = "Right end of second sequence ";
344 }
345
346 // seq 2, left
347 if(m_Shift < 0 && m_band < size_t(-m_Shift) && !data->m_esf_L1) {
348 msg_head = "Left end of second sequence ";
349 }
350
351 if(msg_head.size() > 0) {
352 const string msg_tail ("out of band and end-space free flag not set.");
353 const string msg = msg_head + msg_tail;
354 NCBI_THROW(CAlgoAlignException, eBadParameter, msg.c_str());
355 }
356 }
357
358
x_DoBackTrace(const CBacktraceMatrix4 & backtrace,CNWAligner::SAlignInOut * data)359 void CBandAligner::x_DoBackTrace(const CBacktraceMatrix4 & backtrace,
360 CNWAligner::SAlignInOut* data)
361 {
362 const size_t N1 (data->m_len1);
363 const size_t N2 (2*m_band + 1);
364
365 data->m_transcript.clear();
366 data->m_transcript.reserve(N1 + N2);
367
368 size_t k (m_TermK);
369
370 size_t i1 (m_LastCoordSeq1);
371 size_t i2 (m_LastCoordSeq2);
372
373 if(m_LastCoordSeq1 + 1 < data->m_len1 && data->m_esf_R2) {
374 data->m_transcript.insert(data->m_transcript.begin(),
375 data->m_len1 - m_LastCoordSeq1 - 1,
376 eTS_Delete);
377 }
378
379 if(m_LastCoordSeq2 + 1 < data->m_len2 && data->m_esf_R1) {
380 data->m_transcript.insert(data->m_transcript.begin(),
381 data->m_len2 - m_LastCoordSeq2 - 1,
382 eTS_Insert);
383 }
384
385 while (true) {
386
387 const size_t kOverflow (kMax_size_t - 256), kMax (kMax_size_t);
388
389 const bool invalid_backtrace_data (
390 ( (i1 > kOverflow && i1 != kMax) || (i2 > kOverflow && i2 != kMax) ) ||
391 ((size_t)abs(m_Shift - long(i1) + long(i2))) > m_band );
392
393 if(invalid_backtrace_data) {
394 NCBI_THROW(CAlgoAlignException, eInternal, g_msg_InvalidBacktraceData);
395 }
396
397 if(i1 == kMax) {
398 if(i2 == kMax) break;
399 do {
400 data->m_transcript.push_back(eTS_Insert);
401 --i2;
402 } while (i2 != kMax);
403 break;
404 }
405
406 if(i2 == kMax) {
407 do {
408 data->m_transcript.push_back(eTS_Delete);
409 --i1;
410 } while (i1 != kMax);
411 break;
412 }
413
414 Uint1 Key (backtrace[k]);
415
416 if (Key & kMaskD) {
417 data->m_transcript.push_back(
418 x_GetDiagTS(data->m_offset1 + i1--, data->m_offset2 + i2--));
419 k -= N2;
420 }
421 else if (Key & kMaskE) {
422 data->m_transcript.push_back(eTS_Insert);
423 --k;
424 --i2;
425 while(i2 != kMax && (Key & kMaskEc)) {
426 data->m_transcript.push_back(eTS_Insert);
427 Key = backtrace[k--];
428 --i2;
429 }
430 }
431 else {
432 data->m_transcript.push_back(eTS_Delete);
433 k -= (N2-1);
434 --i1;
435 while(i1 != kMax && (Key & kMaskFc)) {
436 data->m_transcript.push_back(eTS_Delete);
437 Key = backtrace[k];
438 k -= (N2-1);
439 --i1;
440 }
441 }
442 }
443 }
444
445
x_CheckMemoryLimit()446 bool CBandAligner::x_CheckMemoryLimit()
447 {
448 const size_t elem_size (GetElemSize());
449 const size_t gdim (m_guides.size());
450
451 if(gdim) {
452
453 size_t dim1 = m_guides[0], dim2 = m_guides[2];
454 double mem = double(max(dim1, dim2))*m_band*elem_size;
455 if(mem >= m_MaxMem) {
456 return false;
457 }
458 for(size_t i = 4; i < gdim; i += 4) {
459 dim1 = m_guides[i] - m_guides[i-3] + 1;
460 dim2 = m_guides[i + 2] - m_guides[i-1] + 1;
461 mem = double(max(dim1, dim2))*m_band*elem_size;
462 if(mem >= m_MaxMem) {
463 return false;
464 }
465 }
466 dim1 = m_SeqLen1 - m_guides[gdim-3];
467 dim2 = m_SeqLen2 - m_guides[gdim-1];
468 mem = double(max(dim1, dim2))*m_band*elem_size;
469 if(mem >= m_MaxMem) {
470 return false;
471 }
472
473 return true;
474 }
475 else {
476
477 size_t max_len = max(m_SeqLen1, m_SeqLen2);
478 double mem = double(max_len)*m_band*elem_size;
479 return mem < m_MaxMem;
480 }
481 }
482
483
484 END_NCBI_SCOPE
485