1 /* $Id: nw_aligner.cpp 609384 2020-06-01 14:49:07Z brovervv $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Yuri Kapustin, Boris Kiryutin, Alexander Souvorov
27 *
28 * File Description: CNWAligner implementation
29 *
30 * ===========================================================================
31 *
32 */
33
34
35 #include <ncbi_pch.hpp>
36
37 #include "nw_aligner_threads.hpp"
38 #include "messages.hpp"
39
40 #include <corelib/ncbi_system.hpp>
41 #include <algo/align/nw/align_exception.hpp>
42 #include <algo/align/nw/nw_formatter.hpp>
43 #include <objects/seqloc/Seq_id.hpp>
44 #include <objects/seqalign/Dense_seg.hpp>
45
46 #include <objmgr/scope.hpp>
47 #include <objmgr/seq_vector.hpp>
48
49 #include <algorithm>
50
51
52 BEGIN_NCBI_SCOPE
53 USING_SCOPE(objects);
54
55
CNWAligner()56 CNWAligner::CNWAligner()
57 : m_Wm(GetDefaultWm()),
58 m_Wms(GetDefaultWms()),
59 m_Wg(GetDefaultWg()),
60 m_Ws(GetDefaultWs()),
61 m_esf_L1(false), m_esf_R1(false), m_esf_L2(false), m_esf_R2(false),
62 m_SmithWaterman(false),
63 m_GapPreference(eLater),
64 m_abc(g_nwaligner_nucleotides),
65 m_ScoreMatrixInvalid(true),
66 m_prg_callback(0),
67 m_terminate(false),
68 m_Seq1Vec(),
69 m_Seq1(0), m_SeqLen1(0),
70 m_Seq2Vec(),
71 m_Seq2(0), m_SeqLen2(0),
72 m_PositivesAsMatches(false),
73 m_score(kInfMinus),
74 m_mt(false),
75 m_maxthreads(1),
76 m_MaxMem(GetDefaultSpaceLimit())
77 {
78 SetScoreMatrix(0);
79 }
80
81
CNWAligner(const char * seq1,size_t len1,const char * seq2,size_t len2,const SNCBIPackedScoreMatrix * scoremat)82 CNWAligner::CNWAligner( const char* seq1, size_t len1,
83 const char* seq2, size_t len2,
84 const SNCBIPackedScoreMatrix* scoremat )
85
86 : m_Wm(GetDefaultWm()),
87 m_Wms(GetDefaultWms()),
88 m_Wg(GetDefaultWg()),
89 m_Ws(GetDefaultWs()),
90 m_esf_L1(false), m_esf_R1(false), m_esf_L2(false), m_esf_R2(false),
91 m_SmithWaterman(false),
92 m_GapPreference(eLater),
93 m_abc(g_nwaligner_nucleotides),
94 m_ScoreMatrixInvalid(true),
95 m_prg_callback(0),
96 m_terminate(false),
97 m_Seq1Vec(&seq1[0], &seq1[0]+len1),
98 m_Seq1(&m_Seq1Vec[0]), m_SeqLen1(len1),
99 m_Seq2Vec(&seq2[0], &seq2[0]+len2),
100 m_Seq2(&m_Seq2Vec[0]), m_SeqLen2(len2),
101 m_PositivesAsMatches(false),
102 m_score(kInfMinus),
103 m_mt(false),
104 m_maxthreads(1),
105 m_MaxMem(GetDefaultSpaceLimit())
106 {
107 SetScoreMatrix(scoremat);
108 SetSequences(seq1, len1, seq2, len2);
109 }
110
111
CNWAligner(const string & seq1,const string & seq2,const SNCBIPackedScoreMatrix * scoremat)112 CNWAligner::CNWAligner(const string& seq1,
113 const string& seq2,
114 const SNCBIPackedScoreMatrix* scoremat)
115 : m_Wm(GetDefaultWm()),
116 m_Wms(GetDefaultWms()),
117 m_Wg(GetDefaultWg()),
118 m_Ws(GetDefaultWs()),
119 m_esf_L1(false), m_esf_R1(false), m_esf_L2(false), m_esf_R2(false),
120 m_SmithWaterman(false),
121 m_GapPreference(eLater),
122 m_abc(g_nwaligner_nucleotides),
123 m_ScoreMatrixInvalid(true),
124 m_prg_callback(0),
125 m_terminate(false),
126 m_Seq1Vec(seq1.begin(), seq1.end()),
127 m_Seq1(&m_Seq1Vec[0]), m_SeqLen1(seq1.size()),
128 m_Seq2Vec(seq2.begin(), seq2.end()),
129 m_Seq2(&m_Seq2Vec[0]), m_SeqLen2(seq2.size()),
130 m_score(kInfMinus),
131 m_mt(false),
132 m_maxthreads(1),
133 m_MaxMem(GetDefaultSpaceLimit())
134 {
135 SetScoreMatrix(scoremat);
136 SetSequences(seq1, seq2);
137 };
138
139
SetSequences(const char * seq1,size_t len1,const char * seq2,size_t len2,bool verify)140 void CNWAligner::SetSequences(const char* seq1, size_t len1,
141 const char* seq2, size_t len2,
142 bool verify)
143 {
144 if(!seq1 || !seq2) {
145 NCBI_THROW(CAlgoAlignException, eBadParameter,
146 g_msg_NullParameter);
147 }
148
149 if(verify) {
150 size_t iErrPos1 = x_CheckSequence(seq1, len1);
151 if(iErrPos1 < len1) {
152 CNcbiOstrstream oss;
153 oss << "The first sequence is inconsistent with the current "
154 << "scoring matrix type. "
155 << "Position = " << iErrPos1
156 << " Symbol = '" << seq1[iErrPos1] << "'";
157
158 string message = CNcbiOstrstreamToString(oss);
159 NCBI_THROW(CAlgoAlignException, eInvalidCharacter, message);
160 }
161
162 size_t iErrPos2 = x_CheckSequence(seq2, len2);
163 if(iErrPos2 < len2) {
164 CNcbiOstrstream oss;
165 oss << "The second sequence is inconsistent with the current "
166 << "scoring matrix type. "
167 << "Position = " << iErrPos2
168 << " Symbol = '" << seq2[iErrPos2] << "'";
169
170 string message = CNcbiOstrstreamToString(oss);
171 NCBI_THROW(CAlgoAlignException, eInvalidCharacter, message);
172 }
173 }
174 m_Seq1Vec.assign(&seq1[0], &seq1[0]+len1);
175 m_Seq2Vec.assign(&seq2[0], &seq2[0]+len2);
176 m_Seq1 = &m_Seq1Vec[0];
177 m_SeqLen1 = len1;
178 m_Seq2 = &m_Seq2Vec[0];
179 m_SeqLen2 = len2;
180 m_Transcript.clear();
181 }
182
183
SetSequences(const string & seq1,const string & seq2,bool verify)184 void CNWAligner::SetSequences(const string& seq1,
185 const string& seq2,
186 bool verify)
187 {
188 SetSequences(seq1.data(), seq1.size(), seq2.data(), seq2.size(), verify);
189 }
190
191
SetEndSpaceFree(bool Left1,bool Right1,bool Left2,bool Right2)192 void CNWAligner::SetEndSpaceFree(bool Left1, bool Right1,
193 bool Left2, bool Right2)
194 {
195 m_esf_L1 = Left1;
196 m_esf_R1 = Right1;
197 m_esf_L2 = Left2;
198 m_esf_R2 = Right2;
199 }
200
SetSmithWaterman(bool SW)201 void CNWAligner::SetSmithWaterman(bool SW)
202 {
203 m_SmithWaterman = SW;
204 if (SW) {
205 /// Smith-Waterman necessarily implies that all four ends are free
206 m_esf_L1 = m_esf_R1 = m_esf_L2 = m_esf_R2 = true;
207 }
208 }
209
SetGapPreference(EGapPreference p)210 void CNWAligner::SetGapPreference(EGapPreference p)
211 {
212 m_GapPreference = p;
213 }
214
215 // evaluate score for each possible alignment;
216 // fill out backtrace matrix
217 // bit coding (four bits per value): D E Ec Fc
218 // D: 1 if diagonal; 0 - otherwise
219 // E: 1 if space in 1st sequence; 0 if space in 2nd sequence
220 // Ec: 1 if gap in 1st sequence was extended; 0 if it is was opened
221 // Fc: 1 if gap in 2nd sequence was extended; 0 if it is was opened
222 //
223
224 const unsigned char kMaskFc = 0x01;
225 const unsigned char kMaskEc = 0x02;
226 const unsigned char kMaskE = 0x04;
227 const unsigned char kMaskD = 0x08;
228
x_Align(SAlignInOut * data)229 CNWAligner::TScore CNWAligner::x_Align(SAlignInOut* data)
230 {
231
232
233 //check data integrity
234
235 if( m_SmithWaterman && ( data->m_offset1 || m_SeqLen1 != data->m_len1 ||
236 data->m_offset2 || m_SeqLen2 != data->m_len2 ) ) {
237 NCBI_THROW(CAlgoAlignException, eBadParameter,
238 "Smith-Waterman not compatible with offsets provided");
239 }
240
241 if( m_SmithWaterman && ( !data->m_esf_L1 || !data->m_esf_R1 ||
242 !data->m_esf_L2 || !data->m_esf_R2 ) ) {
243 NCBI_THROW(CAlgoAlignException, eBadParameter,
244 "Smith-Waterman not compatible with end gap penalties");
245 }
246
247 const size_t N1 = data->m_len1 + 1;
248 const size_t N2 = data->m_len2 + 1;
249
250 vector<TScore> stl_rowV (N2), stl_rowF(N2);
251
252 const TNCBIScore (* sm) [NCBI_FSM_DIM] = m_ScoreMatrix.s;
253
254 if(m_prg_callback) {
255 m_prg_info.m_iter_total = N1*N2;
256 m_prg_info.m_iter_done = 0;
257 if( (m_terminate = m_prg_callback(&m_prg_info)) ) {
258 return 0;
259 }
260 }
261
262 bool bFreeGapLeft1 = data->m_esf_L1 && data->m_offset1 == 0;
263 bool bFreeGapRight1 = data->m_esf_R1 &&
264 m_SeqLen1 == data->m_offset1 + data->m_len1;
265
266 bool bFreeGapLeft2 = data->m_esf_L2 && data->m_offset2 == 0;
267 bool bFreeGapRight2 = data->m_esf_R2 &&
268 m_SeqLen2 == data->m_offset2 + data->m_len2;
269
270 TScore wgleft1 = bFreeGapLeft1? 0: m_Wg;
271 TScore wsleft1 = bFreeGapLeft1? 0: m_Ws;
272 TScore wg1 = m_Wg, ws1 = m_Ws;
273
274 // index calculation: [i,j] = i*n2 + j
275 CBacktraceMatrix4 backtrace_matrix (N1 * N2);
276 backtrace_matrix.SetAt(0, 0);
277
278 // first row
279 // note that stl_rowF[0] is not used in the main cycle,
280 size_t k;
281 stl_rowV[0] = wgleft1;
282 for (k = 1; k < N2; ++k) {
283 stl_rowV[k] = stl_rowV[k-1] + wsleft1;
284 stl_rowF[k] = kInfMinus;
285 backtrace_matrix.SetAt(k, kMaskE | kMaskEc);
286 }
287 backtrace_matrix.Purge(k);
288 stl_rowV[0] = 0;
289
290 if(m_prg_callback) {
291 m_prg_info.m_iter_done = k;
292 m_terminate = m_prg_callback(&m_prg_info);
293 }
294
295 // gap penalties
296 TScore wgleft2 (bFreeGapLeft2? 0: m_Wg);
297 TScore wsleft2 (bFreeGapLeft2? 0: m_Ws);
298
299 const char * seq1 = m_Seq1 + data->m_offset1;
300 const char * seq1_end = seq1 + data->m_len1;
301
302 TScore V0 = wgleft2;
303 TScore V = 0;//best score in the current cell. Will be equal to the NW score at the end
304 TScore best_V = 0;//best score in the whole matrix aka score for SW
305
306 --k;
307
308 for(; seq1 != seq1_end && !m_terminate; ++seq1) {
309
310 backtrace_matrix.SetAt(++k, kMaskFc);
311
312 if( seq1 + 1 == seq1_end && bFreeGapRight1) {
313 wg1 = ws1 = 0;
314 }
315
316 unsigned char tracer;
317 const TNCBIScore * row_sc = sm[(size_t)*seq1];
318
319 const char * seq2 = m_Seq2 + data->m_offset2;
320 const char * seq2_end = seq2 + data->m_len2;
321 TScore wg2 = m_Wg, ws2 = m_Ws;
322
323 //best ending with gap in seq1 open seq1 X- or extended seq1 X--
324 // seq2 XX seq2 XXX
325 TScore E = kInfMinus;
326 //best ending with gap in seq2
327 TScore F;
328 //total best with
329 //best ending with match
330 TScore G;
331 //just temporary
332 TScore n0;
333 //total best
334 TScore * rowV = &stl_rowV[0];//previos row
335 V = V0 += wsleft2; //current row
336 //best ending with match
337 TScore * rowF = &stl_rowF[0];
338
339 for (; seq2 != seq2_end;) {
340
341 G = *rowV + row_sc[(size_t)*seq2++];
342 *rowV = V;
343
344 n0 = V + wg1;
345 if(E >= n0) {
346 E += ws1; // continue the gap
347 tracer = kMaskEc;
348 }
349 else {
350 E = n0 + ws1; // open a new gap
351 tracer = 0;
352 }
353
354 if( bFreeGapRight2 && seq2 == seq2_end ) {
355 wg2 = ws2 = 0;
356 }
357
358 F = *++rowF;
359 n0 = *++rowV + wg2;
360 if(F >= n0) {
361 F += ws2;
362 tracer |= kMaskFc;
363 }
364 else {
365 F = n0 + ws2;
366 }
367 *rowF = F;
368
369 //best score
370 if( G < F || ( G == F && m_GapPreference == eLater) ) {
371 if( E <= F ) {
372 V = F;
373 } else {
374 V = E;
375 tracer |= kMaskE;
376 }
377 } else if( E > G || ( E == G && m_GapPreference == eLater) ) {
378 V = E;
379 tracer |= kMaskE;
380 } else {
381 V = G;
382 tracer |= kMaskD;
383 }
384
385 if (m_SmithWaterman && V < 0 ) {
386 V = 0;
387 }
388
389 backtrace_matrix.SetAt(++k, tracer);
390
391 if (V > best_V) {
392 best_V = V;
393 backtrace_matrix.SetBestPos(k);
394 }
395 }
396 *rowV = V;
397
398 if(m_prg_callback) {
399 m_prg_info.m_iter_done = k;
400 if( (m_terminate = m_prg_callback(&m_prg_info)) ) {
401 break;
402 }
403 }
404 }
405
406 backtrace_matrix.Purge(++k);
407 backtrace_matrix.SetBestScore(best_V);
408
409 /*
410 //print the matrix out
411 {{
412 cout<<endl;
413 int kk, ind1, ind2, width = 4;
414 cout<<setw(width)<<" ";
415 cout<<setw(width)<<"-";
416 for(ind2 = 0; ind2 < N2-1; ++ind2) {
417 cout<<setw(width)<<*(m_Seq2 + data->m_offset2 + ind2);
418 }
419 cout<<endl;
420 for(kk = 0,ind1 = 0; ind1 < N1; ++ind1) {
421 if(ind1) {
422 cout<<setw(width)<<(m_Seq1 + data->m_offset1)[ind1-1];
423 } else {
424 cout<<setw(width)<<"-";
425 }
426 for(ind2 = 0; ind2 < N2; ++ind2,++kk) {
427 string tstr;
428 unsigned char Key (backtrace_matrix[kk]);
429 if( Key & kMaskD ) tstr += "D";
430 else if ( Key & kMaskE ) tstr += "E";
431 else tstr += "F";
432 if( Key & kMaskEc ) tstr += "-";
433 if( Key & kMaskFc ) tstr += "|";
434 cout<<setw(width)<<tstr;
435 }
436 cout<<endl<<endl;
437 }
438 cout<<endl;
439 }}
440 //end of print the matrix out
441 */
442
443 if(!m_terminate) {
444 x_SWDoBackTrace(backtrace_matrix, data);
445 //check back trace
446 TTranscript rv (data->m_transcript.size());
447 copy(data->m_transcript.rbegin(), data->m_transcript.rend(), rv.begin());
448 if(m_SmithWaterman) {
449 if( best_V != ScoreFromTranscript(rv, data->m_offset1, data->m_offset2) ) {
450 NCBI_THROW(CAlgoAlignException, eInternal,
451 "CNWAligner: error in back trace");
452 }
453 } else {
454 if( V != ScoreFromTranscript(rv, data->m_offset1, data->m_offset2) ) {
455 NCBI_THROW(CAlgoAlignException, eInternal,
456 "CNWAligner: error in back trace");
457 }
458 }
459 }
460
461 if(m_SmithWaterman) {
462 return best_V;
463 }
464 return V;
465 }
466
467
Run(CScope & scope,const CSeq_id & id1,const CSeq_id & id2,bool trim_end_gaps)468 CRef<CSeq_align> CNWAligner::Run(CScope &scope, const CSeq_id &id1,
469 const CSeq_id &id2, bool trim_end_gaps)
470 {
471 CSeq_loc loc1, loc2;
472 loc1.SetWhole().Assign(id1);
473 loc2.SetWhole().Assign(id2);
474 return Run(scope, loc1, loc2, trim_end_gaps);
475 }
476
Run(CScope & scope,const CSeq_loc & loc1,const CSeq_loc & loc2,bool trim_end_gaps)477 CRef<CSeq_align> CNWAligner::Run(CScope &scope, const CSeq_loc &loc1,
478 const CSeq_loc &loc2, bool trim_end_gaps)
479 {
480 if ((!loc1.IsInt() && !loc1.IsWhole()) ||
481 (!loc1.IsInt() && !loc1.IsWhole()))
482 {
483 NCBI_THROW(CException, eUnknown,
484 "Only whole and interval locations supported");
485 }
486 CSeqVector vec1(loc1, scope, CBioseq_Handle::eCoding_Iupac);
487 string seq1;
488 vec1.GetSeqData(0, vec1.size(), seq1);
489 CSeqVector vec2(loc2, scope, CBioseq_Handle::eCoding_Iupac);
490 string seq2;
491 vec2.GetSeqData(0, vec2.size(), seq2);
492 SetSequences(seq1,seq2);
493 Run();
494 CRef<CSeq_align> align(new CSeq_align);
495 align->SetType(CSeq_align::eType_partial);
496 align->SetSegs().SetDenseg(*GetDense_seg(
497 loc1.GetStart(eExtreme_Biological), loc1.GetStrand(), *loc1.GetId(),
498 loc2.GetStart(eExtreme_Biological), loc2.GetStrand(), *loc2.GetId(),
499 trim_end_gaps));
500 return align;
501 }
502
Run()503 CNWAligner::TScore CNWAligner::Run()
504 {
505 if(m_ScoreMatrixInvalid) {
506
507 NCBI_THROW(CAlgoAlignException, eInvalidMatrix,
508 "CNWAligner::SetScoreMatrix(NULL) must be called "
509 "after changing match/mismatch scores "
510 "to make sure that the new parameters are engaged.");
511 }
512
513 if(!m_Seq1 || !m_Seq2) {
514 NCBI_THROW(CAlgoAlignException, eNoSeqData,
515 g_msg_DataNotAvailable);
516 }
517
518 if(!x_CheckMemoryLimit()) {
519 NCBI_THROW(CAlgoAlignException, eMemoryLimit, g_msg_HitSpaceLimit);
520 }
521
522 if (m_SmithWaterman && !m_guides.empty()) {
523 NCBI_THROW(CAlgoAlignException, eBadParameter,
524 "Smith-Waterman not compatible with provided pattern");
525 }
526
527 m_score = x_Run();
528
529 return m_score;
530 }
531
532
x_Run()533 CNWAligner::TScore CNWAligner::x_Run()
534 {
535 try {
536
537 m_terminate = false;
538
539 if(m_guides.size() == 0) {
540
541 SAlignInOut data (0, m_SeqLen1, m_esf_L1, m_esf_R1,
542 0, m_SeqLen2, m_esf_L2, m_esf_R2);
543 m_score = x_Align(&data);
544 m_Transcript = data.m_transcript;
545 }
546 // run the algorithm for every segment between hits
547 else if(m_mt && m_maxthreads > 1) {
548
549 size_t guides_dim = m_guides.size() / 4;
550
551 // setup inputs
552 typedef vector<SAlignInOut> TDataVector;
553 TDataVector vdata;
554 vector<size_t> seed_dims;
555 {{
556 vdata.reserve(guides_dim + 1);
557 seed_dims.reserve(guides_dim + 1);
558 size_t q1 = m_SeqLen1, q0, s1 = m_SeqLen2, s0;
559 for(size_t istart = 4*guides_dim, i = istart; i != 0; i -= 4) {
560
561 q0 = m_guides[i - 3] + 1;
562 s0 = m_guides[i - 1] + 1;
563 size_t dim_query = q1 - q0, dim_subj = s1 - s0;
564
565 bool esf_L1 = false, esf_R1 = false,
566 esf_L2 = false, esf_R2 = false;
567 if(i == istart) {
568 esf_R1 = m_esf_R1;
569 esf_R2 = m_esf_R2;
570 }
571
572 SAlignInOut data (q0, dim_query, esf_L1, esf_R1,
573 s0, dim_subj, esf_L2, esf_R2);
574
575 vdata.push_back(data);
576 seed_dims.push_back(m_guides[i-3] - m_guides[i-4] + 1);
577
578 q1 = m_guides[i - 4];
579 s1 = m_guides[i - 2];
580 }
581 SAlignInOut data(0, q1, m_esf_L1, false,
582 0, s1, m_esf_L2, false);
583 vdata.push_back(data);
584 }}
585
586 // rearrange vdata so that the largest chunks come first
587 typedef vector<SAlignInOut*> TDataPtrVector;
588 TDataPtrVector vdata_p (vdata.size());
589 {{
590 TDataPtrVector::iterator jj = vdata_p.begin();
591 NON_CONST_ITERATE(TDataVector, ii, vdata) {
592 *jj++ = &(*ii);
593 }
594 stable_sort(vdata_p.begin(), vdata_p.end(),
595 SAlignInOut::PSpace);
596 }}
597
598 // align over the segments
599 {{
600 m_Transcript.clear();
601 size_t idim = vdata.size();
602
603 typedef vector<CNWAlignerThread_Align*> TThreadVector;
604 TThreadVector threads;
605 threads.reserve(idim);
606
607 ITERATE(TDataPtrVector, ii, vdata_p) {
608
609 SAlignInOut& data = **ii;
610
611 if(data.GetSpace() >= 10000000 &&
612 NW_RequestNewThread(m_maxthreads)) {
613
614 CNWAlignerThread_Align* thread =
615 new CNWAlignerThread_Align(this, &data);
616 threads.push_back(thread);
617 thread->Run();
618 }
619 else {
620 x_Align(&data);
621 }
622 }
623
624 auto_ptr<CException> e;
625 ITERATE(TThreadVector, ii, threads) {
626
627 if(e.get() == 0) {
628 CException* pe = 0;
629 (*ii)->Join(reinterpret_cast<void**>(&pe));
630 if(pe) {
631 e.reset(new CException (*pe));
632 }
633 }
634 else {
635 (*ii)->Join(0);
636 }
637 }
638 if(e.get()) {
639 throw *e;
640 }
641
642 for(size_t idata = 0; idata < idim; ++idata) {
643
644 SAlignInOut& data = vdata[idata];
645 copy(data.m_transcript.begin(), data.m_transcript.end(),
646 back_inserter(m_Transcript));
647 if(idata + 1 < idim) {
648 for(size_t k = 0; k < seed_dims[idata]; ++k) {
649 m_Transcript.push_back(eTS_Match);
650 }
651 }
652 }
653
654 m_score = ScoreFromTranscript(GetTranscript(false), 0, 0);
655 }}
656 }
657 else {
658
659 m_Transcript.clear();
660 size_t guides_dim = m_guides.size() / 4;
661 size_t q1 = m_SeqLen1, q0, s1 = m_SeqLen2, s0;
662 for(size_t istart = 4*guides_dim, i = istart; i != 0; i -= 4) {
663
664 q0 = m_guides[i - 3] + 1;
665 s0 = m_guides[i - 1] + 1;
666 size_t dim_query = q1 - q0, dim_subj = s1 - s0;
667
668 bool esf_L1 = false, esf_R1 = false,
669 esf_L2 = false, esf_R2 = false;
670 if(i == istart) {
671 esf_R1 = m_esf_R1;
672 esf_R2 = m_esf_R2;
673 }
674
675 SAlignInOut data (q0, dim_query, esf_L1, esf_R1,
676 s0, dim_subj, esf_L2, esf_R2);
677
678 x_Align(&data);
679 copy(data.m_transcript.begin(), data.m_transcript.end(),
680 back_inserter(m_Transcript));
681
682 size_t dim_hit = m_guides[i - 3] - m_guides[i - 4] + 1;
683 for(size_t k = 0; k < dim_hit; ++k) {
684 m_Transcript.push_back(eTS_Match);
685 }
686 q1 = m_guides[i - 4];
687 s1 = m_guides[i - 2];
688 }
689 SAlignInOut data(0, q1, m_esf_L1, false,
690 0, s1, m_esf_L2, false);
691 x_Align(&data);
692 copy(data.m_transcript.begin(), data.m_transcript.end(),
693 back_inserter(m_Transcript));
694
695 m_score = ScoreFromTranscript(GetTranscript(false), 0, 0);
696 }
697 }
698
699 catch(std::bad_alloc&) {
700 NCBI_THROW(CAlgoAlignException, eMemoryLimit, g_msg_OutOfSpace);
701 }
702
703 return m_score;
704 }
705
706
x_GetDiagTS(size_t i1,size_t i2) const707 CNWAligner::ETranscriptSymbol CNWAligner::x_GetDiagTS(size_t i1, size_t i2)
708 const
709 {
710 const unsigned char c1 = m_Seq1[i1];
711 const unsigned char c2 = m_Seq2[i2];
712
713 ETranscriptSymbol ts;
714 if(m_PositivesAsMatches) {
715 ts = m_ScoreMatrix.s[c1][c2] > 0? eTS_Match: eTS_Replace;
716 }
717 else {
718 // N vs N should be mismatch for nucleotides,
719 // and X vs X in protein. Check with matrix
720 ts = (toupper(c1) == toupper(c2) && m_ScoreMatrix.s[c1][c2] > 0)? eTS_Match: eTS_Replace;
721 }
722 return ts;
723 }
724
725 // perform backtrace step, NW only
x_DoBackTrace(const CBacktraceMatrix4 & backtrace,SAlignInOut * data)726 void CNWAligner::x_DoBackTrace(const CBacktraceMatrix4 & backtrace,
727 SAlignInOut* data)
728 {
729 const size_t N1 (data->m_len1 + 1);
730 const size_t N2 (data->m_len2 + 1);
731
732 data->m_transcript.clear();
733 data->m_transcript.reserve(N1 + N2);
734
735 size_t k (N1*N2 - 1);
736 size_t i1 (data->m_offset1 + data->m_len1 - 1);
737 size_t i2 (data->m_offset2 + data->m_len2 - 1);
738 while (k != 0) {
739
740 unsigned char Key (backtrace[k]);
741
742 if (Key & kMaskD) {
743
744 data->m_transcript.push_back(x_GetDiagTS(i1--, i2--));
745 k -= N2 + 1;
746 }
747 else if (Key & kMaskE) {
748
749 data->m_transcript.push_back(eTS_Insert);
750 --k;
751 --i2;
752 while(k > 0 && (Key & kMaskEc)) {
753
754 data->m_transcript.push_back(eTS_Insert);
755 Key = backtrace[k--];
756 --i2;
757 }
758 }
759 else {
760
761 data->m_transcript.push_back(eTS_Delete);
762 k -= N2;
763 --i1;
764 while(k > 0 && (Key & kMaskFc)) {
765
766 data->m_transcript.push_back(eTS_Delete);
767 Key = backtrace[k];
768 k -= N2;
769 --i1;
770 }
771 }
772 }
773 }
774
775
776
777 // perform backtrace step, NW + SW
x_SWDoBackTrace(const CBacktraceMatrix4 & backtrace,SAlignInOut * data)778 void CNWAligner::x_SWDoBackTrace(const CBacktraceMatrix4 & backtrace,
779 SAlignInOut* data)
780 {
781 const size_t N1 (data->m_len1 + 1);
782 const size_t N2 (data->m_len2 + 1);
783 const TNCBIScore (* sm) [NCBI_FSM_DIM] = m_ScoreMatrix.s;
784
785 data->m_transcript.clear();
786 data->m_transcript.reserve(N1 + N2);
787
788 size_t k (N1*N2 - 1);
789 size_t i1 (data->m_offset1 + data->m_len1 - 1);
790 size_t i2 (data->m_offset2 + data->m_len2 - 1);
791 if (m_SmithWaterman) {
792 size_t sw_k = backtrace.BestPos();
793 data->FillEdgeGaps(k - sw_k, true);
794 i1 -= (k - sw_k) / (data->m_len2+1);
795 i2 -= (k - sw_k) % (data->m_len2+1);
796 k = backtrace.BestPos();
797 }
798
799 //score for SmithWaterman. Stop when score == 0
800 TNCBIScore score = backtrace.BestScore();
801
802 while (k > 0 && ( !m_SmithWaterman || score > 0 ) ) {
803
804 unsigned char Key (backtrace[k]);
805
806 if (Key & kMaskD) {
807 score -= sm[(size_t)(m_Seq1[i1])][(size_t)(m_Seq2[i2])];
808 data->m_transcript.push_back(x_GetDiagTS(i1--, i2--));
809 k -= N2 + 1;
810 }
811 else if (Key & kMaskE) {
812 score -= m_Wg + m_Ws;
813 data->m_transcript.push_back(eTS_Insert);
814 --k;
815 --i2;
816 while(k > 0 && (Key & kMaskEc)) {
817 score -= m_Ws;
818 data->m_transcript.push_back(eTS_Insert);
819 Key = backtrace[k--];
820 --i2;
821 }
822 }
823 else {
824 score -= m_Wg + m_Ws;
825 data->m_transcript.push_back(eTS_Delete);
826 k -= N2;
827 --i1;
828 while(k > 0 && (Key & kMaskFc)) {
829 score -= m_Ws;
830 data->m_transcript.push_back(eTS_Delete);
831 Key = backtrace[k];
832 k -= N2;
833 --i1;
834 }
835 }
836 }
837 if( m_SmithWaterman && score != 0 ) {
838 NCBI_THROW(CAlgoAlignException, eInternal,
839 "negative score in Smith-Waterman back trace");
840 }
841 data->FillEdgeGaps(k, false);
842 }
843
844
SetPattern(const vector<size_t> & guides)845 void CNWAligner::SetPattern(const vector<size_t>& guides)
846 {
847 size_t dim = guides.size();
848 const char* err = 0;
849 if(dim % 4 == 0) {
850 for(size_t i = 0; i < dim; i += 4) {
851
852 if( guides[i] > guides[i+1] || guides[i+2] > guides[i+3] ) {
853 err = "Pattern hits must be specified in plus strand";
854 break;
855 }
856
857 if(i > 4) {
858 if(guides[i] <= guides[i-3] || guides[i+2] <= guides[i-2]){
859 err = "Pattern hits coordinates must be sorted";
860 break;
861 }
862 }
863
864 size_t dim1 = guides[i + 1] - guides[i];
865 size_t dim2 = guides[i + 3] - guides[i + 2];
866 if( dim1 != dim2) {
867 err = "Pattern hits must have equal length on both sequences";
868 break;
869 }
870
871 if(guides[i+1] >= m_SeqLen1 || guides[i+3] >= m_SeqLen2) {
872 err = "One or several pattern hits are out of range";
873 break;
874 }
875 }
876 }
877 else {
878 err = "Pattern must have a dimension multiple of four";
879 }
880
881 if(err) {
882 NCBI_THROW(CAlgoAlignException, eBadParameter, err);
883 }
884 else {
885 m_guides = guides;
886 }
887 }
888
889
GetEndSpaceFree(bool * L1,bool * R1,bool * L2,bool * R2) const890 void CNWAligner::GetEndSpaceFree(bool* L1, bool* R1, bool* L2, bool* R2) const
891 {
892 if(L1) *L1 = m_esf_L1;
893 if(R1) *R1 = m_esf_R1;
894 if(L2) *L2 = m_esf_L2;
895 if(R2) *R2 = m_esf_R2;
896 }
897
IsSmithWaterman() const898 bool CNWAligner::IsSmithWaterman() const
899 {
900 return m_SmithWaterman;
901 }
902
GetGapPreference() const903 CNWAligner::EGapPreference CNWAligner::GetGapPreference() const
904 {
905 return m_GapPreference;
906 }
907
908 // return raw transcript
GetTranscript(bool reversed) const909 CNWAligner::TTranscript CNWAligner::GetTranscript(bool reversed) const
910 {
911 TTranscript rv (m_Transcript.size());
912 if(reversed) {
913 copy(m_Transcript.begin(), m_Transcript.end(), rv.begin());
914 }
915 else {
916 copy(m_Transcript.rbegin(), m_Transcript.rend(), rv.begin());
917 }
918 return rv;
919 }
920
921
922 // alignment 'restore' - set raw transcript
SetTranscript(const TTranscript & transcript)923 void CNWAligner::SetTranscript(const TTranscript& transcript)
924 {
925 m_Transcript = transcript;
926 m_score = ScoreFromTranscript(transcript);
927 }
928
929
930 // Return transcript as a readable string
GetTranscriptString(void) const931 string CNWAligner::GetTranscriptString(void) const
932 {
933 const int dim (m_Transcript.size());
934 string s;
935 s.resize(dim);
936 size_t i1 (0), i2 (0), i (0);
937
938 for (int k (dim - 1); k >= 0; --k) {
939
940 ETranscriptSymbol c0 (m_Transcript[k]);
941 char c (0);
942 switch ( c0 ) {
943
944 case eTS_Match: {
945
946 if(m_Seq1 && m_Seq2) {
947 ETranscriptSymbol ts = x_GetDiagTS(i1++, i2++);
948 c = ts == eTS_Match? 'M': 'R';
949 }
950 else {
951 c = 'M';
952 }
953 }
954 break;
955
956 case eTS_Replace: {
957
958 if(m_Seq1 && m_Seq2) {
959 ETranscriptSymbol ts = x_GetDiagTS(i1++, i2++);
960 c = ts == eTS_Match? 'M': 'R';
961 }
962 else {
963 c = 'R';
964 }
965 }
966 break;
967
968 case eTS_Insert: {
969 c = 'I';
970 ++i2;
971 }
972 break;
973
974 case eTS_SlackInsert: {
975 c = 'i';
976 ++i2;
977 }
978 break;
979
980 case eTS_SlackDelete: {
981 c = 'd';
982 ++i1;
983 }
984 break;
985
986 case eTS_Delete: {
987 c = 'D';
988 ++i1;
989 }
990 break;
991
992 case eTS_Intron: {
993 c = '+';
994 ++i2;
995 }
996 break;
997
998 default: {
999 NCBI_THROW(CAlgoAlignException, eInternal,
1000 g_msg_InvalidTranscriptSymbol);
1001 }
1002 }
1003
1004 s[i++] = c;
1005 }
1006
1007 if(i < s.size()) {
1008 s.resize(i + 1);
1009 }
1010
1011 return s;
1012 }
1013
1014
SetProgressCallback(FProgressCallback prg_callback,void * data)1015 void CNWAligner::SetProgressCallback(FProgressCallback prg_callback, void* data)
1016 {
1017 m_prg_callback = prg_callback;
1018 m_prg_info.m_data = data;
1019 }
1020
1021
SetWms(TScore val)1022 void CNWAligner::SetWms(TScore val)
1023 {
1024 m_Wms = val;
1025 m_ScoreMatrixInvalid = true;
1026 }
1027
1028
SetWm(TScore val)1029 void CNWAligner::SetWm(TScore val)
1030 {
1031 m_Wm = val;
1032 m_ScoreMatrixInvalid = true;
1033 }
1034
1035
SetScoreMatrix(const SNCBIPackedScoreMatrix * psm)1036 void CNWAligner::SetScoreMatrix(const SNCBIPackedScoreMatrix* psm)
1037 {
1038 if(psm) {
1039
1040 m_abc = psm->symbols;
1041 NCBISM_Unpack(psm, &m_ScoreMatrix);
1042 }
1043 else { // assume IUPACna
1044
1045 m_abc = g_nwaligner_nucleotides;
1046 const size_t dim = strlen(m_abc);
1047 vector<TNCBIScore> iupacna (dim*dim, m_Wms);
1048 iupacna[0] = iupacna[dim+1] = iupacna[2*(dim+1)] =
1049 iupacna[3*(dim+1)] = m_Wm;
1050 SNCBIPackedScoreMatrix iupacna_psm;
1051 iupacna_psm.symbols = g_nwaligner_nucleotides;
1052 iupacna_psm.scores = &iupacna.front();
1053 iupacna_psm.defscore = m_Wms;
1054 NCBISM_Unpack(&iupacna_psm, &m_ScoreMatrix);
1055 }
1056 m_ScoreMatrixInvalid = false;
1057 }
1058
1059
1060 // Check that all characters in sequence are valid for
1061 // the current sequence type.
1062 // Return an index to the first invalid character
1063 // or len if all characters are valid.
x_CheckSequence(const char * seq,size_t len) const1064 size_t CNWAligner::x_CheckSequence(const char* seq, size_t len) const
1065 {
1066 char Flags [256];
1067 memset(Flags, 0, sizeof Flags);
1068 const size_t abc_size = strlen(m_abc);
1069
1070 size_t k;
1071 for(k = 0; k < abc_size; ++k) {
1072 Flags[unsigned(toupper((unsigned char) m_abc[k]))] = 1;
1073 Flags[unsigned(tolower((unsigned char) m_abc[k]))] = 1;
1074 Flags[unsigned(k)] = 1;
1075 }
1076
1077 for(k = 0; k < len; ++k) {
1078 if(Flags[unsigned(seq[k])] == 0)
1079 break;
1080 }
1081
1082 return k;
1083 }
1084
1085
GetScore() const1086 CNWAligner::TScore CNWAligner::GetScore() const
1087 {
1088 if(m_Transcript.size()) {
1089 return m_score;
1090 }
1091 else {
1092 NCBI_THROW(CAlgoAlignException, eNoSeqData,
1093 g_msg_NoAlignment);
1094 }
1095 }
1096
1097
x_CheckMemoryLimit()1098 bool CNWAligner::x_CheckMemoryLimit()
1099 {
1100 const size_t elem_size (GetElemSize());
1101 const size_t gdim (m_guides.size());
1102 double mem (0);
1103
1104 if(gdim) {
1105
1106 size_t dim1 (m_guides[0]), dim2 (m_guides[2]);
1107 mem = double(dim1) * dim2 * elem_size;
1108 if(mem <= m_MaxMem) {
1109
1110 for(size_t i (4); i < gdim; i += 4) {
1111
1112 dim1 = m_guides[i] - m_guides[i-3] + 1;
1113 dim2 = m_guides[i + 2] - m_guides[i-1] + 1;
1114 mem = double(dim1) * dim2 * elem_size;
1115 if(mem > m_MaxMem) {
1116 break;
1117 }
1118 }
1119
1120 if(mem <= m_MaxMem) {
1121 dim1 = m_SeqLen1 - m_guides[gdim-3];
1122 dim2 = m_SeqLen2 - m_guides[gdim-1];
1123 mem = double(dim1) * dim2 * elem_size;
1124 }
1125 }
1126 }
1127 else {
1128 mem = double(m_SeqLen1 + 1) * (m_SeqLen2 + 1) * elem_size;
1129 }
1130
1131 return mem < m_MaxMem;
1132 }
1133
1134
EnableMultipleThreads(bool enable)1135 void CNWAligner::EnableMultipleThreads(bool enable)
1136 {
1137 m_maxthreads = (m_mt = enable)? GetCpuCount(): 1;
1138 }
1139
1140
ScoreFromTranscript(const TTranscript & transcript,size_t start1,size_t start2) const1141 CNWAligner::TScore CNWAligner::ScoreFromTranscript(
1142 const TTranscript& transcript,
1143 size_t start1, size_t start2) const
1144 {
1145 bool nucl_mode;
1146 if(start1 == kMax_UInt && start2 == kMax_UInt) {
1147 nucl_mode = true;
1148 }
1149 else if(start1 != kMax_UInt && start2 != kMax_UInt) {
1150 nucl_mode = false;
1151 }
1152 else {
1153 NCBI_THROW(CAlgoAlignException, eInternal,
1154 g_msg_InconsistentArguments);
1155 }
1156
1157 size_t dim (transcript.size());
1158 if(dim == 0) {
1159 return 0;
1160 }
1161
1162 TScore score (0);
1163
1164 const char* p1 (m_Seq1 + start1);
1165 const char* p2 (m_Seq2 + start2);
1166
1167 int state1(0); // 0 = normal, 1 = gap;
1168 int state2(0); // 0 = normal, 1 = gap;
1169
1170 size_t i (0);
1171
1172 const TNCBIScore (*sm) [NCBI_FSM_DIM] = m_ScoreMatrix.s;
1173
1174 ///SmithWaterman alterations
1175 if( IsSmithWaterman() ) {
1176 //cut beginning gaps
1177 for(; i<dim; ++i) {
1178 if( transcript[i] == eTS_Insert ) {
1179 ++p2;
1180 } else if( transcript[i] == eTS_Delete ) {
1181 ++p1;
1182 } else if( transcript[i] == eTS_Match ||
1183 transcript[i] == eTS_Replace ) {
1184 break;
1185 } else {
1186 NCBI_THROW(CAlgoAlignException, eInternal, g_msg_InvalidTranscriptSymbol);
1187 }
1188 }
1189 if( i == dim ) {//alignment consists of gaps only
1190 return score;
1191 }
1192 //cut trailing gaps
1193 //for(size_t endi = dim-1; endi >=0; --endi) { // never stops CXX-11373
1194 for(size_t endi = dim; endi-- > 0; ) {
1195 if( transcript[endi] == eTS_Match || transcript[endi] == eTS_Replace ) {
1196 dim = endi + 1;
1197 break;
1198 } else if( transcript[endi] != eTS_Insert && transcript[endi] != eTS_Delete ) {
1199 NCBI_THROW(CAlgoAlignException, eInternal, g_msg_InvalidTranscriptSymbol);
1200 }
1201 }
1202 }
1203
1204 for(; i < dim; ++i) {
1205
1206 ETranscriptSymbol ts (transcript[i]);
1207
1208 switch(ts) {
1209
1210 case eTS_Match:
1211 case eTS_Replace: {
1212 if(nucl_mode) {
1213 score += (ts == eTS_Match)? m_Wm: m_Wms;
1214 }
1215 else {
1216 unsigned char c1 = *p1;
1217 unsigned char c2 = *p2;
1218 score += sm[c1][c2];
1219 ++p1; ++p2;
1220 }
1221 state1 = state2 = 0;
1222 }
1223 break;
1224
1225 case eTS_Insert: {
1226
1227 if(state1 != 1) score += m_Wg;
1228 state1 = 1; state2 = 0;
1229 score += m_Ws;
1230 ++p2;
1231 }
1232 break;
1233
1234 case eTS_Delete: {
1235
1236 if(state2 != 1) score += m_Wg;
1237 state1 = 0; state2 = 1;
1238 score += m_Ws;
1239 ++p1;
1240 }
1241 break;
1242
1243 default:
1244 NCBI_THROW(CAlgoAlignException, eInternal, g_msg_InvalidTranscriptSymbol);
1245 }
1246 }
1247
1248 if( IsSmithWaterman() ) {//end gap scores are already excluded
1249 return score;
1250 }
1251
1252 if(m_esf_L1) {
1253 size_t g = 0;
1254 for(size_t i = 0; i < dim; ++i) {
1255 if(transcript[i] == eTS_Insert) ++g; else break;
1256 }
1257 if(g > 0) {
1258 score -= (m_Wg + g*m_Ws);
1259 }
1260 }
1261
1262 if(m_esf_L2) {
1263 size_t g = 0;
1264 for(size_t i = 0; i < dim; ++i) {
1265 if(transcript[i] == eTS_Delete) ++g; else break;
1266 }
1267 if(g > 0) {
1268 score -= (m_Wg + g*m_Ws);
1269 }
1270 }
1271
1272 if(m_esf_R1) {
1273 size_t g = 0;
1274 for(int i = dim - 1; i >= 0; --i) {
1275 if(transcript[i] == eTS_Insert) ++g; else break;
1276 }
1277 if(g > 0) {
1278 score -= (m_Wg + g*m_Ws);
1279 }
1280 }
1281
1282 if(m_esf_R2) {
1283 size_t g = 0;
1284 for(int i = dim - 1; i >= 0; --i) {
1285 if(transcript[i] == eTS_Delete) ++g; else break;
1286 }
1287 if(g > 0) {
1288 score -= (m_Wg + g*m_Ws);
1289 }
1290 }
1291
1292 return score;
1293 }
1294
1295
GetLeftSeg(size_t * q0,size_t * q1,size_t * s0,size_t * s1,size_t min_size) const1296 size_t CNWAligner::GetLeftSeg(size_t* q0, size_t* q1,
1297 size_t* s0, size_t* s1,
1298 size_t min_size) const
1299 {
1300 size_t trdim = m_Transcript.size();
1301 size_t cur = 0, maxseg = 0;
1302 const char* p1 = m_Seq1;
1303 const char* p2 = m_Seq2;
1304 size_t i0 = 0, j0 = 0, imax = i0, jmax = j0;
1305
1306 for(int k = trdim - 1; k >= 0; --k) {
1307
1308 switch(m_Transcript[k]) {
1309
1310 case eTS_Insert: {
1311 ++p2;
1312 if(cur > maxseg) {
1313 maxseg = cur;
1314 imax = i0;
1315 jmax = j0;
1316 if(maxseg >= min_size) goto ret_point;
1317 }
1318 cur = 0;
1319 }
1320 break;
1321
1322 case eTS_Delete: {
1323 ++p1;
1324 if(cur > maxseg) {
1325 maxseg = cur;
1326 imax = i0;
1327 jmax = j0;
1328 if(maxseg >= min_size) goto ret_point;
1329 }
1330 cur = 0;
1331 }
1332 break;
1333
1334 case eTS_Match:
1335 case eTS_Replace: {
1336 if(*p1 == *p2) {
1337 if(cur == 0) {
1338 i0 = p1 - m_Seq1;
1339 j0 = p2 - m_Seq2;
1340 }
1341 ++cur;
1342 }
1343 else {
1344 if(cur > maxseg) {
1345 maxseg = cur;
1346 imax = i0;
1347 jmax = j0;
1348 if(maxseg >= min_size) goto ret_point;
1349 }
1350 cur = 0;
1351 }
1352 ++p1;
1353 ++p2;
1354 }
1355 break;
1356
1357 default: {
1358 NCBI_THROW( CAlgoAlignException, eInternal,
1359 g_msg_InvalidTranscriptSymbol );
1360 }
1361 }
1362 }
1363
1364 if(cur > maxseg) {
1365 maxseg = cur;
1366 imax = i0;
1367 jmax = j0;
1368 }
1369
1370 ret_point:
1371
1372 *q0 = imax; *s0 = jmax;
1373 *q1 = *q0 + maxseg - 1;
1374 *s1 = *s0 + maxseg - 1;
1375
1376 return maxseg;
1377 }
1378
1379 /////////////////////////////////////////////////
1380 /// naive pattern generator (a la Rabin-Karp)
1381
1382 struct nwaln_mrnaseg {
nwaln_mrnasegnwaln_mrnaseg1383 nwaln_mrnaseg(size_t i1, size_t i2, unsigned char fp0):
1384 a(i1), b(i2), fp(fp0) {}
1385 size_t a, b;
1386 unsigned char fp;
1387 };
1388
1389 struct nwaln_mrnaguide {
nwaln_mrnaguidenwaln_mrnaguide1390 nwaln_mrnaguide(size_t i1, size_t i2, size_t i3, size_t i4):
1391 q0(i1), q1(i2), s0(i3), s1(i4) {}
1392 size_t q0, q1, s0, s1;
1393 };
1394
1395
x_CalcFingerPrint64(const char * beg,const char * end,size_t & err_index)1396 unsigned char CNWAligner::x_CalcFingerPrint64(
1397 const char* beg, const char* end, size_t& err_index)
1398 {
1399 if(beg >= end) {
1400 return 0xFF;
1401 }
1402
1403 unsigned char fp = 0, code;
1404 for(const char* p = beg; p < end; ++p) {
1405 switch(*p) {
1406 case 'A': code = 0; break;
1407 case 'G': code = 0x01; break;
1408 case 'T': code = 0x02; break;
1409 case 'C': code = 0x03; break;
1410 default: err_index = p - beg; return 0x40; // incorrect char
1411 }
1412 fp = 0x3F & ((fp << 2) | code);
1413 }
1414
1415 return fp;
1416 }
1417
1418
x_FindFingerPrint64(const char * beg,const char * end,unsigned char fingerprint,size_t size,size_t & err_index)1419 const char* CNWAligner::x_FindFingerPrint64(
1420 const char* beg, const char* end,
1421 unsigned char fingerprint, size_t size,
1422 size_t& err_index)
1423 {
1424
1425 if(beg + size > end) {
1426 err_index = 0;
1427 return 0;
1428 }
1429
1430 const char* p0 = beg;
1431
1432 size_t err_idx = 0; --p0;
1433 unsigned char fp = 0x40;
1434 while(fp == 0x40 && p0 < end) {
1435 p0 += err_idx + 1;
1436 fp = x_CalcFingerPrint64(p0, p0 + size, err_idx);
1437 }
1438
1439 if(p0 >= end) {
1440 return end; // not found
1441 }
1442
1443 unsigned char code;
1444 while(fp != fingerprint && ++p0 < end) {
1445
1446 switch(*(p0 + size - 1)) {
1447 case 'A': code = 0; break;
1448 case 'G': code = 0x01; break;
1449 case 'T': code = 0x02; break;
1450 case 'C': code = 0x03; break;
1451 default: err_index = p0 + size - 1 - beg;
1452 return 0;
1453 }
1454
1455 fp = 0x3F & ((fp << 2) | code );
1456 }
1457
1458 return p0;
1459 }
1460
1461
MakePattern(const size_t guide_size,const size_t guide_core)1462 size_t CNWAligner::MakePattern(const size_t guide_size,
1463 const size_t guide_core)
1464 {
1465 if(guide_core > guide_size) {
1466 NCBI_THROW(CAlgoAlignException,
1467 eBadParameter,
1468 g_msg_NullParameter);
1469 }
1470
1471 vector<nwaln_mrnaseg> segs;
1472
1473 size_t err_idx(0);
1474 for(size_t i = 0; i + guide_size <= m_SeqLen1; ) {
1475 const char* beg = m_Seq1 + i;
1476 const char* end = m_Seq1 + i + guide_size;
1477 unsigned char fp = x_CalcFingerPrint64(beg, end, err_idx);
1478 if(fp != 0x40) {
1479 segs.push_back(nwaln_mrnaseg(i, i + guide_size - 1, fp));
1480 i += guide_size;
1481 }
1482 else {
1483 i += err_idx + 1;
1484 }
1485 }
1486
1487 vector<nwaln_mrnaguide> guides;
1488 size_t idx = 0;
1489 const char* beg = m_Seq2 + idx;
1490 const char* end = m_Seq2 + m_SeqLen2;
1491 for(size_t i = 0, seg_count = segs.size();
1492 beg + guide_size <= end && i < seg_count; ++i) {
1493
1494 const char* p = 0;
1495 const char* beg0 = beg;
1496 while( p == 0 && beg + guide_size <= end ) {
1497
1498 p = x_FindFingerPrint64( beg, end, segs[i].fp,
1499 guide_size, err_idx );
1500 if(p == 0) { // incorrect char
1501 beg += err_idx + 1;
1502 }
1503 else if (p < end) {// fingerprints match but check actual sequences
1504 const char* seq1 = m_Seq1 + segs[i].a;
1505 const char* seq2 = p;
1506 size_t k;
1507 for(k = 0; k < guide_size; ++k) {
1508 if(seq1[k] != seq2[k]) break;
1509 }
1510 if(k == guide_size) { // real match
1511 size_t i1 = segs[i].a;
1512 size_t i2 = segs[i].b;
1513 size_t i3 = seq2 - m_Seq2;
1514 size_t i4 = i3 + guide_size - 1;
1515 size_t guides_dim = guides.size();
1516 if( guides_dim == 0 ||
1517 i1 - 1 > guides[guides_dim - 1].q1 ||
1518 i3 - 1 > guides[guides_dim - 1].s1 ) {
1519 guides.push_back(nwaln_mrnaguide(i1, i2, i3, i4));
1520 }
1521 else { // expand the last guide
1522 guides[guides_dim - 1].q1 = i2;
1523 guides[guides_dim - 1].s1 = i4;
1524 }
1525 beg0 = p + guide_size;
1526 }
1527 else { // spurious match
1528 beg = p + 1;
1529 p = 0;
1530 }
1531 }
1532 }
1533 beg = beg0; // restore start pos in genomic sequence
1534 }
1535
1536 // initialize m_guides
1537 size_t guides_dim = guides.size();
1538 m_guides.clear();
1539 m_guides.resize(4*guides_dim);
1540 const size_t offs = guide_core/2 - 1;
1541 for(size_t k = 0; k < guides_dim; ++k) {
1542 size_t q0 = (guides[k].q0 + guides[k].q1) / 2;
1543 size_t s0 = (guides[k].s0 + guides[k].s1) / 2;
1544 m_guides[4*k] = q0 - offs;
1545 m_guides[4*k + 1] = q0 + offs;
1546 m_guides[4*k + 2] = s0 - offs;
1547 m_guides[4*k + 3] = s0 + offs;
1548 }
1549
1550 return m_guides.size();
1551 }
1552
1553 //////////////////////////////////////////////
1554 /////////////////////////////////////////////
1555
GetRightSeg(size_t * q0,size_t * q1,size_t * s0,size_t * s1,size_t min_size) const1556 size_t CNWAligner::GetRightSeg(size_t* q0, size_t* q1,
1557 size_t* s0, size_t* s1,
1558 size_t min_size) const
1559 {
1560 size_t trdim = m_Transcript.size();
1561 size_t cur = 0, maxseg = 0;
1562 const char* seq1_end = m_Seq1 + m_SeqLen1;
1563 const char* seq2_end = m_Seq2 + m_SeqLen2;
1564 const char* p1 = seq1_end - 1;
1565 const char* p2 = seq2_end - 1;
1566 size_t i0 = m_SeqLen1 - 1, j0 = m_SeqLen2 - 1,
1567 imax = i0, jmax = j0;
1568
1569 for(size_t k = 0; k < trdim; ++k) {
1570
1571 switch(m_Transcript[k]) {
1572
1573 case eTS_Insert: {
1574 --p2;
1575 if(cur > maxseg) {
1576 maxseg = cur;
1577 imax = i0;
1578 jmax = j0;
1579 if(maxseg >= min_size) goto ret_point;
1580 }
1581 cur = 0;
1582 }
1583 break;
1584
1585 case eTS_Delete: {
1586 --p1;
1587 if(cur > maxseg) {
1588 maxseg = cur;
1589 imax = i0;
1590 jmax = j0;
1591 if(maxseg >= min_size) goto ret_point;
1592 }
1593 cur = 0;
1594 }
1595 break;
1596
1597 case eTS_Match:
1598 case eTS_Replace: {
1599 if(*p1 == *p2) {
1600 if(cur == 0) {
1601 i0 = p1 - m_Seq1;
1602 j0 = p2 - m_Seq2;
1603 }
1604 ++cur;
1605 }
1606 else {
1607 if(cur > maxseg) {
1608 maxseg = cur;
1609 imax = i0;
1610 jmax = j0;
1611 if(maxseg >= min_size) goto ret_point;
1612 }
1613 cur = 0;
1614 }
1615 --p1;
1616 --p2;
1617 }
1618 break;
1619
1620 default: {
1621 NCBI_THROW( CAlgoAlignException, eInternal,
1622 g_msg_InvalidTranscriptSymbol );
1623 }
1624 }
1625 }
1626
1627 if(cur > maxseg) {
1628 maxseg = cur;
1629 imax = i0;
1630 jmax = j0;
1631 }
1632
1633 ret_point:
1634
1635 *q1 = imax; *s1 = jmax;
1636 *q0 = imax - maxseg + 1;
1637 *s0 = jmax - maxseg + 1;
1638
1639 return maxseg;
1640 }
1641
1642
GetLongestSeg(size_t * q0,size_t * q1,size_t * s0,size_t * s1) const1643 size_t CNWAligner::GetLongestSeg(size_t* q0, size_t* q1,
1644 size_t* s0, size_t* s1) const
1645 {
1646 size_t trdim = m_Transcript.size();
1647 size_t cur = 0, maxseg = 0;
1648 const char* p1 = m_Seq1;
1649 const char* p2 = m_Seq2;
1650 size_t i0 = 0, j0 = 0, imax = i0, jmax = j0;
1651
1652 for(int k = trdim-1; k >= 0; --k) {
1653
1654 switch(m_Transcript[k]) {
1655
1656 case eTS_Insert: {
1657 ++p2;
1658 if(cur > maxseg) {
1659 maxseg = cur;
1660 imax = i0;
1661 jmax = j0;
1662 }
1663 cur = 0;
1664 }
1665 break;
1666
1667 case eTS_Delete: {
1668 ++p1;
1669 if(cur > maxseg) {
1670 maxseg = cur;
1671 imax = i0;
1672 jmax = j0;
1673 }
1674 cur = 0;
1675 }
1676 break;
1677
1678 case eTS_Match:
1679 case eTS_Replace: {
1680 if(*p1 == *p2) {
1681 if(cur == 0) {
1682 i0 = p1 - m_Seq1;
1683 j0 = p2 - m_Seq2;
1684 }
1685 ++cur;
1686 }
1687 else {
1688 if(cur > maxseg) {
1689 maxseg = cur;
1690 imax = i0;
1691 jmax = j0;
1692 }
1693 cur = 0;
1694 }
1695 ++p1;
1696 ++p2;
1697 }
1698 break;
1699
1700 default: {
1701 NCBI_THROW( CAlgoAlignException, eInternal,
1702 g_msg_InvalidTranscriptSymbol );
1703 }
1704 }
1705 }
1706
1707 if(cur > maxseg) {
1708 maxseg = cur;
1709 imax = i0;
1710 jmax = j0;
1711 }
1712
1713 *q0 = imax; *s0 = jmax;
1714 *q1 = *q0 + maxseg - 1;
1715 *s1 = *s0 + maxseg - 1;
1716
1717 return maxseg;
1718 }
1719
1720
GetDense_seg(TSeqPos query_start,ENa_strand query_strand,TSeqPos subj_start,ENa_strand subj_strand,bool trim_end_gaps) const1721 CRef<CDense_seg> CNWAligner::GetDense_seg(TSeqPos query_start,
1722 ENa_strand query_strand,
1723 TSeqPos subj_start,
1724 ENa_strand subj_strand,
1725 bool trim_end_gaps) const
1726 {
1727 CNWFormatter::ESeqAlignFormatFlags flags;
1728 if(trim_end_gaps) {
1729 flags = CNWFormatter::eSAFF_TrimEndGaps;
1730 } else {
1731 flags = CNWFormatter::eSAFF_None;
1732 }
1733
1734 CNWFormatter fmt(*this);
1735
1736 return fmt.AsDenseSeg(query_start, query_strand,
1737 subj_start, subj_strand, flags);
1738 }
1739
1740
GetDense_seg(TSeqPos query_start,ENa_strand query_strand,const CSeq_id & query_id,TSeqPos subj_start,ENa_strand subj_strand,const CSeq_id & subj_id,bool trim_end_gaps) const1741 CRef<CDense_seg> CNWAligner::GetDense_seg(TSeqPos query_start,
1742 ENa_strand query_strand,
1743 const CSeq_id& query_id,
1744 TSeqPos subj_start,
1745 ENa_strand subj_strand,
1746 const CSeq_id& subj_id,
1747 bool trim_end_gaps) const
1748 {
1749 CNWFormatter::ESeqAlignFormatFlags flags;
1750 if(trim_end_gaps) {
1751 flags = CNWFormatter::eSAFF_TrimEndGaps;
1752 } else {
1753 flags = CNWFormatter::eSAFF_None;
1754 }
1755
1756 CNWFormatter fmt(*this);
1757
1758 CConstRef<CSeq_id> id0(&query_id);
1759 CConstRef<CSeq_id> id1(&subj_id);
1760 fmt.SetSeqIds(id0, id1);
1761
1762 return fmt.AsDenseSeg(query_start, query_strand,
1763 subj_start, subj_strand, flags);
1764 }
1765
1766 END_NCBI_SCOPE
1767