1 /* $Id: nw_aligner.cpp 609384 2020-06-01 14:49:07Z brovervv $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Yuri Kapustin, Boris Kiryutin, Alexander Souvorov
27  *
28  * File Description:  CNWAligner implementation
29  *
30  * ===========================================================================
31  *
32  */
33 
34 
35 #include <ncbi_pch.hpp>
36 
37 #include "nw_aligner_threads.hpp"
38 #include "messages.hpp"
39 
40 #include <corelib/ncbi_system.hpp>
41 #include <algo/align/nw/align_exception.hpp>
42 #include <algo/align/nw/nw_formatter.hpp>
43 #include <objects/seqloc/Seq_id.hpp>
44 #include <objects/seqalign/Dense_seg.hpp>
45 
46 #include <objmgr/scope.hpp>
47 #include <objmgr/seq_vector.hpp>
48 
49 #include <algorithm>
50 
51 
52 BEGIN_NCBI_SCOPE
53 USING_SCOPE(objects);
54 
55 
CNWAligner()56 CNWAligner::CNWAligner()
57     : m_Wm(GetDefaultWm()),
58       m_Wms(GetDefaultWms()),
59       m_Wg(GetDefaultWg()),
60       m_Ws(GetDefaultWs()),
61       m_esf_L1(false), m_esf_R1(false), m_esf_L2(false), m_esf_R2(false),
62       m_SmithWaterman(false),
63       m_GapPreference(eLater),
64       m_abc(g_nwaligner_nucleotides),
65       m_ScoreMatrixInvalid(true),
66       m_prg_callback(0),
67       m_terminate(false),
68       m_Seq1Vec(),
69       m_Seq1(0), m_SeqLen1(0),
70       m_Seq2Vec(),
71       m_Seq2(0), m_SeqLen2(0),
72       m_PositivesAsMatches(false),
73       m_score(kInfMinus),
74       m_mt(false),
75       m_maxthreads(1),
76       m_MaxMem(GetDefaultSpaceLimit())
77 {
78     SetScoreMatrix(0);
79 }
80 
81 
CNWAligner(const char * seq1,size_t len1,const char * seq2,size_t len2,const SNCBIPackedScoreMatrix * scoremat)82 CNWAligner::CNWAligner( const char* seq1, size_t len1,
83                         const char* seq2, size_t len2,
84                         const SNCBIPackedScoreMatrix* scoremat )
85 
86     : m_Wm(GetDefaultWm()),
87       m_Wms(GetDefaultWms()),
88       m_Wg(GetDefaultWg()),
89       m_Ws(GetDefaultWs()),
90       m_esf_L1(false), m_esf_R1(false), m_esf_L2(false), m_esf_R2(false),
91       m_SmithWaterman(false),
92       m_GapPreference(eLater),
93       m_abc(g_nwaligner_nucleotides),
94       m_ScoreMatrixInvalid(true),
95       m_prg_callback(0),
96       m_terminate(false),
97       m_Seq1Vec(&seq1[0], &seq1[0]+len1),
98       m_Seq1(&m_Seq1Vec[0]), m_SeqLen1(len1),
99       m_Seq2Vec(&seq2[0], &seq2[0]+len2),
100       m_Seq2(&m_Seq2Vec[0]), m_SeqLen2(len2),
101       m_PositivesAsMatches(false),
102       m_score(kInfMinus),
103       m_mt(false),
104       m_maxthreads(1),
105       m_MaxMem(GetDefaultSpaceLimit())
106 {
107     SetScoreMatrix(scoremat);
108     SetSequences(seq1, len1, seq2, len2);
109 }
110 
111 
CNWAligner(const string & seq1,const string & seq2,const SNCBIPackedScoreMatrix * scoremat)112 CNWAligner::CNWAligner(const string& seq1,
113                        const string& seq2,
114                        const SNCBIPackedScoreMatrix* scoremat)
115     : m_Wm(GetDefaultWm()),
116       m_Wms(GetDefaultWms()),
117       m_Wg(GetDefaultWg()),
118       m_Ws(GetDefaultWs()),
119       m_esf_L1(false), m_esf_R1(false), m_esf_L2(false), m_esf_R2(false),
120       m_SmithWaterman(false),
121       m_GapPreference(eLater),
122       m_abc(g_nwaligner_nucleotides),
123       m_ScoreMatrixInvalid(true),
124       m_prg_callback(0),
125       m_terminate(false),
126       m_Seq1Vec(seq1.begin(), seq1.end()),
127       m_Seq1(&m_Seq1Vec[0]), m_SeqLen1(seq1.size()),
128       m_Seq2Vec(seq2.begin(), seq2.end()),
129       m_Seq2(&m_Seq2Vec[0]), m_SeqLen2(seq2.size()),
130       m_score(kInfMinus),
131       m_mt(false),
132       m_maxthreads(1),
133       m_MaxMem(GetDefaultSpaceLimit())
134 {
135     SetScoreMatrix(scoremat);
136     SetSequences(seq1, seq2);
137 };
138 
139 
SetSequences(const char * seq1,size_t len1,const char * seq2,size_t len2,bool verify)140 void CNWAligner::SetSequences(const char* seq1, size_t len1,
141 			      const char* seq2, size_t len2,
142 			      bool verify)
143 {
144     if(!seq1 || !seq2) {
145         NCBI_THROW(CAlgoAlignException, eBadParameter,
146                    g_msg_NullParameter);
147     }
148 
149     if(verify) {
150         size_t iErrPos1 = x_CheckSequence(seq1, len1);
151 	if(iErrPos1 < len1) {
152             CNcbiOstrstream oss;
153 	    oss << "The first sequence is inconsistent with the current "
154 		<< "scoring matrix type. "
155                 << "Position = " << iErrPos1
156                 << " Symbol = '" << seq1[iErrPos1] << "'";
157 
158 	    string message = CNcbiOstrstreamToString(oss);
159 	    NCBI_THROW(CAlgoAlignException, eInvalidCharacter, message);
160 	}
161 
162 	size_t iErrPos2 = x_CheckSequence(seq2, len2);
163 	if(iErrPos2 < len2) {
164             CNcbiOstrstream oss;
165 	    oss << "The second sequence is inconsistent with the current "
166 		<< "scoring matrix type. "
167                 << "Position = " << iErrPos2
168                 << " Symbol = '" << seq2[iErrPos2] << "'";
169 
170 	    string message = CNcbiOstrstreamToString(oss);
171 	    NCBI_THROW(CAlgoAlignException, eInvalidCharacter, message);
172 	}
173     }
174     m_Seq1Vec.assign(&seq1[0], &seq1[0]+len1);
175     m_Seq2Vec.assign(&seq2[0], &seq2[0]+len2);
176     m_Seq1 = &m_Seq1Vec[0];
177     m_SeqLen1 = len1;
178     m_Seq2 = &m_Seq2Vec[0];
179     m_SeqLen2 = len2;
180     m_Transcript.clear();
181 }
182 
183 
SetSequences(const string & seq1,const string & seq2,bool verify)184 void CNWAligner::SetSequences(const string& seq1,
185                               const string& seq2,
186                               bool verify)
187 {
188     SetSequences(seq1.data(), seq1.size(), seq2.data(), seq2.size(), verify);
189 }
190 
191 
SetEndSpaceFree(bool Left1,bool Right1,bool Left2,bool Right2)192 void CNWAligner::SetEndSpaceFree(bool Left1, bool Right1,
193                                  bool Left2, bool Right2)
194 {
195     m_esf_L1 = Left1;
196     m_esf_R1 = Right1;
197     m_esf_L2 = Left2;
198     m_esf_R2 = Right2;
199 }
200 
SetSmithWaterman(bool SW)201 void CNWAligner::SetSmithWaterman(bool SW)
202 {
203     m_SmithWaterman = SW;
204     if (SW) {
205         /// Smith-Waterman necessarily implies that all four ends are free
206         m_esf_L1 = m_esf_R1 = m_esf_L2 = m_esf_R2 = true;
207     }
208 }
209 
SetGapPreference(EGapPreference p)210 void CNWAligner::SetGapPreference(EGapPreference p)
211 {
212     m_GapPreference = p;
213 }
214 
215 // evaluate score for each possible alignment;
216 // fill out backtrace matrix
217 // bit coding (four bits per value): D E Ec Fc
218 // D:  1 if diagonal; 0 - otherwise
219 // E:  1 if space in 1st sequence; 0 if space in 2nd sequence
220 // Ec: 1 if gap in 1st sequence was extended; 0 if it is was opened
221 // Fc: 1 if gap in 2nd sequence was extended; 0 if it is was opened
222 //
223 
224 const unsigned char kMaskFc  = 0x01;
225 const unsigned char kMaskEc  = 0x02;
226 const unsigned char kMaskE   = 0x04;
227 const unsigned char kMaskD   = 0x08;
228 
x_Align(SAlignInOut * data)229 CNWAligner::TScore CNWAligner::x_Align(SAlignInOut* data)
230 {
231 
232 
233     //check data integrity
234 
235     if( m_SmithWaterman && ( data->m_offset1 || m_SeqLen1 != data->m_len1 ||
236                              data->m_offset2 || m_SeqLen2 != data->m_len2 ) ) {
237         NCBI_THROW(CAlgoAlignException, eBadParameter,
238                    "Smith-Waterman not compatible with offsets provided");
239     }
240 
241     if( m_SmithWaterman && ( !data->m_esf_L1 || !data->m_esf_R1 ||
242                              !data->m_esf_L2 || !data->m_esf_R2 ) ) {
243         NCBI_THROW(CAlgoAlignException, eBadParameter,
244                    "Smith-Waterman not compatible with end gap penalties");
245     }
246 
247     const size_t N1 = data->m_len1 + 1;
248     const size_t N2 = data->m_len2 + 1;
249 
250     vector<TScore> stl_rowV (N2), stl_rowF(N2);
251 
252     const TNCBIScore (* sm) [NCBI_FSM_DIM] = m_ScoreMatrix.s;
253 
254     if(m_prg_callback) {
255         m_prg_info.m_iter_total = N1*N2;
256         m_prg_info.m_iter_done = 0;
257         if( (m_terminate = m_prg_callback(&m_prg_info)) ) {
258             return 0;
259         }
260     }
261 
262     bool bFreeGapLeft1  = data->m_esf_L1 && data->m_offset1 == 0;
263     bool bFreeGapRight1 = data->m_esf_R1 &&
264                           m_SeqLen1 == data->m_offset1 + data->m_len1;
265 
266     bool bFreeGapLeft2  = data->m_esf_L2 && data->m_offset2 == 0;
267     bool bFreeGapRight2 = data->m_esf_R2 &&
268                           m_SeqLen2 == data->m_offset2 + data->m_len2;
269 
270     TScore wgleft1   = bFreeGapLeft1? 0: m_Wg;
271     TScore wsleft1   = bFreeGapLeft1? 0: m_Ws;
272     TScore wg1 = m_Wg, ws1 = m_Ws;
273 
274     // index calculation: [i,j] = i*n2 + j
275     CBacktraceMatrix4 backtrace_matrix (N1 * N2);
276     backtrace_matrix.SetAt(0, 0);
277 
278     // first row
279     // note that stl_rowF[0] is not used in the main cycle,
280     size_t k;
281     stl_rowV[0] = wgleft1;
282     for (k = 1; k < N2; ++k) {
283         stl_rowV[k] = stl_rowV[k-1] + wsleft1;
284         stl_rowF[k] = kInfMinus;
285         backtrace_matrix.SetAt(k, kMaskE | kMaskEc);
286     }
287     backtrace_matrix.Purge(k);
288     stl_rowV[0] = 0;
289 
290     if(m_prg_callback) {
291         m_prg_info.m_iter_done = k;
292         m_terminate = m_prg_callback(&m_prg_info);
293     }
294 
295     // gap penalties
296     TScore wgleft2 (bFreeGapLeft2? 0: m_Wg);
297     TScore wsleft2 (bFreeGapLeft2? 0: m_Ws);
298 
299     const char * seq1 = m_Seq1 + data->m_offset1;
300     const char * seq1_end = seq1 + data->m_len1;
301 
302     TScore V0 = wgleft2;
303     TScore V = 0;//best score in the current cell. Will be equal to the NW score at the end
304     TScore best_V = 0;//best score in the whole matrix aka score for SW
305 
306     --k;
307 
308     for(;  seq1 != seq1_end && !m_terminate;  ++seq1) {
309 
310         backtrace_matrix.SetAt(++k, kMaskFc);
311 
312         if( seq1 + 1 == seq1_end && bFreeGapRight1) {
313                 wg1 = ws1 = 0;
314         }
315 
316         unsigned char tracer;
317         const TNCBIScore * row_sc = sm[(size_t)*seq1];
318 
319         const char * seq2 = m_Seq2 + data->m_offset2;
320         const char * seq2_end = seq2 + data->m_len2;
321         TScore wg2 = m_Wg, ws2 = m_Ws;
322 
323         //best ending with gap in seq1 open  seq1 X- or extended seq1 X--
324         //                                   seq2 XX             seq2 XXX
325         TScore  E = kInfMinus;
326         //best ending with gap in seq2
327         TScore F;
328         //total best with
329         //best ending with match
330         TScore G;
331         //just temporary
332         TScore n0;
333         //total best
334         TScore * rowV    = &stl_rowV[0];//previos row
335         V = V0 += wsleft2;       //current row
336         //best ending with match
337         TScore * rowF    = &stl_rowF[0];
338 
339         for (; seq2 != seq2_end;) {
340 
341             G = *rowV + row_sc[(size_t)*seq2++];
342             *rowV = V;
343 
344             n0 = V + wg1;
345             if(E >= n0) {
346                 E += ws1;      // continue the gap
347                 tracer = kMaskEc;
348             }
349             else {
350                 E = n0 + ws1;  // open a new gap
351                 tracer = 0;
352             }
353 
354             if( bFreeGapRight2 && seq2 == seq2_end ) {
355                 wg2 = ws2 = 0;
356             }
357 
358             F = *++rowF;
359             n0 = *++rowV + wg2;
360             if(F >= n0) {
361                 F += ws2;
362                 tracer |= kMaskFc;
363             }
364             else {
365                 F = n0 + ws2;
366             }
367             *rowF = F;
368 
369             //best score
370             if( G < F || ( G == F && m_GapPreference == eLater) ) {
371                 if( E <= F ) {
372                     V = F;
373                 } else {
374                     V = E;
375                     tracer |= kMaskE;
376                 }
377             } else if( E > G || ( E == G && m_GapPreference == eLater) ) {
378                 V = E;
379                 tracer |= kMaskE;
380             } else {
381                 V = G;
382                 tracer |= kMaskD;
383             }
384 
385             if (m_SmithWaterman && V < 0 ) {
386                 V = 0;
387             }
388 
389             backtrace_matrix.SetAt(++k, tracer);
390 
391             if (V > best_V) {
392                 best_V = V;
393                 backtrace_matrix.SetBestPos(k);
394             }
395         }
396         *rowV = V;
397 
398         if(m_prg_callback) {
399             m_prg_info.m_iter_done = k;
400             if( (m_terminate = m_prg_callback(&m_prg_info)) ) {
401                 break;
402             }
403         }
404     }
405 
406     backtrace_matrix.Purge(++k);
407     backtrace_matrix.SetBestScore(best_V);
408 
409     /*
410     //print the matrix out
411     {{
412     cout<<endl;
413     int kk, ind1, ind2, width = 4;
414     cout<<setw(width)<<" ";
415     cout<<setw(width)<<"-";
416     for(ind2 = 0; ind2 < N2-1; ++ind2) {
417         cout<<setw(width)<<*(m_Seq2 + data->m_offset2 + ind2);
418     }
419     cout<<endl;
420     for(kk = 0,ind1 = 0; ind1 < N1; ++ind1) {
421         if(ind1) {
422             cout<<setw(width)<<(m_Seq1 + data->m_offset1)[ind1-1];
423         } else {
424             cout<<setw(width)<<"-";
425         }
426         for(ind2 = 0; ind2 < N2; ++ind2,++kk) {
427             string tstr;
428             unsigned char Key (backtrace_matrix[kk]);
429             if( Key & kMaskD ) tstr += "D";
430             else if ( Key & kMaskE ) tstr += "E";
431             else tstr += "F";
432             if( Key & kMaskEc )  tstr += "-";
433             if( Key & kMaskFc )  tstr += "|";
434             cout<<setw(width)<<tstr;
435         }
436         cout<<endl<<endl;
437     }
438     cout<<endl;
439     }}
440     //end of print the matrix out
441     */
442 
443     if(!m_terminate) {
444         x_SWDoBackTrace(backtrace_matrix, data);
445         //check back trace
446         TTranscript rv (data->m_transcript.size());
447         copy(data->m_transcript.rbegin(), data->m_transcript.rend(), rv.begin());
448         if(m_SmithWaterman) {
449             if( best_V != ScoreFromTranscript(rv,  data->m_offset1,  data->m_offset2) ) {
450                 NCBI_THROW(CAlgoAlignException, eInternal,
451                            "CNWAligner: error in back trace");
452             }
453         } else {
454             if( V != ScoreFromTranscript(rv,  data->m_offset1,  data->m_offset2) ) {
455                 NCBI_THROW(CAlgoAlignException, eInternal,
456                            "CNWAligner: error in back trace");
457             }
458         }
459     }
460 
461     if(m_SmithWaterman) {
462         return best_V;
463     }
464     return V;
465 }
466 
467 
Run(CScope & scope,const CSeq_id & id1,const CSeq_id & id2,bool trim_end_gaps)468 CRef<CSeq_align> CNWAligner::Run(CScope &scope, const CSeq_id &id1,
469                                  const CSeq_id &id2, bool trim_end_gaps)
470 {
471     CSeq_loc loc1, loc2;
472     loc1.SetWhole().Assign(id1);
473     loc2.SetWhole().Assign(id2);
474     return Run(scope, loc1, loc2, trim_end_gaps);
475 }
476 
Run(CScope & scope,const CSeq_loc & loc1,const CSeq_loc & loc2,bool trim_end_gaps)477 CRef<CSeq_align> CNWAligner::Run(CScope &scope, const CSeq_loc &loc1,
478                                  const CSeq_loc &loc2, bool trim_end_gaps)
479 {
480     if ((!loc1.IsInt() && !loc1.IsWhole()) ||
481         (!loc1.IsInt() && !loc1.IsWhole()))
482     {
483         NCBI_THROW(CException, eUnknown,
484                    "Only whole and interval locations supported");
485     }
486     CSeqVector vec1(loc1, scope, CBioseq_Handle::eCoding_Iupac);
487     string seq1;
488     vec1.GetSeqData(0, vec1.size(), seq1);
489     CSeqVector vec2(loc2, scope, CBioseq_Handle::eCoding_Iupac);
490     string seq2;
491     vec2.GetSeqData(0, vec2.size(), seq2);
492     SetSequences(seq1,seq2);
493     Run();
494     CRef<CSeq_align> align(new CSeq_align);
495     align->SetType(CSeq_align::eType_partial);
496     align->SetSegs().SetDenseg(*GetDense_seg(
497         loc1.GetStart(eExtreme_Biological), loc1.GetStrand(), *loc1.GetId(),
498         loc2.GetStart(eExtreme_Biological), loc2.GetStrand(), *loc2.GetId(),
499         trim_end_gaps));
500     return align;
501 }
502 
Run()503 CNWAligner::TScore CNWAligner::Run()
504 {
505     if(m_ScoreMatrixInvalid) {
506 
507         NCBI_THROW(CAlgoAlignException, eInvalidMatrix,
508                    "CNWAligner::SetScoreMatrix(NULL) must be called "
509                    "after changing match/mismatch scores "
510                    "to make sure that the new parameters are engaged.");
511     }
512 
513     if(!m_Seq1 || !m_Seq2) {
514         NCBI_THROW(CAlgoAlignException, eNoSeqData,
515                    g_msg_DataNotAvailable);
516     }
517 
518     if(!x_CheckMemoryLimit()) {
519         NCBI_THROW(CAlgoAlignException, eMemoryLimit, g_msg_HitSpaceLimit);
520     }
521 
522     if (m_SmithWaterman && !m_guides.empty()) {
523         NCBI_THROW(CAlgoAlignException, eBadParameter,
524                    "Smith-Waterman not compatible with provided pattern");
525     }
526 
527     m_score = x_Run();
528 
529     return m_score;
530 }
531 
532 
x_Run()533 CNWAligner::TScore CNWAligner::x_Run()
534 {
535     try {
536 
537         m_terminate = false;
538 
539         if(m_guides.size() == 0) {
540 
541             SAlignInOut data (0, m_SeqLen1, m_esf_L1, m_esf_R1,
542                               0, m_SeqLen2, m_esf_L2, m_esf_R2);
543             m_score = x_Align(&data);
544             m_Transcript = data.m_transcript;
545         }
546         // run the algorithm for every segment between hits
547         else if(m_mt && m_maxthreads > 1) {
548 
549             size_t guides_dim = m_guides.size() / 4;
550 
551             // setup inputs
552             typedef vector<SAlignInOut> TDataVector;
553             TDataVector vdata;
554             vector<size_t> seed_dims;
555             {{
556                 vdata.reserve(guides_dim + 1);
557                 seed_dims.reserve(guides_dim + 1);
558                 size_t q1 = m_SeqLen1, q0, s1 = m_SeqLen2, s0;
559                 for(size_t istart = 4*guides_dim, i = istart; i != 0; i -= 4) {
560 
561                     q0 = m_guides[i - 3] + 1;
562                     s0 = m_guides[i - 1] + 1;
563                     size_t dim_query = q1 - q0, dim_subj = s1 - s0;
564 
565                     bool esf_L1 = false, esf_R1 = false,
566                         esf_L2 = false, esf_R2 = false;
567                     if(i == istart) {
568                         esf_R1 = m_esf_R1;
569                         esf_R2 = m_esf_R2;
570                     }
571 
572                     SAlignInOut data (q0, dim_query, esf_L1, esf_R1,
573                                       s0, dim_subj, esf_L2, esf_R2);
574 
575                     vdata.push_back(data);
576                     seed_dims.push_back(m_guides[i-3] - m_guides[i-4] + 1);
577 
578                     q1 = m_guides[i - 4];
579                     s1 = m_guides[i - 2];
580                 }
581                 SAlignInOut data(0, q1, m_esf_L1, false,
582                                  0, s1, m_esf_L2, false);
583                 vdata.push_back(data);
584             }}
585 
586             // rearrange vdata so that the largest chunks come first
587             typedef vector<SAlignInOut*> TDataPtrVector;
588             TDataPtrVector vdata_p (vdata.size());
589             {{
590                 TDataPtrVector::iterator jj = vdata_p.begin();
591                 NON_CONST_ITERATE(TDataVector, ii, vdata) {
592                     *jj++ = &(*ii);
593                 }
594                 stable_sort(vdata_p.begin(), vdata_p.end(),
595                             SAlignInOut::PSpace);
596             }}
597 
598             // align over the segments
599             {{
600                 m_Transcript.clear();
601                 size_t idim = vdata.size();
602 
603                 typedef vector<CNWAlignerThread_Align*> TThreadVector;
604                 TThreadVector threads;
605                 threads.reserve(idim);
606 
607                 ITERATE(TDataPtrVector, ii, vdata_p) {
608 
609                     SAlignInOut& data = **ii;
610 
611                     if(data.GetSpace() >= 10000000 &&
612                        NW_RequestNewThread(m_maxthreads)) {
613 
614                         CNWAlignerThread_Align* thread =
615                             new CNWAlignerThread_Align(this, &data);
616                         threads.push_back(thread);
617                         thread->Run();
618                     }
619                     else {
620                         x_Align(&data);
621                     }
622                 }
623 
624                 auto_ptr<CException> e;
625                 ITERATE(TThreadVector, ii, threads) {
626 
627                     if(e.get() == 0) {
628                         CException* pe = 0;
629                         (*ii)->Join(reinterpret_cast<void**>(&pe));
630                         if(pe) {
631                             e.reset(new CException (*pe));
632                         }
633                     }
634                     else {
635                         (*ii)->Join(0);
636                     }
637                 }
638                 if(e.get()) {
639                     throw *e;
640                 }
641 
642                 for(size_t idata = 0; idata < idim; ++idata) {
643 
644                     SAlignInOut& data = vdata[idata];
645                     copy(data.m_transcript.begin(), data.m_transcript.end(),
646                          back_inserter(m_Transcript));
647                     if(idata + 1 < idim) {
648                         for(size_t k = 0; k < seed_dims[idata]; ++k) {
649                             m_Transcript.push_back(eTS_Match);
650                         }
651                     }
652                 }
653 
654                 m_score = ScoreFromTranscript(GetTranscript(false), 0, 0);
655             }}
656         }
657         else {
658 
659             m_Transcript.clear();
660             size_t guides_dim = m_guides.size() / 4;
661             size_t q1 = m_SeqLen1, q0, s1 = m_SeqLen2, s0;
662             for(size_t istart = 4*guides_dim, i = istart; i != 0; i -= 4) {
663 
664                 q0 = m_guides[i - 3] + 1;
665                 s0 = m_guides[i - 1] + 1;
666                 size_t dim_query = q1 - q0, dim_subj = s1 - s0;
667 
668                 bool esf_L1 = false, esf_R1 = false,
669                      esf_L2 = false, esf_R2 = false;
670                 if(i == istart) {
671                     esf_R1 = m_esf_R1;
672                     esf_R2 = m_esf_R2;
673                 }
674 
675                 SAlignInOut data (q0, dim_query, esf_L1, esf_R1,
676                                   s0, dim_subj, esf_L2, esf_R2);
677 
678                 x_Align(&data);
679                 copy(data.m_transcript.begin(), data.m_transcript.end(),
680                      back_inserter(m_Transcript));
681 
682                 size_t dim_hit = m_guides[i - 3] - m_guides[i - 4] + 1;
683                 for(size_t k = 0; k < dim_hit; ++k) {
684                     m_Transcript.push_back(eTS_Match);
685                 }
686                 q1 = m_guides[i - 4];
687                 s1 = m_guides[i - 2];
688             }
689             SAlignInOut data(0, q1, m_esf_L1, false,
690                              0, s1, m_esf_L2, false);
691             x_Align(&data);
692             copy(data.m_transcript.begin(), data.m_transcript.end(),
693                  back_inserter(m_Transcript));
694 
695             m_score = ScoreFromTranscript(GetTranscript(false), 0, 0);
696         }
697     }
698 
699     catch(std::bad_alloc&) {
700         NCBI_THROW(CAlgoAlignException, eMemoryLimit, g_msg_OutOfSpace);
701     }
702 
703     return m_score;
704 }
705 
706 
x_GetDiagTS(size_t i1,size_t i2) const707 CNWAligner::ETranscriptSymbol CNWAligner::x_GetDiagTS(size_t i1, size_t i2)
708 const
709 {
710     const unsigned char c1 = m_Seq1[i1];
711     const unsigned char c2 = m_Seq2[i2];
712 
713     ETranscriptSymbol ts;
714     if(m_PositivesAsMatches) {
715         ts = m_ScoreMatrix.s[c1][c2] > 0? eTS_Match: eTS_Replace;
716     }
717     else {
718         // N vs N should be mismatch for nucleotides,
719         // and X vs X in protein. Check with matrix
720         ts = (toupper(c1) == toupper(c2) && m_ScoreMatrix.s[c1][c2] > 0)? eTS_Match: eTS_Replace;
721     }
722     return ts;
723 }
724 
725 // perform backtrace step, NW only
x_DoBackTrace(const CBacktraceMatrix4 & backtrace,SAlignInOut * data)726 void CNWAligner::x_DoBackTrace(const CBacktraceMatrix4 & backtrace,
727                                SAlignInOut* data)
728 {
729     const size_t N1 (data->m_len1 + 1);
730     const size_t N2 (data->m_len2 + 1);
731 
732     data->m_transcript.clear();
733     data->m_transcript.reserve(N1 + N2);
734 
735     size_t k  (N1*N2 - 1);
736     size_t i1 (data->m_offset1 + data->m_len1 - 1);
737     size_t i2 (data->m_offset2 + data->m_len2 - 1);
738     while (k != 0) {
739 
740         unsigned char Key (backtrace[k]);
741 
742         if (Key & kMaskD) {
743 
744             data->m_transcript.push_back(x_GetDiagTS(i1--, i2--));
745             k -= N2 + 1;
746         }
747         else if (Key & kMaskE) {
748 
749             data->m_transcript.push_back(eTS_Insert);
750             --k;
751             --i2;
752             while(k > 0 && (Key & kMaskEc)) {
753 
754                 data->m_transcript.push_back(eTS_Insert);
755                 Key = backtrace[k--];
756                 --i2;
757             }
758         }
759         else {
760 
761             data->m_transcript.push_back(eTS_Delete);
762             k -= N2;
763             --i1;
764             while(k > 0 && (Key & kMaskFc)) {
765 
766                 data->m_transcript.push_back(eTS_Delete);
767                 Key = backtrace[k];
768                 k -= N2;
769                 --i1;
770             }
771         }
772     }
773 }
774 
775 
776 
777 // perform backtrace step, NW + SW
x_SWDoBackTrace(const CBacktraceMatrix4 & backtrace,SAlignInOut * data)778 void CNWAligner::x_SWDoBackTrace(const CBacktraceMatrix4 & backtrace,
779                                SAlignInOut* data)
780 {
781     const size_t N1 (data->m_len1 + 1);
782     const size_t N2 (data->m_len2 + 1);
783     const TNCBIScore (* sm) [NCBI_FSM_DIM] = m_ScoreMatrix.s;
784 
785     data->m_transcript.clear();
786     data->m_transcript.reserve(N1 + N2);
787 
788     size_t k  (N1*N2 - 1);
789     size_t i1 (data->m_offset1 + data->m_len1 - 1);
790     size_t i2 (data->m_offset2 + data->m_len2 - 1);
791     if (m_SmithWaterman) {
792         size_t sw_k =  backtrace.BestPos();
793         data->FillEdgeGaps(k - sw_k, true);
794         i1 -= (k - sw_k) / (data->m_len2+1);
795         i2 -= (k - sw_k) % (data->m_len2+1);
796         k = backtrace.BestPos();
797     }
798 
799     //score for SmithWaterman. Stop when score == 0
800     TNCBIScore score = backtrace.BestScore();
801 
802     while (k > 0 && ( !m_SmithWaterman || score > 0 ) ) {
803 
804         unsigned char Key (backtrace[k]);
805 
806         if (Key & kMaskD) {
807             score -= sm[(size_t)(m_Seq1[i1])][(size_t)(m_Seq2[i2])];
808             data->m_transcript.push_back(x_GetDiagTS(i1--, i2--));
809             k -= N2 + 1;
810         }
811         else if (Key & kMaskE) {
812             score -= m_Wg + m_Ws;
813             data->m_transcript.push_back(eTS_Insert);
814             --k;
815             --i2;
816            while(k > 0 && (Key & kMaskEc)) {
817                score -= m_Ws;
818                 data->m_transcript.push_back(eTS_Insert);
819                 Key = backtrace[k--];
820                 --i2;
821             }
822          }
823         else {
824             score -= m_Wg + m_Ws;
825             data->m_transcript.push_back(eTS_Delete);
826             k -= N2;
827             --i1;
828             while(k > 0 && (Key & kMaskFc)) {
829                score -= m_Ws;
830                 data->m_transcript.push_back(eTS_Delete);
831                 Key = backtrace[k];
832                 k -= N2;
833                 --i1;
834             }
835         }
836     }
837     if( m_SmithWaterman && score != 0 ) {
838         NCBI_THROW(CAlgoAlignException, eInternal,
839                    "negative score in Smith-Waterman back trace");
840     }
841     data->FillEdgeGaps(k, false);
842 }
843 
844 
SetPattern(const vector<size_t> & guides)845 void  CNWAligner::SetPattern(const vector<size_t>& guides)
846 {
847     size_t dim = guides.size();
848     const char* err = 0;
849     if(dim % 4 == 0) {
850         for(size_t i = 0; i < dim; i += 4) {
851 
852             if( guides[i] > guides[i+1] || guides[i+2] > guides[i+3] ) {
853                 err = "Pattern hits must be specified in plus strand";
854 		break;
855             }
856 
857             if(i > 4) {
858                 if(guides[i] <= guides[i-3] || guides[i+2] <= guides[i-2]){
859                     err = "Pattern hits coordinates must be sorted";
860                     break;
861                 }
862             }
863 
864             size_t dim1 = guides[i + 1] - guides[i];
865             size_t dim2 = guides[i + 3] - guides[i + 2];
866             if( dim1 != dim2) {
867                 err = "Pattern hits must have equal length on both sequences";
868                 break;
869             }
870 
871             if(guides[i+1] >= m_SeqLen1 || guides[i+3] >= m_SeqLen2) {
872                 err = "One or several pattern hits are out of range";
873                 break;
874             }
875         }
876     }
877     else {
878         err = "Pattern must have a dimension multiple of four";
879     }
880 
881     if(err) {
882         NCBI_THROW(CAlgoAlignException, eBadParameter, err);
883     }
884     else {
885         m_guides = guides;
886     }
887 }
888 
889 
GetEndSpaceFree(bool * L1,bool * R1,bool * L2,bool * R2) const890 void CNWAligner::GetEndSpaceFree(bool* L1, bool* R1, bool* L2, bool* R2) const
891 {
892     if(L1) *L1 = m_esf_L1;
893     if(R1) *R1 = m_esf_R1;
894     if(L2) *L2 = m_esf_L2;
895     if(R2) *R2 = m_esf_R2;
896 }
897 
IsSmithWaterman() const898 bool CNWAligner::IsSmithWaterman() const
899 {
900     return m_SmithWaterman;
901 }
902 
GetGapPreference() const903 CNWAligner::EGapPreference CNWAligner::GetGapPreference() const
904 {
905     return m_GapPreference;
906 }
907 
908 // return raw transcript
GetTranscript(bool reversed) const909 CNWAligner::TTranscript CNWAligner::GetTranscript(bool reversed) const
910 {
911     TTranscript rv (m_Transcript.size());
912     if(reversed) {
913         copy(m_Transcript.begin(), m_Transcript.end(), rv.begin());
914     }
915     else {
916         copy(m_Transcript.rbegin(), m_Transcript.rend(), rv.begin());
917     }
918     return rv;
919 }
920 
921 
922 // alignment 'restore' - set raw transcript
SetTranscript(const TTranscript & transcript)923 void CNWAligner::SetTranscript(const TTranscript& transcript)
924 {
925     m_Transcript = transcript;
926     m_score = ScoreFromTranscript(transcript);
927 }
928 
929 
930 // Return transcript as a readable string
GetTranscriptString(void) const931 string CNWAligner::GetTranscriptString(void) const
932 {
933     const int dim (m_Transcript.size());
934     string s;
935     s.resize(dim);
936     size_t i1 (0), i2 (0), i (0);
937 
938     for (int k (dim - 1); k >= 0; --k) {
939 
940         ETranscriptSymbol c0 (m_Transcript[k]);
941         char c (0);
942         switch ( c0 ) {
943 
944             case eTS_Match: {
945 
946                 if(m_Seq1 && m_Seq2) {
947                     ETranscriptSymbol ts = x_GetDiagTS(i1++, i2++);
948                     c = ts == eTS_Match? 'M': 'R';
949                 }
950                 else {
951                     c = 'M';
952                 }
953             }
954             break;
955 
956             case eTS_Replace: {
957 
958                 if(m_Seq1 && m_Seq2) {
959                     ETranscriptSymbol ts = x_GetDiagTS(i1++, i2++);
960                     c = ts == eTS_Match? 'M': 'R';
961                 }
962                 else {
963                     c = 'R';
964                 }
965             }
966             break;
967 
968             case eTS_Insert: {
969                 c = 'I';
970                 ++i2;
971             }
972             break;
973 
974             case eTS_SlackInsert: {
975                 c = 'i';
976                 ++i2;
977             }
978             break;
979 
980             case eTS_SlackDelete: {
981                 c = 'd';
982                 ++i1;
983             }
984             break;
985 
986             case eTS_Delete: {
987                 c = 'D';
988                 ++i1;
989             }
990             break;
991 
992             case eTS_Intron: {
993                 c = '+';
994                 ++i2;
995             }
996             break;
997 
998 	    default: {
999 	      NCBI_THROW(CAlgoAlignException, eInternal,
1000                          g_msg_InvalidTranscriptSymbol);
1001 	    }
1002         }
1003 
1004         s[i++] = c;
1005     }
1006 
1007     if(i < s.size()) {
1008         s.resize(i + 1);
1009     }
1010 
1011     return s;
1012 }
1013 
1014 
SetProgressCallback(FProgressCallback prg_callback,void * data)1015 void CNWAligner::SetProgressCallback(FProgressCallback prg_callback, void* data)
1016 {
1017     m_prg_callback = prg_callback;
1018     m_prg_info.m_data = data;
1019 }
1020 
1021 
SetWms(TScore val)1022 void CNWAligner::SetWms(TScore val)
1023 {
1024     m_Wms = val;
1025     m_ScoreMatrixInvalid = true;
1026 }
1027 
1028 
SetWm(TScore val)1029 void CNWAligner::SetWm(TScore val)
1030 {
1031     m_Wm = val;
1032     m_ScoreMatrixInvalid = true;
1033 }
1034 
1035 
SetScoreMatrix(const SNCBIPackedScoreMatrix * psm)1036 void CNWAligner::SetScoreMatrix(const SNCBIPackedScoreMatrix* psm)
1037 {
1038     if(psm) {
1039 
1040         m_abc = psm->symbols;
1041 	NCBISM_Unpack(psm, &m_ScoreMatrix);
1042     }
1043     else { // assume IUPACna
1044 
1045         m_abc = g_nwaligner_nucleotides;
1046         const size_t dim = strlen(m_abc);
1047         vector<TNCBIScore> iupacna (dim*dim, m_Wms);
1048         iupacna[0] = iupacna[dim+1] = iupacna[2*(dim+1)] =
1049             iupacna[3*(dim+1)] = m_Wm;
1050         SNCBIPackedScoreMatrix iupacna_psm;
1051         iupacna_psm.symbols = g_nwaligner_nucleotides;
1052         iupacna_psm.scores = &iupacna.front();
1053         iupacna_psm.defscore = m_Wms;
1054         NCBISM_Unpack(&iupacna_psm, &m_ScoreMatrix);
1055     }
1056     m_ScoreMatrixInvalid = false;
1057 }
1058 
1059 
1060 // Check that all characters in sequence are valid for
1061 // the current sequence type.
1062 // Return an index to the first invalid character
1063 // or len if all characters are valid.
x_CheckSequence(const char * seq,size_t len) const1064 size_t CNWAligner::x_CheckSequence(const char* seq, size_t len) const
1065 {
1066     char Flags [256];
1067     memset(Flags, 0, sizeof Flags);
1068     const size_t abc_size = strlen(m_abc);
1069 
1070     size_t k;
1071     for(k = 0; k < abc_size; ++k) {
1072         Flags[unsigned(toupper((unsigned char) m_abc[k]))] = 1;
1073         Flags[unsigned(tolower((unsigned char) m_abc[k]))] = 1;
1074         Flags[unsigned(k)] = 1;
1075     }
1076 
1077     for(k = 0; k < len; ++k) {
1078         if(Flags[unsigned(seq[k])] == 0)
1079             break;
1080     }
1081 
1082     return k;
1083 }
1084 
1085 
GetScore() const1086 CNWAligner::TScore CNWAligner::GetScore() const
1087 {
1088   if(m_Transcript.size()) {
1089       return m_score;
1090   }
1091   else {
1092     NCBI_THROW(CAlgoAlignException, eNoSeqData,
1093                g_msg_NoAlignment);
1094   }
1095 }
1096 
1097 
x_CheckMemoryLimit()1098 bool CNWAligner::x_CheckMemoryLimit()
1099 {
1100     const size_t elem_size (GetElemSize());
1101     const size_t gdim (m_guides.size());
1102     double mem (0);
1103 
1104     if(gdim) {
1105 
1106         size_t dim1 (m_guides[0]), dim2 (m_guides[2]);
1107         mem = double(dim1) * dim2 * elem_size;
1108         if(mem <= m_MaxMem) {
1109 
1110             for(size_t i (4); i < gdim; i += 4) {
1111 
1112                 dim1 = m_guides[i] - m_guides[i-3] + 1;
1113                 dim2 = m_guides[i + 2] - m_guides[i-1] + 1;
1114                 mem = double(dim1) * dim2 * elem_size;
1115                 if(mem > m_MaxMem) {
1116                     break;
1117                 }
1118             }
1119 
1120             if(mem <= m_MaxMem) {
1121                 dim1 = m_SeqLen1 - m_guides[gdim-3];
1122                 dim2 = m_SeqLen2 - m_guides[gdim-1];
1123                 mem = double(dim1) * dim2 * elem_size;
1124             }
1125         }
1126     }
1127     else {
1128         mem = double(m_SeqLen1 + 1) * (m_SeqLen2 + 1) * elem_size;
1129     }
1130 
1131     return mem < m_MaxMem;
1132 }
1133 
1134 
EnableMultipleThreads(bool enable)1135 void CNWAligner::EnableMultipleThreads(bool enable)
1136 {
1137     m_maxthreads = (m_mt = enable)? GetCpuCount(): 1;
1138 }
1139 
1140 
ScoreFromTranscript(const TTranscript & transcript,size_t start1,size_t start2) const1141 CNWAligner::TScore CNWAligner::ScoreFromTranscript(
1142                        const TTranscript& transcript,
1143                        size_t start1, size_t start2) const
1144 {
1145     bool nucl_mode;
1146     if(start1 == kMax_UInt && start2 == kMax_UInt) {
1147         nucl_mode = true;
1148     }
1149     else if(start1 != kMax_UInt && start2 != kMax_UInt) {
1150         nucl_mode = false;
1151     }
1152     else {
1153         NCBI_THROW(CAlgoAlignException, eInternal,
1154                    g_msg_InconsistentArguments);
1155     }
1156 
1157     size_t dim (transcript.size());
1158     if(dim == 0) {
1159         return 0;
1160     }
1161 
1162     TScore score (0);
1163 
1164     const char* p1 (m_Seq1 + start1);
1165     const char* p2 (m_Seq2 + start2);
1166 
1167     int state1(0);   // 0 = normal, 1 = gap;
1168     int state2(0);   // 0 = normal, 1 = gap;
1169 
1170     size_t i (0);
1171 
1172     const TNCBIScore (*sm) [NCBI_FSM_DIM] = m_ScoreMatrix.s;
1173 
1174     ///SmithWaterman alterations
1175     if( IsSmithWaterman() ) {
1176         //cut beginning gaps
1177         for(; i<dim; ++i) {
1178             if( transcript[i] == eTS_Insert ) {
1179                 ++p2;
1180             } else if( transcript[i] == eTS_Delete ) {
1181                 ++p1;
1182             } else if( transcript[i] == eTS_Match ||
1183                        transcript[i] == eTS_Replace ) {
1184                 break;
1185             } else {
1186                 NCBI_THROW(CAlgoAlignException, eInternal, g_msg_InvalidTranscriptSymbol);
1187             }
1188         }
1189         if( i == dim ) {//alignment consists of gaps only
1190             return score;
1191         }
1192         //cut trailing gaps
1193       //for(size_t endi = dim-1; endi >=0; --endi) {   // never stops CXX-11373
1194         for(size_t endi = dim; endi-- > 0; ) {
1195               if( transcript[endi] == eTS_Match || transcript[endi] == eTS_Replace ) {
1196                 dim = endi + 1;
1197                 break;
1198             } else if( transcript[endi] != eTS_Insert  && transcript[endi] != eTS_Delete ) {
1199                 NCBI_THROW(CAlgoAlignException, eInternal, g_msg_InvalidTranscriptSymbol);
1200             }
1201         }
1202     }
1203 
1204     for(; i < dim; ++i) {
1205 
1206         ETranscriptSymbol ts (transcript[i]);
1207 
1208         switch(ts) {
1209 
1210         case eTS_Match:
1211         case eTS_Replace: {
1212             if(nucl_mode) {
1213                 score += (ts == eTS_Match)? m_Wm: m_Wms;
1214             }
1215             else {
1216                 unsigned char c1 = *p1;
1217                 unsigned char c2 = *p2;
1218                 score += sm[c1][c2];
1219                 ++p1; ++p2;
1220             }
1221             state1 = state2 = 0;
1222         }
1223         break;
1224 
1225         case eTS_Insert: {
1226 
1227             if(state1 != 1) score += m_Wg;
1228             state1 = 1; state2 = 0;
1229             score += m_Ws;
1230             ++p2;
1231         }
1232         break;
1233 
1234         case eTS_Delete: {
1235 
1236             if(state2 != 1) score += m_Wg;
1237             state1 = 0; state2 = 1;
1238             score += m_Ws;
1239             ++p1;
1240         }
1241         break;
1242 
1243         default:
1244             NCBI_THROW(CAlgoAlignException, eInternal, g_msg_InvalidTranscriptSymbol);
1245         }
1246     }
1247 
1248     if( IsSmithWaterman() ) {//end gap scores are already excluded
1249         return score;
1250     }
1251 
1252     if(m_esf_L1) {
1253         size_t g = 0;
1254         for(size_t i = 0; i < dim; ++i) {
1255             if(transcript[i] == eTS_Insert) ++g; else break;
1256         }
1257         if(g > 0) {
1258             score -= (m_Wg + g*m_Ws);
1259         }
1260     }
1261 
1262     if(m_esf_L2) {
1263         size_t g = 0;
1264         for(size_t i = 0; i < dim; ++i) {
1265             if(transcript[i] == eTS_Delete) ++g; else break;
1266         }
1267         if(g > 0) {
1268             score -= (m_Wg + g*m_Ws);
1269         }
1270     }
1271 
1272     if(m_esf_R1) {
1273         size_t g = 0;
1274         for(int i = dim - 1; i >= 0; --i) {
1275             if(transcript[i] == eTS_Insert) ++g; else break;
1276         }
1277         if(g > 0) {
1278             score -= (m_Wg + g*m_Ws);
1279         }
1280     }
1281 
1282     if(m_esf_R2) {
1283         size_t g = 0;
1284         for(int i = dim - 1; i >= 0; --i) {
1285             if(transcript[i] == eTS_Delete) ++g; else break;
1286         }
1287         if(g > 0) {
1288             score -= (m_Wg + g*m_Ws);
1289         }
1290     }
1291 
1292     return score;
1293 }
1294 
1295 
GetLeftSeg(size_t * q0,size_t * q1,size_t * s0,size_t * s1,size_t min_size) const1296 size_t CNWAligner::GetLeftSeg(size_t* q0, size_t* q1,
1297                               size_t* s0, size_t* s1,
1298                               size_t min_size) const
1299 {
1300     size_t trdim = m_Transcript.size();
1301     size_t cur = 0, maxseg = 0;
1302     const char* p1 = m_Seq1;
1303     const char* p2 = m_Seq2;
1304     size_t i0 = 0, j0 = 0, imax = i0, jmax = j0;
1305 
1306     for(int k = trdim - 1; k >= 0; --k) {
1307 
1308         switch(m_Transcript[k]) {
1309 
1310             case eTS_Insert: {
1311                 ++p2;
1312                 if(cur > maxseg) {
1313                     maxseg = cur;
1314                     imax = i0;
1315                     jmax = j0;
1316                     if(maxseg >= min_size) goto ret_point;
1317                 }
1318                 cur = 0;
1319             }
1320             break;
1321 
1322             case eTS_Delete: {
1323             ++p1;
1324             if(cur > maxseg) {
1325                 maxseg = cur;
1326                 imax = i0;
1327                 jmax = j0;
1328                 if(maxseg >= min_size) goto ret_point;
1329             }
1330             cur = 0;
1331             }
1332             break;
1333 
1334             case eTS_Match:
1335             case eTS_Replace: {
1336                 if(*p1 == *p2) {
1337                     if(cur == 0) {
1338                         i0 = p1 - m_Seq1;
1339                         j0 = p2 - m_Seq2;
1340                     }
1341                     ++cur;
1342                 }
1343                 else {
1344                     if(cur > maxseg) {
1345                         maxseg = cur;
1346                         imax = i0;
1347                         jmax = j0;
1348                         if(maxseg >= min_size) goto ret_point;
1349                     }
1350                     cur = 0;
1351                 }
1352                 ++p1;
1353                 ++p2;
1354             }
1355             break;
1356 
1357             default: {
1358                 NCBI_THROW( CAlgoAlignException, eInternal,
1359                             g_msg_InvalidTranscriptSymbol );
1360             }
1361         }
1362     }
1363 
1364     if(cur > maxseg) {
1365       maxseg = cur;
1366       imax = i0;
1367       jmax = j0;
1368     }
1369 
1370  ret_point:
1371 
1372     *q0 = imax; *s0 = jmax;
1373     *q1 = *q0 + maxseg - 1;
1374     *s1 = *s0 + maxseg - 1;
1375 
1376     return maxseg;
1377 }
1378 
1379 /////////////////////////////////////////////////
1380 /// naive pattern generator (a la Rabin-Karp)
1381 
1382 struct nwaln_mrnaseg {
nwaln_mrnasegnwaln_mrnaseg1383     nwaln_mrnaseg(size_t i1, size_t i2, unsigned char fp0):
1384         a(i1), b(i2), fp(fp0) {}
1385     size_t a, b;
1386     unsigned char fp;
1387 };
1388 
1389 struct nwaln_mrnaguide {
nwaln_mrnaguidenwaln_mrnaguide1390     nwaln_mrnaguide(size_t i1, size_t i2, size_t i3, size_t i4):
1391         q0(i1), q1(i2), s0(i3), s1(i4) {}
1392     size_t q0, q1, s0, s1;
1393 };
1394 
1395 
x_CalcFingerPrint64(const char * beg,const char * end,size_t & err_index)1396 unsigned char CNWAligner::x_CalcFingerPrint64(
1397     const char* beg, const char* end, size_t& err_index)
1398 {
1399     if(beg >= end) {
1400         return 0xFF;
1401     }
1402 
1403     unsigned char fp = 0, code;
1404     for(const char* p = beg; p < end; ++p) {
1405         switch(*p) {
1406         case 'A': code = 0;    break;
1407         case 'G': code = 0x01; break;
1408         case 'T': code = 0x02; break;
1409         case 'C': code = 0x03; break;
1410         default:  err_index = p - beg; return 0x40; // incorrect char
1411         }
1412         fp = 0x3F & ((fp << 2) | code);
1413     }
1414 
1415     return fp;
1416 }
1417 
1418 
x_FindFingerPrint64(const char * beg,const char * end,unsigned char fingerprint,size_t size,size_t & err_index)1419 const char* CNWAligner::x_FindFingerPrint64(
1420     const char* beg, const char* end,
1421     unsigned char fingerprint, size_t size,
1422     size_t& err_index)
1423 {
1424 
1425     if(beg + size > end) {
1426         err_index = 0;
1427         return 0;
1428     }
1429 
1430     const char* p0 = beg;
1431 
1432     size_t err_idx = 0; --p0;
1433     unsigned char fp = 0x40;
1434     while(fp == 0x40 && p0 < end) {
1435         p0 += err_idx + 1;
1436         fp = x_CalcFingerPrint64(p0, p0 + size, err_idx);
1437     }
1438 
1439     if(p0 >= end) {
1440         return end;  // not found
1441     }
1442 
1443     unsigned char code;
1444     while(fp != fingerprint && ++p0 < end) {
1445 
1446         switch(*(p0 + size - 1)) {
1447         case 'A': code = 0;    break;
1448         case 'G': code = 0x01; break;
1449         case 'T': code = 0x02; break;
1450         case 'C': code = 0x03; break;
1451         default:  err_index = p0 + size - 1 - beg;
1452                   return 0;
1453         }
1454 
1455         fp = 0x3F & ((fp << 2) | code );
1456     }
1457 
1458     return p0;
1459 }
1460 
1461 
MakePattern(const size_t guide_size,const size_t guide_core)1462 size_t CNWAligner::MakePattern(const size_t guide_size,
1463                                const size_t guide_core)
1464 {
1465     if(guide_core > guide_size) {
1466         NCBI_THROW(CAlgoAlignException,
1467                    eBadParameter,
1468                    g_msg_NullParameter);
1469     }
1470 
1471     vector<nwaln_mrnaseg> segs;
1472 
1473     size_t err_idx(0);
1474     for(size_t i = 0; i + guide_size <= m_SeqLen1; ) {
1475         const char* beg = m_Seq1 + i;
1476         const char* end = m_Seq1 + i + guide_size;
1477         unsigned char fp = x_CalcFingerPrint64(beg, end, err_idx);
1478         if(fp != 0x40) {
1479             segs.push_back(nwaln_mrnaseg(i, i + guide_size - 1, fp));
1480             i += guide_size;
1481         }
1482         else {
1483             i += err_idx + 1;
1484         }
1485     }
1486 
1487     vector<nwaln_mrnaguide> guides;
1488     size_t idx = 0;
1489     const char* beg = m_Seq2 + idx;
1490     const char* end = m_Seq2 + m_SeqLen2;
1491     for(size_t i = 0, seg_count = segs.size();
1492         beg + guide_size <= end && i < seg_count; ++i) {
1493 
1494         const char* p = 0;
1495         const char* beg0 = beg;
1496         while( p == 0 && beg + guide_size <= end ) {
1497 
1498             p = x_FindFingerPrint64( beg, end, segs[i].fp,
1499                                      guide_size, err_idx );
1500             if(p == 0) { // incorrect char
1501                 beg += err_idx + 1;
1502             }
1503             else if (p < end) {// fingerprints match but check actual sequences
1504                 const char* seq1 = m_Seq1 + segs[i].a;
1505                 const char* seq2 = p;
1506                 size_t k;
1507                 for(k = 0; k < guide_size; ++k) {
1508                     if(seq1[k] != seq2[k]) break;
1509                 }
1510                 if(k == guide_size) { // real match
1511                     size_t i1 = segs[i].a;
1512                     size_t i2 = segs[i].b;
1513                     size_t i3 = seq2 - m_Seq2;
1514                     size_t i4 = i3 + guide_size - 1;
1515                     size_t guides_dim = guides.size();
1516                     if( guides_dim == 0 ||
1517                         i1 - 1 > guides[guides_dim - 1].q1 ||
1518                         i3 - 1 > guides[guides_dim - 1].s1    ) {
1519                         guides.push_back(nwaln_mrnaguide(i1, i2, i3, i4));
1520                     }
1521                     else { // expand the last guide
1522                         guides[guides_dim - 1].q1 = i2;
1523                         guides[guides_dim - 1].s1 = i4;
1524                     }
1525                     beg0 = p + guide_size;
1526                 }
1527                 else {  // spurious match
1528                     beg = p + 1;
1529                     p = 0;
1530                 }
1531             }
1532         }
1533         beg = beg0; // restore start pos in genomic sequence
1534     }
1535 
1536     // initialize m_guides
1537     size_t guides_dim = guides.size();
1538     m_guides.clear();
1539     m_guides.resize(4*guides_dim);
1540     const size_t offs = guide_core/2 - 1;
1541     for(size_t k = 0; k < guides_dim; ++k) {
1542         size_t q0 = (guides[k].q0 + guides[k].q1) / 2;
1543         size_t s0 = (guides[k].s0 + guides[k].s1) / 2;
1544         m_guides[4*k]         = q0 - offs;
1545         m_guides[4*k + 1]     = q0 + offs;
1546         m_guides[4*k + 2]     = s0 - offs;
1547         m_guides[4*k + 3]     = s0 + offs;
1548     }
1549 
1550     return m_guides.size();
1551 }
1552 
1553 //////////////////////////////////////////////
1554 /////////////////////////////////////////////
1555 
GetRightSeg(size_t * q0,size_t * q1,size_t * s0,size_t * s1,size_t min_size) const1556 size_t CNWAligner::GetRightSeg(size_t* q0, size_t* q1,
1557                                size_t* s0, size_t* s1,
1558                                size_t min_size) const
1559 {
1560     size_t trdim = m_Transcript.size();
1561     size_t cur = 0, maxseg = 0;
1562     const char* seq1_end = m_Seq1 + m_SeqLen1;
1563     const char* seq2_end = m_Seq2 + m_SeqLen2;
1564     const char* p1 = seq1_end - 1;
1565     const char* p2 = seq2_end - 1;
1566     size_t i0 = m_SeqLen1 - 1, j0 = m_SeqLen2 - 1,
1567            imax = i0, jmax = j0;
1568 
1569     for(size_t k = 0; k < trdim; ++k) {
1570 
1571         switch(m_Transcript[k]) {
1572 
1573             case eTS_Insert: {
1574                 --p2;
1575                 if(cur > maxseg) {
1576                     maxseg = cur;
1577                     imax = i0;
1578                     jmax = j0;
1579                     if(maxseg >= min_size) goto ret_point;
1580                 }
1581                 cur = 0;
1582             }
1583             break;
1584 
1585             case eTS_Delete: {
1586                 --p1;
1587                 if(cur > maxseg) {
1588                     maxseg = cur;
1589                     imax = i0;
1590                     jmax = j0;
1591                     if(maxseg >= min_size) goto ret_point;
1592 		}
1593                 cur = 0;
1594             }
1595             break;
1596 
1597             case eTS_Match:
1598             case eTS_Replace: {
1599                 if(*p1 == *p2) {
1600                     if(cur == 0) {
1601                         i0 = p1 - m_Seq1;
1602                         j0 = p2 - m_Seq2;
1603                     }
1604                     ++cur;
1605                 }
1606                 else {
1607                     if(cur > maxseg) {
1608                         maxseg = cur;
1609                         imax = i0;
1610                         jmax = j0;
1611                         if(maxseg >= min_size) goto ret_point;
1612                     }
1613                     cur = 0;
1614                 }
1615                 --p1;
1616                 --p2;
1617             }
1618             break;
1619 
1620             default: {
1621                 NCBI_THROW( CAlgoAlignException, eInternal,
1622                             g_msg_InvalidTranscriptSymbol );
1623             }
1624         }
1625     }
1626 
1627     if(cur > maxseg) {
1628       maxseg = cur;
1629       imax = i0;
1630       jmax = j0;
1631     }
1632 
1633  ret_point:
1634 
1635     *q1 = imax; *s1 = jmax;
1636     *q0 = imax - maxseg + 1;
1637     *s0 = jmax - maxseg + 1;
1638 
1639     return maxseg;
1640 }
1641 
1642 
GetLongestSeg(size_t * q0,size_t * q1,size_t * s0,size_t * s1) const1643 size_t CNWAligner::GetLongestSeg(size_t* q0, size_t* q1,
1644                                  size_t* s0, size_t* s1) const
1645 {
1646     size_t trdim = m_Transcript.size();
1647     size_t cur = 0, maxseg = 0;
1648     const char* p1 = m_Seq1;
1649     const char* p2 = m_Seq2;
1650     size_t i0 = 0, j0 = 0, imax = i0, jmax = j0;
1651 
1652     for(int k = trdim-1; k >= 0; --k) {
1653 
1654         switch(m_Transcript[k]) {
1655 
1656             case eTS_Insert: {
1657                 ++p2;
1658                 if(cur > maxseg) {
1659                     maxseg = cur;
1660                     imax = i0;
1661                     jmax = j0;
1662                 }
1663                 cur = 0;
1664             }
1665             break;
1666 
1667             case eTS_Delete: {
1668             ++p1;
1669             if(cur > maxseg) {
1670                 maxseg = cur;
1671                 imax = i0;
1672                 jmax = j0;
1673             }
1674             cur = 0;
1675             }
1676             break;
1677 
1678             case eTS_Match:
1679             case eTS_Replace: {
1680                 if(*p1 == *p2) {
1681                     if(cur == 0) {
1682                         i0 = p1 - m_Seq1;
1683                         j0 = p2 - m_Seq2;
1684                     }
1685                     ++cur;
1686                 }
1687                 else {
1688 		    if(cur > maxseg) {
1689                         maxseg = cur;
1690                         imax = i0;
1691                         jmax = j0;
1692                     }
1693                     cur = 0;
1694                 }
1695                 ++p1;
1696                 ++p2;
1697             }
1698             break;
1699 
1700             default: {
1701                 NCBI_THROW( CAlgoAlignException, eInternal,
1702                             g_msg_InvalidTranscriptSymbol );
1703             }
1704         }
1705     }
1706 
1707     if(cur > maxseg) {
1708         maxseg = cur;
1709         imax = i0;
1710         jmax = j0;
1711     }
1712 
1713     *q0 = imax; *s0 = jmax;
1714     *q1 = *q0 + maxseg - 1;
1715     *s1 = *s0 + maxseg - 1;
1716 
1717     return maxseg;
1718 }
1719 
1720 
GetDense_seg(TSeqPos query_start,ENa_strand query_strand,TSeqPos subj_start,ENa_strand subj_strand,bool trim_end_gaps) const1721 CRef<CDense_seg> CNWAligner::GetDense_seg(TSeqPos query_start,
1722                                           ENa_strand query_strand,
1723                                           TSeqPos subj_start,
1724                                           ENa_strand subj_strand,
1725                                           bool trim_end_gaps) const
1726 {
1727     CNWFormatter::ESeqAlignFormatFlags  flags;
1728     if(trim_end_gaps) {
1729         flags = CNWFormatter::eSAFF_TrimEndGaps;
1730     } else {
1731         flags = CNWFormatter::eSAFF_None;
1732     }
1733 
1734     CNWFormatter fmt(*this);
1735 
1736     return fmt.AsDenseSeg(query_start, query_strand,
1737                           subj_start, subj_strand, flags);
1738 }
1739 
1740 
GetDense_seg(TSeqPos query_start,ENa_strand query_strand,const CSeq_id & query_id,TSeqPos subj_start,ENa_strand subj_strand,const CSeq_id & subj_id,bool trim_end_gaps) const1741 CRef<CDense_seg> CNWAligner::GetDense_seg(TSeqPos query_start,
1742                                           ENa_strand query_strand,
1743                                           const CSeq_id& query_id,
1744                                           TSeqPos subj_start,
1745                                           ENa_strand subj_strand,
1746                                           const CSeq_id& subj_id,
1747                                           bool trim_end_gaps) const
1748 {
1749     CNWFormatter::ESeqAlignFormatFlags  flags;
1750     if(trim_end_gaps) {
1751         flags = CNWFormatter::eSAFF_TrimEndGaps;
1752     } else {
1753         flags = CNWFormatter::eSAFF_None;
1754     }
1755 
1756     CNWFormatter fmt(*this);
1757 
1758     CConstRef<CSeq_id> id0(&query_id);
1759     CConstRef<CSeq_id> id1(&subj_id);
1760     fmt.SetSeqIds(id0, id1);
1761 
1762     return fmt.AsDenseSeg(query_start, query_strand,
1763                           subj_start, subj_strand, flags);
1764 }
1765 
1766 END_NCBI_SCOPE
1767