1 /*  $Id: glb_align.cpp 477516 2015-08-31 14:15:41Z souvorov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Alexandre Souvorov
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <algo/gnomon/gnomon_model.hpp>
34 #include <algo/gnomon/glb_align.hpp>
35 #include <sstream>
36 
37 BEGIN_NCBI_SCOPE
38 BEGIN_SCOPE(gnomon)
39 
40 using namespace std;
41 
PushFront(const SElement & el)42 void CCigar::PushFront(const SElement& el) {
43     if(el.m_type == 'M') {
44         m_qfrom -= el.m_len;
45         m_sfrom -= el.m_len;
46     } else if(el.m_type == 'D')
47         m_sfrom -= el.m_len;
48     else
49         m_qfrom -= el.m_len;
50 
51     if(m_elements.empty() || m_elements.front().m_type != el.m_type)
52         m_elements.push_front(el);
53     else
54         m_elements.front().m_len += el.m_len;
55 }
56 
PushBack(const SElement & el)57 void CCigar::PushBack(const SElement& el) {
58     if(el.m_type == 'M') {
59         m_qto += el.m_len;
60         m_sto += el.m_len;
61     } else if(el.m_type == 'D')
62         m_sto += el.m_len;
63     else
64         m_qto += el.m_len;
65 
66     if(m_elements.empty() || m_elements.back().m_type != el.m_type)
67         m_elements.push_back(el);
68     else
69         m_elements.back().m_len += el.m_len;
70 }
71 
CigarString(int qstart,int qlen) const72 string CCigar::CigarString(int qstart, int qlen) const {
73     string cigar;
74     ITERATE(list<SElement>, i, m_elements)
75         cigar += NStr::IntToString(i->m_len)+i->m_type;
76 
77     int missingstart = qstart+m_qfrom;
78     if(missingstart > 0)
79         cigar = NStr::IntToString(missingstart)+"S"+cigar;
80     int missingend = qlen-1-m_qto-qstart;
81     if(missingend > 0)
82         cigar += NStr::IntToString(missingend)+"S";
83 
84     return cigar;
85 }
86 
GetInDels(int sstart,const char * const query,const char * subject) const87 TInDels CCigar::GetInDels(int sstart, const  char* const query, const  char* subject) const {
88     TInDels indels;
89 
90     int qpos = m_qfrom;
91     int spos = m_sfrom;
92     ITERATE(list<SElement>, i, m_elements) {
93         if(i->m_type == 'M') {
94             bool is_match = *(query+qpos) == *(subject+spos);
95             int len = 0;
96             for(int l = 0; l < i->m_len; ++l) {
97                 if((*(query+qpos) == *(subject+spos)) == is_match) {
98                     ++len;
99                 } else {
100                     if(!is_match) {
101                         CInDelInfo indl(spos-len+sstart, len, CInDelInfo::eMism, string(query+qpos-len, len));
102                         indels.push_back(indl);
103                     }
104                     is_match = !is_match;
105                     len = 1;
106                 }
107                 ++qpos;
108                 ++spos;
109             }
110             if(!is_match) {
111                 CInDelInfo indl(spos-len+sstart, len, CInDelInfo::eMism, string(query+qpos-len, len));
112                 indels.push_back(indl);
113             }
114         } else if(i->m_type == 'D') {
115             CInDelInfo indl(spos+sstart, i->m_len, CInDelInfo::eIns);
116             indels.push_back(indl);
117             spos += i->m_len;
118         } else {
119             CInDelInfo indl(spos+sstart, i->m_len, CInDelInfo::eDel, string(query+qpos, i->m_len));
120             indels.push_back(indl);
121             qpos += i->m_len;
122         }
123     }
124 
125     return indels;
126 }
127 
DetailedCigarString(int qstart,int qlen,const char * query,const char * subject) const128 string CCigar::DetailedCigarString(int qstart, int qlen, const  char* query, const  char* subject) const {
129     string cigar;
130     query += m_qfrom;
131     subject += m_sfrom;
132     ITERATE(list<SElement>, i, m_elements) {
133         if(i->m_type == 'M') {
134             bool is_match = *query == *subject;
135             int len = 0;
136             for(int l = 0; l < i->m_len; ++l) {
137                 if((*query == *subject) == is_match) {
138                     ++len;
139                 } else {
140                     cigar += NStr::IntToString(len)+ (is_match ? "=" : "X");
141                     is_match = !is_match;
142                     len = 1;
143                 }
144                 ++query;
145                 ++subject;
146             }
147             cigar += NStr::IntToString(len)+ (is_match ? "=" : "X");
148         } else if(i->m_type == 'D') {
149             cigar += NStr::IntToString(i->m_len)+i->m_type;
150             subject += i->m_len;
151         } else {
152             cigar += NStr::IntToString(i->m_len)+i->m_type;
153             query += i->m_len;
154         }
155     }
156 
157     int missingstart = qstart+m_qfrom;
158     if(missingstart > 0)
159         cigar = NStr::IntToString(missingstart)+"S"+cigar;
160     int missingend = qlen-1-m_qto-qstart;
161     if(missingend > 0)
162         cigar += NStr::IntToString(missingend)+"S";
163 
164     return cigar;
165 }
166 
ToAlign(const char * query,const char * subject) const167 TCharAlign CCigar::ToAlign(const  char* query, const  char* subject) const {
168     TCharAlign align;
169     query += m_qfrom;
170     subject += m_sfrom;
171     ITERATE(list<SElement>, i, m_elements) {
172         if(i->m_type == 'M') {
173             align.first.insert(align.first.end(), query, query+i->m_len);
174             query += i->m_len;
175             align.second.insert(align.second.end(), subject, subject+i->m_len);
176             subject += i->m_len;
177         } else if(i->m_type == 'D') {
178             align.first.insert(align.first.end(), i->m_len, '-');
179             align.second.insert(align.second.end(), subject, subject+i->m_len);
180             subject += i->m_len;
181         } else {
182             align.first.insert(align.first.end(), query, query+i->m_len);
183             query += i->m_len;
184             align.second.insert(align.second.end(), i->m_len, '-');
185         }
186     }
187 
188     return align;
189 }
190 
Matches(const char * query,const char * subject) const191 int CCigar::Matches(const  char* query, const  char* subject) const {
192     int matches = 0;
193     query += m_qfrom;
194     subject += m_sfrom;
195     ITERATE(list<SElement>, i, m_elements) {
196         if(i->m_type == 'M') {
197             for(int l = 0; l < i->m_len; ++l) {
198                 if(*query == *subject)
199                     ++matches;
200                 ++query;
201                 ++subject;
202             }
203         } else if(i->m_type == 'D') {
204             subject += i->m_len;
205         } else {
206             query += i->m_len;
207         }
208     }
209 
210     return matches;
211 }
212 
Distance(const char * query,const char * subject) const213 int CCigar::Distance(const  char* query, const  char* subject) const {
214     int dist = 0;
215     query += m_qfrom;
216     subject += m_sfrom;
217     ITERATE(list<SElement>, i, m_elements) {
218         if(i->m_type == 'M') {
219             for(int l = 0; l < i->m_len; ++l) {
220                 if(*query != *subject)
221                     ++dist;
222                 ++query;
223                 ++subject;
224             }
225         } else if(i->m_type == 'D') {
226             subject += i->m_len;
227             dist += i->m_len;
228         } else {
229             query += i->m_len;
230             dist += i->m_len;
231         }
232     }
233 
234     return dist;
235 }
236 
Score(const char * query,const char * subject,int gopen,int gapextend,const char delta[256][256]) const237 int CCigar::Score(const  char* query, const  char* subject, int gopen, int gapextend, const char delta[256][256]) const {
238     int score = 0;
239 
240     query += m_qfrom;
241     subject += m_sfrom;
242     ITERATE(list<SElement>, i, m_elements) {
243         if(i->m_type == 'M') {
244             for(int l = 0; l < i->m_len; ++l) {
245                 score += delta[(int)*query][(int)*subject];
246                 ++query;
247                 ++subject;
248             }
249         } else if(i->m_type == 'D') {
250             subject += i->m_len;
251             score -= gopen+gapextend*i->m_len;
252         } else {
253             query += i->m_len;
254             score -= gopen+gapextend*i->m_len;
255         }
256     }
257 
258     return score;
259 }
260 
261 
GlbAlign(const char * a,int na,const char * b,int nb,int rho,int sigma,const char delta[256][256])262 CCigar GlbAlign(const  char* a, int na, const  char*  b, int nb, int rho, int sigma, const char delta[256][256]) {
263     //	rho - new gap penalty (one base gap rho+sigma)
264     // sigma - extension penalty
265 
266 	int* s = new int[nb+1];                                 // best scores in current a-raw
267 	int* sm = new int[nb+1];                                // best scores in previous a-raw
268 	int* gapb = new int[nb+1];                              // best score with b-gap
269     char* mtrx = new  char[(na+1)*(nb+1)];                  // backtracking info (Astart/Bstart gap start, Agap/Bgap best score has gap and should be backtracked to Asrt/Bsart)
270 
271     int rs = rho+sigma;
272     int bignegative = numeric_limits<int>::min()/2;
273 
274 	sm[0] = 0;
275 	sm[1] = -rs;                  // scores for --------------   (the best scores for i == -1)
276 	for(int i = 2; i <= nb; ++i)  //            BBBBBBBBBBBBBB
277         sm[i] = sm[i-1]-sigma;
278 	s[0] = -rs;                   // score for A   (the best score for j == -1 and i == 0)
279                                   //           -
280 	for(int i = 0; i <= nb; ++i)
281         gapb[i] = bignegative;
282 
283 	enum{Agap = 1, Bgap = 2, Astart = 4, Bstart = 8};
284 
285     mtrx[0] = 0;
286     for(int i = 1; i <= nb; ++i) {  // ---------------
287         mtrx[i] = Agap;             // BBBBBBBBBBBBBBB
288     }
289     mtrx[1] |= Astart;
290 
291     char* m = mtrx+nb;
292 	for(int i = 0; i < na; ++i) {
293 		*(++m) = Bstart|Bgap;       //AAAAAAAAAAAAAAA
294                                     //---------------
295         int gapa = bignegative;
296 		int ai = a[i];
297         const char* matrix = delta[ai];
298 		int* sp = s;
299 		for(int j = 0; j < nb; ) {
300 			*(++m) = 0;
301 			int ss = sm[j]+matrix[(int)b[j]];
302 
303 			gapa -= sigma;
304 			if(*sp-rs > gapa) {    // for j == 0 this will open   AAAAAAAAAAA-  which could be used if mismatch is very expensive
305 				gapa = *sp-rs;     //                             -----------B
306 				*m |= Astart;
307 			}
308 
309 			int& gapbj = gapb[++j];
310 			gapbj -= sigma;
311 			if(sm[j]-rs > gapbj) {  // for i == 0 this will open  BBBBBBBBBBB- which could be used if mismatch is very expensive
312 				gapbj = sm[j]-rs;   //                            -----------A
313 				*m |= Bstart;
314 			}
315 
316 			if(gapa > gapbj) {
317 				if(ss > gapa) {
318 					*(++sp) = ss;
319 				} else {
320 					*(++sp) = gapa;
321 					*m |= Agap;
322 				}
323 			} else {
324 				if(ss > gapbj) {
325 					*(++sp) = ss;
326 				} else {
327 					*(++sp) = gapbj;
328 					*m |= Bgap;
329 				}
330 			}
331 		}
332 		swap(sm,s);
333 		*s = *sm-sigma;
334 	}
335 
336 	int ia = na-1;
337 	int ib = nb-1;
338 	m = mtrx+(na+1)*(nb+1)-1;
339     CCigar track(ia, ib);
340     while(ia >= 0 || ib >= 0) {
341         if(*m&Agap) {
342             int len = 1;
343             while(!(*m&Astart)) {
344                 ++len;
345                 --m;
346             }
347             --m;
348             ib -= len;
349             track.PushFront(CCigar::SElement(len,'D'));
350         } else if(*m&Bgap) {
351             int len = 1;
352             while(!(*m&Bstart)) {
353                 ++len;
354                 m -= nb+1;
355             }
356             m -= nb+1;
357             ia -= len;
358             track.PushFront(CCigar::SElement(len,'I'));
359         } else {
360             track.PushFront(CCigar::SElement(1,'M'));
361             --ia;
362             --ib;
363             m -= nb+2;
364         }
365     }
366 
367 	delete[] s;
368 	delete[] sm;
369 	delete[] gapb;
370 	delete[] mtrx;
371 
372 	return track;
373 }
374 
375 
LclAlign(const char * a,int na,const char * b,int nb,int rho,int sigma,const char delta[256][256])376 CCigar LclAlign(const  char* a, int na, const  char*  b, int nb, int rho, int sigma, const char delta[256][256]) {
377     //	rho - new gap penalty (one base gap rho+sigma)
378     // sigma - extension penalty
379 
380 	int* s = new int[nb+1];                                 // best scores in current a-raw
381 	int* sm = new int[nb+1];                                // best scores in previous a-raw
382 	int* gapb = new int[nb+1];                              // best score with b-gap
383     char* mtrx = new  char[(na+1)*(nb+1)];                  // backtracking info (Astart/Bstart gap start, Agap/Bgap best score has gap and should be backtracked to Asrt/Bsart; Zero stop bactracking)
384 
385     int rs = rho+sigma;
386 
387     enum{Agap = 1, Bgap = 2, Astart = 4, Bstart = 8, Zero = 16};
388 
389     for(int i = 0; i <= nb; ++i) {
390         sm[i] = 0;
391         mtrx[i] = Zero;
392         gapb[i] = 0;
393     }
394     s[0] = 0;
395 
396     int max_score = 0;
397     char* max_ptr = mtrx;
398     char* m = mtrx+nb;
399 
400     for(int i = 0; i < na; ++i) {
401 		*(++m) = Zero;
402 		int gapa = 0;
403 		int ai = a[i];
404         const char* matrix = delta[ai];
405 		int* sp = s;
406 		for(int j = 0; j < nb; ) {
407 			*(++m) = 0;
408 			int ss = sm[j]+matrix[(int)b[j]];
409 
410             gapa -= sigma;
411 			if(*sp-rs > gapa) {
412 				gapa = *sp-rs;
413 				*m |= Astart;
414 			}
415 
416 			int& gapbj = gapb[++j];
417 			gapbj -= sigma;
418 			if(sm[j]-rs > gapbj) {
419 				gapbj = sm[j]-rs;
420 				*m |= Bstart;
421 			}
422 
423 			if(gapa > gapbj) {
424 				if(ss > gapa) {
425 					*(++sp) = ss;
426                     if(ss > max_score) {
427                         max_score = ss;
428                         max_ptr = m;
429                     }
430 				} else {
431 					*(++sp) = gapa;
432 					*m |= Agap;
433 				}
434 			} else {
435 				if(ss > gapbj) {
436 					*(++sp) = ss;
437                     if(ss > max_score) {
438                         max_score = ss;
439                         max_ptr = m;
440                     }
441 				} else {
442 					*(++sp) = gapbj;
443 					*m |= Bgap;
444 				}
445 			}
446             if(*sp <= 0) {
447                 *sp = 0;
448                 *m |= Zero;
449             }
450 		}
451 		swap(sm,s);
452 	}
453 
454     int ia = (max_ptr-mtrx)/(nb+1)-1;
455     int ib = (max_ptr-mtrx)%(nb+1)-1;
456     CCigar track(ia, ib);
457     m = max_ptr;
458     while((ia >= 0 || ib >= 0) && !(*m&Zero)) {
459         if(*m&Agap) {
460             int len = 1;
461             while(!(*m&Astart)) {
462                 ++len;
463                 --m;
464             }
465             --m;
466             ib -= len;
467             track.PushFront(CCigar::SElement(len,'D'));
468         } else if(*m&Bgap) {
469             int len = 1;
470             while(!(*m&Bstart)) {
471                 ++len;
472                 m -= nb+1;
473             }
474             m -= nb+1;
475             ia -= len;
476             track.PushFront(CCigar::SElement(len,'I'));
477         } else {
478             track.PushFront(CCigar::SElement(1,'M'));
479             --ia;
480             --ib;
481             m -= nb+2;
482         }
483     }
484 
485     delete[] s;
486     delete[] sm;
487     delete[] gapb;
488     delete[] mtrx;
489 
490     return track;
491 }
492 
493 
LclAlign(const char * a,int na,const char * b,int nb,int rho,int sigma,bool pinleft,bool pinright,const char delta[256][256])494 CCigar LclAlign(const  char* a, int na, const  char*  b, int nb, int rho, int sigma, bool pinleft, bool pinright, const char delta[256][256]) {
495     //	rho - new gap penalty (one base gap rho+sigma)
496     // sigma - extension penalty
497 
498 	int* s = new int[nb+1];                                 // best scores in current a-raw
499 	int* sm = new int[nb+1];                                // best scores in previous a-raw
500 	int* gapb = new int[nb+1];                              // best score with b-gap
501     char* mtrx = new  char[(na+1)*(nb+1)];                  // backtracking info (Astart/Bstart gap start, Agap/Bgap best score has gap and should be backtracked to Asrt/Bsart; Zero stop bactracking)
502 
503     int rs = rho+sigma;
504     int bignegative = numeric_limits<int>::min()/2;
505 
506     enum{Agap = 1, Bgap = 2, Astart = 4, Bstart = 8, Zero = 16};
507 
508     sm[0] = 0;
509     mtrx[0] = 0;
510     gapb[0] = bignegative;  // not used
511     if(pinleft) {
512         if(nb > 0) {
513             sm[1] = -rs;
514             mtrx[1] = Astart|Agap;
515             gapb[1] = bignegative;
516             for(int i = 2; i <= nb; ++i) {
517                 sm[i] = sm[i-1]-sigma;
518                 mtrx[i] = Agap;
519                 gapb[i] = bignegative;
520             }
521         }
522         s[0] = -rs;
523     } else {
524         for(int i = 1; i <= nb; ++i) {
525             sm[i] = 0;
526             mtrx[i] = Zero;
527             gapb[i] = bignegative;
528         }
529         s[0] = 0;
530     }
531 
532     int max_score = 0;
533     char* max_ptr = mtrx;
534 
535     char* m = mtrx+nb;
536 	for(int i = 0; i < na; ++i) {
537 		*(++m) = pinleft ? Bstart|Bgap : Zero;
538 		int gapa = bignegative;
539 		int ai = a[i];
540 
541 		const char* matrix = delta[ai];
542 		int* sp = s;
543 		for(int j = 0; j < nb; ) {
544 			*(++m) = 0;
545 			int ss = sm[j]+matrix[(int)b[j]];
546 
547             gapa -= sigma;
548 			if(*sp-rs > gapa) {
549 				gapa = *sp-rs;
550 				*m |= Astart;
551 			}
552 
553 			int& gapbj = gapb[++j];
554 			gapbj -= sigma;
555 			if(sm[j]-rs > gapbj) {
556 				gapbj = sm[j]-rs;
557 				*m |= Bstart;
558 			}
559 
560 			if(gapa > gapbj) {
561 				if(ss > gapa) {
562 					*(++sp) = ss;
563                     if(ss > max_score) {
564                         max_score = ss;
565                         max_ptr = m;
566                     }
567 				} else {
568 					*(++sp) = gapa;
569 					*m |= Agap;
570 				}
571 			} else {
572 				if(ss > gapbj) {
573 					*(++sp) = ss;
574                     if(ss > max_score) {
575                         max_score = ss;
576                         max_ptr = m;
577                     }
578 				} else {
579 					*(++sp) = gapbj;
580 					*m |= Bgap;
581 				}
582 			}
583             if(*sp <= 0 && !pinleft) {
584                 *sp = 0;
585                 *m |= Zero;
586             }
587 		}
588 		swap(sm,s);
589         if(pinleft)
590             *s = *sm-sigma;
591 	}
592 
593     int maxa, maxb;
594     if(pinright) {
595         maxa = na-1;
596         maxb = nb-1;
597         max_score = sm[nb];
598     } else {
599         maxa = (max_ptr-mtrx)/(nb+1)-1;
600         maxb = (max_ptr-mtrx)%(nb+1)-1;
601         m = max_ptr;
602     }
603     int ia = maxa;
604     int ib = maxb;
605     CCigar track(ia, ib);
606     while((ia >= 0 || ib >= 0) && !(*m&Zero)) {
607         if(*m&Agap) {
608             int len = 1;
609             while(!(*m&Astart)) {
610                 ++len;
611                 --m;
612             }
613             --m;
614             ib -= len;
615             track.PushFront(CCigar::SElement(len,'D'));
616         } else if(*m&Bgap) {
617             int len = 1;
618             while(!(*m&Bstart)) {
619                 ++len;
620                 m -= nb+1;
621             }
622             m -= nb+1;
623             ia -= len;
624             track.PushFront(CCigar::SElement(len,'I'));
625         } else {
626             track.PushFront(CCigar::SElement(1,'M'));
627             --ia;
628             --ib;
629             m -= nb+2;
630         }
631     }
632 
633     delete[] s;
634     delete[] sm;
635     delete[] gapb;
636     delete[] mtrx;
637 
638     return track;
639 }
640 
VariBandAlign(const char * a,int na,const char * b,int nb,int rho,int sigma,const char delta[256][256],const TSignedSeqRange * blimits)641 CCigar VariBandAlign(const  char* a, int na, const  char*  b, int nb, int rho, int sigma, const char delta[256][256], const TSignedSeqRange* blimits) {
642     //	rho - new gap penalty (one base gap rho+sigma)
643     // sigma - extension penalty
644 
645 	int* s = new int[nb+1];                                 // best scores in current a-raw
646 	int* sm = new int[nb+1];                                // best scores in previous a-raw
647 	int* gapb = new int[nb+1];                              // best score with b-gap
648     char* mtrx = new  char[(na+1)*(nb+1)];                  // backtracking info (Astart/Bstart gap start, Agap/Bgap best score has gap and should be backtracked to Asrt/Bsart; Zero stop bactracking)
649 
650     int rs = rho+sigma;
651 
652     enum{Agap = 1, Bgap = 2, Astart = 4, Bstart = 8, Zero = 16};
653 
654     for(int i = 0; i <= nb; ++i) {
655         s[i] = 0;
656         sm[i] = 0;
657         gapb[i] = 0;
658         mtrx[i] = Zero;
659     }
660 
661     int max_score = 0;
662     char* max_ptr = mtrx;
663     char* m = mtrx+nb;
664 
665     const TSignedSeqRange* last = blimits+na;
666     while(true) {
667 		int ai = *a++;
668 		const char* matrix = delta[ai];
669 
670         int bleft = blimits->GetFrom();
671         int bright = blimits->GetTo();
672         m += bleft;
673         *(++m) = Zero;
674 		int gapa = 0;
675         int* sp = s+bleft;
676         *sp = 0;
677 		for(int j = bleft; j <= bright; ) {
678 			*(++m) = 0;
679 			int ss = sm[j]+matrix[(int)b[j]];
680 
681             gapa -= sigma;
682 			if(*sp-rs > gapa) {
683 				gapa = *sp-rs;
684 				*m |= Astart;
685 			}
686 
687 			int& gapbj = gapb[++j];
688 			gapbj -= sigma;
689 			if(sm[j]-rs > gapbj) {
690 				gapbj = sm[j]-rs;
691 				*m |= Bstart;
692 			}
693 
694 			if(gapa > gapbj) {
695 				if(ss > gapa) {
696 					*(++sp) = ss;
697                     if(ss > max_score) {
698                         max_score = ss;
699                         max_ptr = m;
700                     }
701 				} else {
702 					*(++sp) = gapa;
703 					*m |= Agap;
704 				}
705 			} else {
706 				if(ss > gapbj) {
707 					*(++sp) = ss;
708                     if(ss > max_score) {
709                         max_score = ss;
710                         max_ptr = m;
711                     }
712 				} else {
713 					*(++sp) = gapbj;
714 					*m |= Bgap;
715 				}
716 			}
717             if(*sp <= 0) {
718                 *sp = 0;
719                 *m |= Zero;
720             }
721 		}
722         if(++blimits == last)
723             break;
724 
725 		swap(sm,s);
726 
727         int next = blimits->GetTo();
728         for( ; bright < next-1; ++bright)
729             *(++m) = Zero;
730 
731         m += nb-bright-1;
732 	}
733 
734     int ia = (max_ptr-mtrx)/(nb+1)-1;
735     int ib = (max_ptr-mtrx)%(nb+1)-1;
736     CCigar track(ia, ib);
737     m = max_ptr;
738     while((ia >= 0 || ib >= 0) && !(*m&Zero)) {
739         if(*m&Agap) {
740             int len = 1;
741             while(!(*m&Astart)) {
742                 ++len;
743                 --m;
744             }
745             --m;
746             ib -= len;
747             track.PushFront(CCigar::SElement(len,'D'));
748         } else if(*m&Bgap) {
749             int len = 1;
750             while(!(*m&Bstart)) {
751                 ++len;
752                 m -= nb+1;
753             }
754             m -= nb+1;
755             ia -= len;
756             track.PushFront(CCigar::SElement(len,'I'));
757         } else {
758             track.PushFront(CCigar::SElement(1,'M'));
759             --ia;
760             --ib;
761             m -= nb+2;
762         }
763     }
764 
765     delete[] s;
766     delete[] sm;
767     delete[] gapb;
768     delete[] mtrx;
769 
770     return track;
771 }
772 
773 
SMatrix(int match,int mismatch)774 SMatrix::SMatrix(int match, int mismatch) { // matrix for DNA
775 
776     for(int i = 0; i < 256;  ++i) {
777         int c = toupper(i);
778         for(int j = 0; j < 256;  ++j) {
779             if(c != 'N' && c == toupper(j)) matrix[i][j] = match;
780             else matrix[i][j] = -mismatch;
781         }
782     }
783 }
784 
SMatrix()785 SMatrix::SMatrix() { // matrix for proteins
786 
787     string aa("ARNDCQEGHILKMFPSTWYVBZX*");
788     int scores[] = {
789  4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-2,-1, 0,-4,
790 -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-1, 0,-1,-4,
791 -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3, 3, 0,-1,-4,
792 -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3, 4, 1,-1,-4,
793  0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-3,-3,-2,-4,
794 -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2, 0, 3,-1,-4,
795 -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2, 1, 4,-1,-4,
796  0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-1,-2,-1,-4,
797 -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3, 0, 0,-1,-4,
798 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-3,-3,-1,-4,
799 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-3,-1,-4,
800 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2, 0, 1,-1,-4,
801 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-3,-1,-1,-4,
802 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-3,-3,-1,-4,
803 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-2,-1,-2,-4,
804  1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2, 0, 0, 0,-4,
805  0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-1,-1, 0,-4,
806 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-3,-2,-4,
807 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-3,-2,-1,-4,
808  0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-3,-2,-1,-4,
809 -2,-1, 3, 4,-3, 0, 1,-1, 0,-3,-4, 0,-3,-3,-2, 0,-1,-4,-3,-3, 4, 1,-1,-4,
810 -1, 0, 0, 1,-3, 3, 4,-2, 0,-3,-3, 1,-1,-3,-1, 0,-1,-3,-2,-2, 1, 4,-1,-4,
811  0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-1,-1,-1,-4,
812 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1
813     };
814 
815     for(int i = 0; i < 256;  ++i) {
816         for(int j = 0; j < 256;  ++j) {
817             matrix[i][j] = 0;
818         }
819     }
820 
821     int num = aa.size();
822     for(int i = 0; i < num; ++i) {
823         char c = aa[i];
824         for(int j = 0; j < num; ++j) {
825             int score = scores[num*j+i];
826             char d = aa[j];
827             matrix[(int)c][(int)d] = score;
828             matrix[(int)tolower(c)][(int)tolower(d)] = score;
829             matrix[(int)c][(int)tolower(d)] = score;
830             matrix[(int)tolower(c)][(int)d] = score;
831         }
832     }
833 }
834 
Entropy(const string & seq)835 double Entropy(const string& seq) {
836     int length = seq.size();
837     if(length == 0)
838         return 0;
839     double tA = 1.e-8;
840     double tC = 1.e-8;
841     double tG = 1.e-8;
842     double tT = 1.e-8;
843     ITERATE(string, i, seq) {
844         switch(*i) {
845         case 'A': tA += 1; break;
846         case 'C': tC += 1; break;
847         case 'G': tG += 1; break;
848         case 'T': tT += 1; break;
849         default: break;
850         }
851     }
852     double entropy = -(tA*log(tA/length)+tC*log(tC/length)+tG*log(tG/length)+tT*log(tT/length))/(length*log(4.));
853 
854     return entropy;
855 }
856 
857 
858 
859 END_SCOPE(gnomon)
860 END_SCOPE(ncbi)
861 
862 
863 
864