1 /* $Id: glb_align.cpp 477516 2015-08-31 14:15:41Z souvorov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Alexandre Souvorov
27 *
28 * File Description:
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <algo/gnomon/gnomon_model.hpp>
34 #include <algo/gnomon/glb_align.hpp>
35 #include <sstream>
36
37 BEGIN_NCBI_SCOPE
38 BEGIN_SCOPE(gnomon)
39
40 using namespace std;
41
PushFront(const SElement & el)42 void CCigar::PushFront(const SElement& el) {
43 if(el.m_type == 'M') {
44 m_qfrom -= el.m_len;
45 m_sfrom -= el.m_len;
46 } else if(el.m_type == 'D')
47 m_sfrom -= el.m_len;
48 else
49 m_qfrom -= el.m_len;
50
51 if(m_elements.empty() || m_elements.front().m_type != el.m_type)
52 m_elements.push_front(el);
53 else
54 m_elements.front().m_len += el.m_len;
55 }
56
PushBack(const SElement & el)57 void CCigar::PushBack(const SElement& el) {
58 if(el.m_type == 'M') {
59 m_qto += el.m_len;
60 m_sto += el.m_len;
61 } else if(el.m_type == 'D')
62 m_sto += el.m_len;
63 else
64 m_qto += el.m_len;
65
66 if(m_elements.empty() || m_elements.back().m_type != el.m_type)
67 m_elements.push_back(el);
68 else
69 m_elements.back().m_len += el.m_len;
70 }
71
CigarString(int qstart,int qlen) const72 string CCigar::CigarString(int qstart, int qlen) const {
73 string cigar;
74 ITERATE(list<SElement>, i, m_elements)
75 cigar += NStr::IntToString(i->m_len)+i->m_type;
76
77 int missingstart = qstart+m_qfrom;
78 if(missingstart > 0)
79 cigar = NStr::IntToString(missingstart)+"S"+cigar;
80 int missingend = qlen-1-m_qto-qstart;
81 if(missingend > 0)
82 cigar += NStr::IntToString(missingend)+"S";
83
84 return cigar;
85 }
86
GetInDels(int sstart,const char * const query,const char * subject) const87 TInDels CCigar::GetInDels(int sstart, const char* const query, const char* subject) const {
88 TInDels indels;
89
90 int qpos = m_qfrom;
91 int spos = m_sfrom;
92 ITERATE(list<SElement>, i, m_elements) {
93 if(i->m_type == 'M') {
94 bool is_match = *(query+qpos) == *(subject+spos);
95 int len = 0;
96 for(int l = 0; l < i->m_len; ++l) {
97 if((*(query+qpos) == *(subject+spos)) == is_match) {
98 ++len;
99 } else {
100 if(!is_match) {
101 CInDelInfo indl(spos-len+sstart, len, CInDelInfo::eMism, string(query+qpos-len, len));
102 indels.push_back(indl);
103 }
104 is_match = !is_match;
105 len = 1;
106 }
107 ++qpos;
108 ++spos;
109 }
110 if(!is_match) {
111 CInDelInfo indl(spos-len+sstart, len, CInDelInfo::eMism, string(query+qpos-len, len));
112 indels.push_back(indl);
113 }
114 } else if(i->m_type == 'D') {
115 CInDelInfo indl(spos+sstart, i->m_len, CInDelInfo::eIns);
116 indels.push_back(indl);
117 spos += i->m_len;
118 } else {
119 CInDelInfo indl(spos+sstart, i->m_len, CInDelInfo::eDel, string(query+qpos, i->m_len));
120 indels.push_back(indl);
121 qpos += i->m_len;
122 }
123 }
124
125 return indels;
126 }
127
DetailedCigarString(int qstart,int qlen,const char * query,const char * subject) const128 string CCigar::DetailedCigarString(int qstart, int qlen, const char* query, const char* subject) const {
129 string cigar;
130 query += m_qfrom;
131 subject += m_sfrom;
132 ITERATE(list<SElement>, i, m_elements) {
133 if(i->m_type == 'M') {
134 bool is_match = *query == *subject;
135 int len = 0;
136 for(int l = 0; l < i->m_len; ++l) {
137 if((*query == *subject) == is_match) {
138 ++len;
139 } else {
140 cigar += NStr::IntToString(len)+ (is_match ? "=" : "X");
141 is_match = !is_match;
142 len = 1;
143 }
144 ++query;
145 ++subject;
146 }
147 cigar += NStr::IntToString(len)+ (is_match ? "=" : "X");
148 } else if(i->m_type == 'D') {
149 cigar += NStr::IntToString(i->m_len)+i->m_type;
150 subject += i->m_len;
151 } else {
152 cigar += NStr::IntToString(i->m_len)+i->m_type;
153 query += i->m_len;
154 }
155 }
156
157 int missingstart = qstart+m_qfrom;
158 if(missingstart > 0)
159 cigar = NStr::IntToString(missingstart)+"S"+cigar;
160 int missingend = qlen-1-m_qto-qstart;
161 if(missingend > 0)
162 cigar += NStr::IntToString(missingend)+"S";
163
164 return cigar;
165 }
166
ToAlign(const char * query,const char * subject) const167 TCharAlign CCigar::ToAlign(const char* query, const char* subject) const {
168 TCharAlign align;
169 query += m_qfrom;
170 subject += m_sfrom;
171 ITERATE(list<SElement>, i, m_elements) {
172 if(i->m_type == 'M') {
173 align.first.insert(align.first.end(), query, query+i->m_len);
174 query += i->m_len;
175 align.second.insert(align.second.end(), subject, subject+i->m_len);
176 subject += i->m_len;
177 } else if(i->m_type == 'D') {
178 align.first.insert(align.first.end(), i->m_len, '-');
179 align.second.insert(align.second.end(), subject, subject+i->m_len);
180 subject += i->m_len;
181 } else {
182 align.first.insert(align.first.end(), query, query+i->m_len);
183 query += i->m_len;
184 align.second.insert(align.second.end(), i->m_len, '-');
185 }
186 }
187
188 return align;
189 }
190
Matches(const char * query,const char * subject) const191 int CCigar::Matches(const char* query, const char* subject) const {
192 int matches = 0;
193 query += m_qfrom;
194 subject += m_sfrom;
195 ITERATE(list<SElement>, i, m_elements) {
196 if(i->m_type == 'M') {
197 for(int l = 0; l < i->m_len; ++l) {
198 if(*query == *subject)
199 ++matches;
200 ++query;
201 ++subject;
202 }
203 } else if(i->m_type == 'D') {
204 subject += i->m_len;
205 } else {
206 query += i->m_len;
207 }
208 }
209
210 return matches;
211 }
212
Distance(const char * query,const char * subject) const213 int CCigar::Distance(const char* query, const char* subject) const {
214 int dist = 0;
215 query += m_qfrom;
216 subject += m_sfrom;
217 ITERATE(list<SElement>, i, m_elements) {
218 if(i->m_type == 'M') {
219 for(int l = 0; l < i->m_len; ++l) {
220 if(*query != *subject)
221 ++dist;
222 ++query;
223 ++subject;
224 }
225 } else if(i->m_type == 'D') {
226 subject += i->m_len;
227 dist += i->m_len;
228 } else {
229 query += i->m_len;
230 dist += i->m_len;
231 }
232 }
233
234 return dist;
235 }
236
Score(const char * query,const char * subject,int gopen,int gapextend,const char delta[256][256]) const237 int CCigar::Score(const char* query, const char* subject, int gopen, int gapextend, const char delta[256][256]) const {
238 int score = 0;
239
240 query += m_qfrom;
241 subject += m_sfrom;
242 ITERATE(list<SElement>, i, m_elements) {
243 if(i->m_type == 'M') {
244 for(int l = 0; l < i->m_len; ++l) {
245 score += delta[(int)*query][(int)*subject];
246 ++query;
247 ++subject;
248 }
249 } else if(i->m_type == 'D') {
250 subject += i->m_len;
251 score -= gopen+gapextend*i->m_len;
252 } else {
253 query += i->m_len;
254 score -= gopen+gapextend*i->m_len;
255 }
256 }
257
258 return score;
259 }
260
261
GlbAlign(const char * a,int na,const char * b,int nb,int rho,int sigma,const char delta[256][256])262 CCigar GlbAlign(const char* a, int na, const char* b, int nb, int rho, int sigma, const char delta[256][256]) {
263 // rho - new gap penalty (one base gap rho+sigma)
264 // sigma - extension penalty
265
266 int* s = new int[nb+1]; // best scores in current a-raw
267 int* sm = new int[nb+1]; // best scores in previous a-raw
268 int* gapb = new int[nb+1]; // best score with b-gap
269 char* mtrx = new char[(na+1)*(nb+1)]; // backtracking info (Astart/Bstart gap start, Agap/Bgap best score has gap and should be backtracked to Asrt/Bsart)
270
271 int rs = rho+sigma;
272 int bignegative = numeric_limits<int>::min()/2;
273
274 sm[0] = 0;
275 sm[1] = -rs; // scores for -------------- (the best scores for i == -1)
276 for(int i = 2; i <= nb; ++i) // BBBBBBBBBBBBBB
277 sm[i] = sm[i-1]-sigma;
278 s[0] = -rs; // score for A (the best score for j == -1 and i == 0)
279 // -
280 for(int i = 0; i <= nb; ++i)
281 gapb[i] = bignegative;
282
283 enum{Agap = 1, Bgap = 2, Astart = 4, Bstart = 8};
284
285 mtrx[0] = 0;
286 for(int i = 1; i <= nb; ++i) { // ---------------
287 mtrx[i] = Agap; // BBBBBBBBBBBBBBB
288 }
289 mtrx[1] |= Astart;
290
291 char* m = mtrx+nb;
292 for(int i = 0; i < na; ++i) {
293 *(++m) = Bstart|Bgap; //AAAAAAAAAAAAAAA
294 //---------------
295 int gapa = bignegative;
296 int ai = a[i];
297 const char* matrix = delta[ai];
298 int* sp = s;
299 for(int j = 0; j < nb; ) {
300 *(++m) = 0;
301 int ss = sm[j]+matrix[(int)b[j]];
302
303 gapa -= sigma;
304 if(*sp-rs > gapa) { // for j == 0 this will open AAAAAAAAAAA- which could be used if mismatch is very expensive
305 gapa = *sp-rs; // -----------B
306 *m |= Astart;
307 }
308
309 int& gapbj = gapb[++j];
310 gapbj -= sigma;
311 if(sm[j]-rs > gapbj) { // for i == 0 this will open BBBBBBBBBBB- which could be used if mismatch is very expensive
312 gapbj = sm[j]-rs; // -----------A
313 *m |= Bstart;
314 }
315
316 if(gapa > gapbj) {
317 if(ss > gapa) {
318 *(++sp) = ss;
319 } else {
320 *(++sp) = gapa;
321 *m |= Agap;
322 }
323 } else {
324 if(ss > gapbj) {
325 *(++sp) = ss;
326 } else {
327 *(++sp) = gapbj;
328 *m |= Bgap;
329 }
330 }
331 }
332 swap(sm,s);
333 *s = *sm-sigma;
334 }
335
336 int ia = na-1;
337 int ib = nb-1;
338 m = mtrx+(na+1)*(nb+1)-1;
339 CCigar track(ia, ib);
340 while(ia >= 0 || ib >= 0) {
341 if(*m&Agap) {
342 int len = 1;
343 while(!(*m&Astart)) {
344 ++len;
345 --m;
346 }
347 --m;
348 ib -= len;
349 track.PushFront(CCigar::SElement(len,'D'));
350 } else if(*m&Bgap) {
351 int len = 1;
352 while(!(*m&Bstart)) {
353 ++len;
354 m -= nb+1;
355 }
356 m -= nb+1;
357 ia -= len;
358 track.PushFront(CCigar::SElement(len,'I'));
359 } else {
360 track.PushFront(CCigar::SElement(1,'M'));
361 --ia;
362 --ib;
363 m -= nb+2;
364 }
365 }
366
367 delete[] s;
368 delete[] sm;
369 delete[] gapb;
370 delete[] mtrx;
371
372 return track;
373 }
374
375
LclAlign(const char * a,int na,const char * b,int nb,int rho,int sigma,const char delta[256][256])376 CCigar LclAlign(const char* a, int na, const char* b, int nb, int rho, int sigma, const char delta[256][256]) {
377 // rho - new gap penalty (one base gap rho+sigma)
378 // sigma - extension penalty
379
380 int* s = new int[nb+1]; // best scores in current a-raw
381 int* sm = new int[nb+1]; // best scores in previous a-raw
382 int* gapb = new int[nb+1]; // best score with b-gap
383 char* mtrx = new char[(na+1)*(nb+1)]; // backtracking info (Astart/Bstart gap start, Agap/Bgap best score has gap and should be backtracked to Asrt/Bsart; Zero stop bactracking)
384
385 int rs = rho+sigma;
386
387 enum{Agap = 1, Bgap = 2, Astart = 4, Bstart = 8, Zero = 16};
388
389 for(int i = 0; i <= nb; ++i) {
390 sm[i] = 0;
391 mtrx[i] = Zero;
392 gapb[i] = 0;
393 }
394 s[0] = 0;
395
396 int max_score = 0;
397 char* max_ptr = mtrx;
398 char* m = mtrx+nb;
399
400 for(int i = 0; i < na; ++i) {
401 *(++m) = Zero;
402 int gapa = 0;
403 int ai = a[i];
404 const char* matrix = delta[ai];
405 int* sp = s;
406 for(int j = 0; j < nb; ) {
407 *(++m) = 0;
408 int ss = sm[j]+matrix[(int)b[j]];
409
410 gapa -= sigma;
411 if(*sp-rs > gapa) {
412 gapa = *sp-rs;
413 *m |= Astart;
414 }
415
416 int& gapbj = gapb[++j];
417 gapbj -= sigma;
418 if(sm[j]-rs > gapbj) {
419 gapbj = sm[j]-rs;
420 *m |= Bstart;
421 }
422
423 if(gapa > gapbj) {
424 if(ss > gapa) {
425 *(++sp) = ss;
426 if(ss > max_score) {
427 max_score = ss;
428 max_ptr = m;
429 }
430 } else {
431 *(++sp) = gapa;
432 *m |= Agap;
433 }
434 } else {
435 if(ss > gapbj) {
436 *(++sp) = ss;
437 if(ss > max_score) {
438 max_score = ss;
439 max_ptr = m;
440 }
441 } else {
442 *(++sp) = gapbj;
443 *m |= Bgap;
444 }
445 }
446 if(*sp <= 0) {
447 *sp = 0;
448 *m |= Zero;
449 }
450 }
451 swap(sm,s);
452 }
453
454 int ia = (max_ptr-mtrx)/(nb+1)-1;
455 int ib = (max_ptr-mtrx)%(nb+1)-1;
456 CCigar track(ia, ib);
457 m = max_ptr;
458 while((ia >= 0 || ib >= 0) && !(*m&Zero)) {
459 if(*m&Agap) {
460 int len = 1;
461 while(!(*m&Astart)) {
462 ++len;
463 --m;
464 }
465 --m;
466 ib -= len;
467 track.PushFront(CCigar::SElement(len,'D'));
468 } else if(*m&Bgap) {
469 int len = 1;
470 while(!(*m&Bstart)) {
471 ++len;
472 m -= nb+1;
473 }
474 m -= nb+1;
475 ia -= len;
476 track.PushFront(CCigar::SElement(len,'I'));
477 } else {
478 track.PushFront(CCigar::SElement(1,'M'));
479 --ia;
480 --ib;
481 m -= nb+2;
482 }
483 }
484
485 delete[] s;
486 delete[] sm;
487 delete[] gapb;
488 delete[] mtrx;
489
490 return track;
491 }
492
493
LclAlign(const char * a,int na,const char * b,int nb,int rho,int sigma,bool pinleft,bool pinright,const char delta[256][256])494 CCigar LclAlign(const char* a, int na, const char* b, int nb, int rho, int sigma, bool pinleft, bool pinright, const char delta[256][256]) {
495 // rho - new gap penalty (one base gap rho+sigma)
496 // sigma - extension penalty
497
498 int* s = new int[nb+1]; // best scores in current a-raw
499 int* sm = new int[nb+1]; // best scores in previous a-raw
500 int* gapb = new int[nb+1]; // best score with b-gap
501 char* mtrx = new char[(na+1)*(nb+1)]; // backtracking info (Astart/Bstart gap start, Agap/Bgap best score has gap and should be backtracked to Asrt/Bsart; Zero stop bactracking)
502
503 int rs = rho+sigma;
504 int bignegative = numeric_limits<int>::min()/2;
505
506 enum{Agap = 1, Bgap = 2, Astart = 4, Bstart = 8, Zero = 16};
507
508 sm[0] = 0;
509 mtrx[0] = 0;
510 gapb[0] = bignegative; // not used
511 if(pinleft) {
512 if(nb > 0) {
513 sm[1] = -rs;
514 mtrx[1] = Astart|Agap;
515 gapb[1] = bignegative;
516 for(int i = 2; i <= nb; ++i) {
517 sm[i] = sm[i-1]-sigma;
518 mtrx[i] = Agap;
519 gapb[i] = bignegative;
520 }
521 }
522 s[0] = -rs;
523 } else {
524 for(int i = 1; i <= nb; ++i) {
525 sm[i] = 0;
526 mtrx[i] = Zero;
527 gapb[i] = bignegative;
528 }
529 s[0] = 0;
530 }
531
532 int max_score = 0;
533 char* max_ptr = mtrx;
534
535 char* m = mtrx+nb;
536 for(int i = 0; i < na; ++i) {
537 *(++m) = pinleft ? Bstart|Bgap : Zero;
538 int gapa = bignegative;
539 int ai = a[i];
540
541 const char* matrix = delta[ai];
542 int* sp = s;
543 for(int j = 0; j < nb; ) {
544 *(++m) = 0;
545 int ss = sm[j]+matrix[(int)b[j]];
546
547 gapa -= sigma;
548 if(*sp-rs > gapa) {
549 gapa = *sp-rs;
550 *m |= Astart;
551 }
552
553 int& gapbj = gapb[++j];
554 gapbj -= sigma;
555 if(sm[j]-rs > gapbj) {
556 gapbj = sm[j]-rs;
557 *m |= Bstart;
558 }
559
560 if(gapa > gapbj) {
561 if(ss > gapa) {
562 *(++sp) = ss;
563 if(ss > max_score) {
564 max_score = ss;
565 max_ptr = m;
566 }
567 } else {
568 *(++sp) = gapa;
569 *m |= Agap;
570 }
571 } else {
572 if(ss > gapbj) {
573 *(++sp) = ss;
574 if(ss > max_score) {
575 max_score = ss;
576 max_ptr = m;
577 }
578 } else {
579 *(++sp) = gapbj;
580 *m |= Bgap;
581 }
582 }
583 if(*sp <= 0 && !pinleft) {
584 *sp = 0;
585 *m |= Zero;
586 }
587 }
588 swap(sm,s);
589 if(pinleft)
590 *s = *sm-sigma;
591 }
592
593 int maxa, maxb;
594 if(pinright) {
595 maxa = na-1;
596 maxb = nb-1;
597 max_score = sm[nb];
598 } else {
599 maxa = (max_ptr-mtrx)/(nb+1)-1;
600 maxb = (max_ptr-mtrx)%(nb+1)-1;
601 m = max_ptr;
602 }
603 int ia = maxa;
604 int ib = maxb;
605 CCigar track(ia, ib);
606 while((ia >= 0 || ib >= 0) && !(*m&Zero)) {
607 if(*m&Agap) {
608 int len = 1;
609 while(!(*m&Astart)) {
610 ++len;
611 --m;
612 }
613 --m;
614 ib -= len;
615 track.PushFront(CCigar::SElement(len,'D'));
616 } else if(*m&Bgap) {
617 int len = 1;
618 while(!(*m&Bstart)) {
619 ++len;
620 m -= nb+1;
621 }
622 m -= nb+1;
623 ia -= len;
624 track.PushFront(CCigar::SElement(len,'I'));
625 } else {
626 track.PushFront(CCigar::SElement(1,'M'));
627 --ia;
628 --ib;
629 m -= nb+2;
630 }
631 }
632
633 delete[] s;
634 delete[] sm;
635 delete[] gapb;
636 delete[] mtrx;
637
638 return track;
639 }
640
VariBandAlign(const char * a,int na,const char * b,int nb,int rho,int sigma,const char delta[256][256],const TSignedSeqRange * blimits)641 CCigar VariBandAlign(const char* a, int na, const char* b, int nb, int rho, int sigma, const char delta[256][256], const TSignedSeqRange* blimits) {
642 // rho - new gap penalty (one base gap rho+sigma)
643 // sigma - extension penalty
644
645 int* s = new int[nb+1]; // best scores in current a-raw
646 int* sm = new int[nb+1]; // best scores in previous a-raw
647 int* gapb = new int[nb+1]; // best score with b-gap
648 char* mtrx = new char[(na+1)*(nb+1)]; // backtracking info (Astart/Bstart gap start, Agap/Bgap best score has gap and should be backtracked to Asrt/Bsart; Zero stop bactracking)
649
650 int rs = rho+sigma;
651
652 enum{Agap = 1, Bgap = 2, Astart = 4, Bstart = 8, Zero = 16};
653
654 for(int i = 0; i <= nb; ++i) {
655 s[i] = 0;
656 sm[i] = 0;
657 gapb[i] = 0;
658 mtrx[i] = Zero;
659 }
660
661 int max_score = 0;
662 char* max_ptr = mtrx;
663 char* m = mtrx+nb;
664
665 const TSignedSeqRange* last = blimits+na;
666 while(true) {
667 int ai = *a++;
668 const char* matrix = delta[ai];
669
670 int bleft = blimits->GetFrom();
671 int bright = blimits->GetTo();
672 m += bleft;
673 *(++m) = Zero;
674 int gapa = 0;
675 int* sp = s+bleft;
676 *sp = 0;
677 for(int j = bleft; j <= bright; ) {
678 *(++m) = 0;
679 int ss = sm[j]+matrix[(int)b[j]];
680
681 gapa -= sigma;
682 if(*sp-rs > gapa) {
683 gapa = *sp-rs;
684 *m |= Astart;
685 }
686
687 int& gapbj = gapb[++j];
688 gapbj -= sigma;
689 if(sm[j]-rs > gapbj) {
690 gapbj = sm[j]-rs;
691 *m |= Bstart;
692 }
693
694 if(gapa > gapbj) {
695 if(ss > gapa) {
696 *(++sp) = ss;
697 if(ss > max_score) {
698 max_score = ss;
699 max_ptr = m;
700 }
701 } else {
702 *(++sp) = gapa;
703 *m |= Agap;
704 }
705 } else {
706 if(ss > gapbj) {
707 *(++sp) = ss;
708 if(ss > max_score) {
709 max_score = ss;
710 max_ptr = m;
711 }
712 } else {
713 *(++sp) = gapbj;
714 *m |= Bgap;
715 }
716 }
717 if(*sp <= 0) {
718 *sp = 0;
719 *m |= Zero;
720 }
721 }
722 if(++blimits == last)
723 break;
724
725 swap(sm,s);
726
727 int next = blimits->GetTo();
728 for( ; bright < next-1; ++bright)
729 *(++m) = Zero;
730
731 m += nb-bright-1;
732 }
733
734 int ia = (max_ptr-mtrx)/(nb+1)-1;
735 int ib = (max_ptr-mtrx)%(nb+1)-1;
736 CCigar track(ia, ib);
737 m = max_ptr;
738 while((ia >= 0 || ib >= 0) && !(*m&Zero)) {
739 if(*m&Agap) {
740 int len = 1;
741 while(!(*m&Astart)) {
742 ++len;
743 --m;
744 }
745 --m;
746 ib -= len;
747 track.PushFront(CCigar::SElement(len,'D'));
748 } else if(*m&Bgap) {
749 int len = 1;
750 while(!(*m&Bstart)) {
751 ++len;
752 m -= nb+1;
753 }
754 m -= nb+1;
755 ia -= len;
756 track.PushFront(CCigar::SElement(len,'I'));
757 } else {
758 track.PushFront(CCigar::SElement(1,'M'));
759 --ia;
760 --ib;
761 m -= nb+2;
762 }
763 }
764
765 delete[] s;
766 delete[] sm;
767 delete[] gapb;
768 delete[] mtrx;
769
770 return track;
771 }
772
773
SMatrix(int match,int mismatch)774 SMatrix::SMatrix(int match, int mismatch) { // matrix for DNA
775
776 for(int i = 0; i < 256; ++i) {
777 int c = toupper(i);
778 for(int j = 0; j < 256; ++j) {
779 if(c != 'N' && c == toupper(j)) matrix[i][j] = match;
780 else matrix[i][j] = -mismatch;
781 }
782 }
783 }
784
SMatrix()785 SMatrix::SMatrix() { // matrix for proteins
786
787 string aa("ARNDCQEGHILKMFPSTWYVBZX*");
788 int scores[] = {
789 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-2,-1, 0,-4,
790 -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-1, 0,-1,-4,
791 -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3, 3, 0,-1,-4,
792 -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3, 4, 1,-1,-4,
793 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-3,-3,-2,-4,
794 -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2, 0, 3,-1,-4,
795 -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2, 1, 4,-1,-4,
796 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-1,-2,-1,-4,
797 -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3, 0, 0,-1,-4,
798 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-3,-3,-1,-4,
799 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-3,-1,-4,
800 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2, 0, 1,-1,-4,
801 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-3,-1,-1,-4,
802 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-3,-3,-1,-4,
803 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-2,-1,-2,-4,
804 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2, 0, 0, 0,-4,
805 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-1,-1, 0,-4,
806 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-3,-2,-4,
807 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-3,-2,-1,-4,
808 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-3,-2,-1,-4,
809 -2,-1, 3, 4,-3, 0, 1,-1, 0,-3,-4, 0,-3,-3,-2, 0,-1,-4,-3,-3, 4, 1,-1,-4,
810 -1, 0, 0, 1,-3, 3, 4,-2, 0,-3,-3, 1,-1,-3,-1, 0,-1,-3,-2,-2, 1, 4,-1,-4,
811 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-1,-1,-1,-4,
812 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1
813 };
814
815 for(int i = 0; i < 256; ++i) {
816 for(int j = 0; j < 256; ++j) {
817 matrix[i][j] = 0;
818 }
819 }
820
821 int num = aa.size();
822 for(int i = 0; i < num; ++i) {
823 char c = aa[i];
824 for(int j = 0; j < num; ++j) {
825 int score = scores[num*j+i];
826 char d = aa[j];
827 matrix[(int)c][(int)d] = score;
828 matrix[(int)tolower(c)][(int)tolower(d)] = score;
829 matrix[(int)c][(int)tolower(d)] = score;
830 matrix[(int)tolower(c)][(int)d] = score;
831 }
832 }
833 }
834
Entropy(const string & seq)835 double Entropy(const string& seq) {
836 int length = seq.size();
837 if(length == 0)
838 return 0;
839 double tA = 1.e-8;
840 double tC = 1.e-8;
841 double tG = 1.e-8;
842 double tT = 1.e-8;
843 ITERATE(string, i, seq) {
844 switch(*i) {
845 case 'A': tA += 1; break;
846 case 'C': tC += 1; break;
847 case 'G': tG += 1; break;
848 case 'T': tT += 1; break;
849 default: break;
850 }
851 }
852 double entropy = -(tA*log(tA/length)+tC*log(tC/length)+tG*log(tG/length)+tT*log(tT/length))/(length*log(4.));
853
854 return entropy;
855 }
856
857
858
859 END_SCOPE(gnomon)
860 END_SCOPE(ncbi)
861
862
863
864