1 /*
2  * stdaln.c -- standard alignment (local and banded global alignment)
3  *
4  * Copyright (c) 2003-2006, Li Heng <liheng@genomics.org.cn>
5  *                                  <lh3lh3@gmail.com>
6  *
7  * This library is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * This library is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with this library; if not, write to the Free Software
19  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20  *
21  */
22 
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <string.h>
26 #include "stdaln.h"
27 
28 /* char -> 17 (=16+1) nucleotides */
29 unsigned char aln_nt16_table[256] = {
30 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
31 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
32 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,16 /*'-'*/,15,15,
33 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
34 	15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15,
35 	15,15, 3, 6,  8,15, 7, 9,  0,12,15,15, 15,15,15,15,
36 	15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15,
37 	15,15, 3, 6,  8,15, 7, 9,  0,12,15,15, 15,15,15,15,
38 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
39 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
40 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
41 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
42 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
43 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
44 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
45 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
46 };
47 char *aln_nt16_rev_table = "XAGRCMSVTWKDYHBN-";
48 
49 /* char -> 5 (=4+1) nucleotides */
50 unsigned char aln_nt4_table[256] = {
51 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
52 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
53 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 5 /*'-'*/, 4, 4,
54 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
55 	4, 0, 4, 2,  4, 4, 4, 1,  4, 4, 4, 4,  4, 4, 4, 4,
56 	4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
57 	4, 0, 4, 2,  4, 4, 4, 1,  4, 4, 4, 4,  4, 4, 4, 4,
58 	4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
59 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
60 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
61 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
62 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
63 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
64 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
65 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
66 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
67 };
68 char *aln_nt4_rev_table = "AGCTN-";
69 
70 /* char -> 22 (=20+1+1) amino acids */
71 unsigned char aln_aa_table[256] = {
72 	21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
73 	21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
74 	21,21,21,21, 21,21,21,21, 21,21,20,21, 21,22 /*'-'*/,21,21,
75 	21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
76 	21, 0,21, 4,  3, 6,13, 7,  8, 9,21,11, 10,12, 2,21,
77 	14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21,
78 	21, 0,21, 4,  3, 6,13, 7,  8, 9,21,11, 10,12, 2,21,
79 	14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21,
80 	21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
81 	21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
82 	21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
83 	21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
84 	21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
85 	21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
86 	21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
87 	21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21
88 };
89 char *aln_aa_rev_table = "ARNDCQEGHILKMFPSTWYV*X-";
90                        /* 01234567890123456789012 */
91 
92 /* translation table. They are useless in stdaln.c, but when you realize you need it, you need not write the table again. */
93 unsigned char aln_trans_table_eu[66] = {
94 	11,11, 2, 2,  1, 1,15,15, 16,16,16,16,  9,12, 9, 9,
95 	 6, 6, 3, 3,  7, 7, 7, 7,  0, 0, 0, 0, 19,19,19,19,
96 	 5, 5, 8, 8,  1, 1, 1, 1, 14,14,14,14, 10,10,10,10,
97 	20,20,18,18, 20,17, 4, 4, 15,15,15,15, 10,10,13,13, 21, 22
98 };
99 char *aln_trans_table_eu_char = "KKNNRRSSTTTTIMIIEEDDGGGGAAAAVVVVQQHHRRRRPPPPLLLL**YY*WCCSSSSLLFFX";
100                               /* 01234567890123456789012345678901234567890123456789012345678901234 */
101 int aln_sm_blosum62[] = {
102 /*	 A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  *  X */
103 	 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
104 	-1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
105 	-2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
106 	-2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
107 	 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
108 	-1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
109 	-1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
110 	 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
111 	-2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
112 	-1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
113 	-1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
114 	-1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
115 	-1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
116 	-2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
117 	-1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
118 	 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
119 	 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
120 	-3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
121 	-2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
122 	 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
123 	-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
124 	 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
125 };
126 
127 int aln_sm_blosum45[] = {
128 /*	 A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  *  X */
129 	 5,-2,-1,-2,-1,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-2,-2, 0,-5, 0,
130 	-2, 7, 0,-1,-3, 1, 0,-2, 0,-3,-2, 3,-1,-2,-2,-1,-1,-2,-1,-2,-5,-1,
131 	-1, 0, 6, 2,-2, 0, 0, 0, 1,-2,-3, 0,-2,-2,-2, 1, 0,-4,-2,-3,-5,-1,
132 	-2,-1, 2, 7,-3, 0, 2,-1, 0,-4,-3, 0,-3,-4,-1, 0,-1,-4,-2,-3,-5,-1,
133 	-1,-3,-2,-3,12,-3,-3,-3,-3,-3,-2,-3,-2,-2,-4,-1,-1,-5,-3,-1,-5,-2,
134 	-1, 1, 0, 0,-3, 6, 2,-2, 1,-2,-2, 1, 0,-4,-1, 0,-1,-2,-1,-3,-5,-1,
135 	-1, 0, 0, 2,-3, 2, 6,-2, 0,-3,-2, 1,-2,-3, 0, 0,-1,-3,-2,-3,-5,-1,
136 	 0,-2, 0,-1,-3,-2,-2, 7,-2,-4,-3,-2,-2,-3,-2, 0,-2,-2,-3,-3,-5,-1,
137 	-2, 0, 1, 0,-3, 1, 0,-2,10,-3,-2,-1, 0,-2,-2,-1,-2,-3, 2,-3,-5,-1,
138 	-1,-3,-2,-4,-3,-2,-3,-4,-3, 5, 2,-3, 2, 0,-2,-2,-1,-2, 0, 3,-5,-1,
139 	-1,-2,-3,-3,-2,-2,-2,-3,-2, 2, 5,-3, 2, 1,-3,-3,-1,-2, 0, 1,-5,-1,
140 	-1, 3, 0, 0,-3, 1, 1,-2,-1,-3,-3, 5,-1,-3,-1,-1,-1,-2,-1,-2,-5,-1,
141 	-1,-1,-2,-3,-2, 0,-2,-2, 0, 2, 2,-1, 6, 0,-2,-2,-1,-2, 0, 1,-5,-1,
142 	-2,-2,-2,-4,-2,-4,-3,-3,-2, 0, 1,-3, 0, 8,-3,-2,-1, 1, 3, 0,-5,-1,
143 	-1,-2,-2,-1,-4,-1, 0,-2,-2,-2,-3,-1,-2,-3, 9,-1,-1,-3,-3,-3,-5,-1,
144 	 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-3,-1,-2,-2,-1, 4, 2,-4,-2,-1,-5, 0,
145 	 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-1,-1, 2, 5,-3,-1, 0,-5, 0,
146 	-2,-2,-4,-4,-5,-2,-3,-2,-3,-2,-2,-2,-2, 1,-3,-4,-3,15, 3,-3,-5,-2,
147 	-2,-1,-2,-2,-3,-1,-2,-3, 2, 0, 0,-1, 0, 3,-3,-2,-1, 3, 8,-1,-5,-1,
148 	 0,-2,-3,-3,-1,-3,-3,-3,-3, 3, 1,-2, 1, 0,-3,-1, 0,-3,-1, 5,-5,-1,
149 	-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5, 1,-5,
150 	 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0,-2,-1,-1,-5,-1
151 };
152 
153 int aln_sm_nt[] = {
154 /*	 X  A  G  R  C  M  S  V  T  W  K  D  Y  H  B  N */
155 	-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
156 	-2, 2,-1, 1,-2, 1,-2, 0,-2, 1,-2, 0,-2, 0,-2, 0,
157 	-2,-1, 2, 1,-2,-2, 1, 0,-2,-2, 1, 0,-2,-2, 0, 0,
158 	-2, 1, 1, 1,-2,-1,-1, 0,-2,-1,-1, 0,-2, 0, 0, 0,
159 	-2,-2,-2,-2, 2, 1, 1, 0,-1,-2,-2,-2, 1, 0, 0, 0,
160 	-2, 1,-2,-1, 1, 1,-1, 0,-2,-1,-2, 0,-1, 0, 0, 0,
161 	-2,-2, 1,-1, 1,-1, 1, 0,-2,-2,-1, 0,-1, 0, 0, 0,
162 	-2, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0,
163 	-2,-2,-2,-2,-1,-2,-2,-2, 2, 1, 1, 0, 1, 0, 0, 0,
164 	-2, 1,-2,-1,-2,-1,-2, 0, 1, 1,-1, 0,-1, 0, 0, 0,
165 	-2,-2, 1,-1,-2,-2,-1, 0, 1,-1, 1, 0,-1, 0, 0, 0,
166 	-2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
167 	-2,-2,-2,-2, 1,-1,-1, 0, 1,-1,-1, 0, 1, 0, 0, 0,
168 	-2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
169 	-2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
170 	-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
171 };
172 
173 int aln_sm_read[] = {
174 /*	  X   A   G   R   C   M   S   V   T   W   K   D   Y   H   B   N  */
175 	-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,
176 	-17,  2,-17,  1,-17,  1,-17,  0,-17,  1,-17,  0,-17,  0,-17,  0,
177 	-17,-17,  2,  1,-17,-17,  1,  0,-17,-17,  1,  0,-17,-17,  0,  0,
178 	-17,  1,  1,  1,-17,-17,-17,  0,-17,-17,-17,  0,-17,  0,  0,  0,
179 	-17,-17,-17,-17,  2,  1,  1,  0,-17,-17,-17,-17,  1,  0,  0,  0,
180 	-17,  1,-17,-17,  1,  1,-17,  0,-17,-17,-17,  0,-17,  0,  0,  0,
181 	-17,-17,  1,-17,  1,-17,  1,  0,-17,-17,-17,  0,-17,  0,  0,  0,
182 	-17,  0,  0,  0,  0,  0,  0,  0,-17,  0,  0,  0,  0,  0,  0,  0,
183 	-17,-17,-17,-17,-17,-17,-17,-17,  2,  1,  1,  0,  1,  0,  0,  0,
184 	-17,  1,-17,-17,-17,-17,-17,  0,  1,  1,-17,  0,-17,  0,  0,  0,
185 	-17,-17,  1,-17,-17,-17,-17,  0,  1,-17,  1,  0,-17,  0,  0,  0,
186 	-17,  0,  0,  0,-17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
187 	-17,-17,-17,-17,  1,-17,-17,  0,  1,-17,-17,  0,  1,  0,  0,  0,
188 	-17,  0,-17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
189 	-17,-17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
190 	-17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
191 };
192 
193 int aln_sm_hs[] = {
194 /*     A    G    C    T    N */
195 	  91, -31,-114,-123, -44,
196 	 -31, 100,-125,-114, -42,
197 	-123,-125, 100, -31, -42,
198 	-114,-114, -31,  91, -42,
199 	 -44, -42, -42, -42, -43
200 };
201 
202 /********************/
203 /* START OF align.c */
204 /********************/
205 
206 AlnParam aln_param_nt2nt   = { 10,  2,  2, aln_sm_nt, 16, 75 };
207 AlnParam aln_param_rd2rd   = { 20, 19, 19, aln_sm_read, 16, 75 };
208 AlnParam aln_param_aa2aa   = { 12,  2,  2, aln_sm_blosum62, 22, 50 };
209 
aln_init_AlnAln()210 AlnAln *aln_init_AlnAln()
211 {
212 	AlnAln *aa;
213 	aa = (AlnAln*)MYALLOC(sizeof(AlnAln));
214 	aa->path = 0;
215 	aa->out1 = aa->out2 = aa->outm = 0;
216 	aa->path_len = 0;
217 	return aa;
218 }
aln_free_AlnAln(AlnAln * aa)219 void aln_free_AlnAln(AlnAln *aa)
220 {
221 	MYFREE(aa->path);
222 	MYFREE(aa->out1);
223 	MYFREE(aa->out2);
224 	MYFREE(aa->outm);
225 	MYFREE(aa);
226 }
227 
228 /***************************/
229 /* START OF common_align.c */
230 /***************************/
231 
232 #define LOCAL_OVERFLOW_THRESHOLD 32000
233 #define LOCAL_OVERFLOW_REDUCE 16000
234 #define NT_LOCAL_SCORE int
235 #define NT_LOCAL_SHIFT 16
236 #define NT_LOCAL_MASK 0xffff
237 
238 #define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
239 
240 #define set_M(MM, cur, p, sc)							\
241 {														\
242 	if ((p)->M >= (p)->I) {								\
243 		if ((p)->M >= (p)->D) {							\
244 			(MM) = (p)->M + (sc); (cur)->Mt = FROM_M;	\
245 		} else {										\
246 			(MM) = (p)->D + (sc); (cur)->Mt = FROM_D;	\
247 		}												\
248 	} else {											\
249 		if ((p)->I > (p)->D) {							\
250 			(MM) = (p)->I + (sc); (cur)->Mt = FROM_I;	\
251 		} else {										\
252 			(MM) = (p)->D + (sc); (cur)->Mt = FROM_D;	\
253 		}												\
254 	}													\
255 }
256 #define set_I(II, cur, p)								\
257 {														\
258 	if ((p)->M - gap_open > (p)->I - gap_ext) {			\
259 		(cur)->It = FROM_M;								\
260 		(II) = (p)->M - gap_open;						\
261 	} else {											\
262 		(cur)->It = FROM_I;								\
263 		(II) = (p)->I - gap_ext;						\
264 	}													\
265 }
266 #define set_end_I(II, cur, p)							\
267 {														\
268 	if (gap_end >= 0) {									\
269 		if ((p)->M > (p)->I) {							\
270 			(cur)->It = FROM_M;							\
271 			(II) = (p)->M - gap_end;					\
272 		} else {										\
273 			(cur)->It = FROM_I;							\
274 			(II) = (p)->I - gap_end;					\
275 		}												\
276 	} else set_I(II, cur, p);							\
277 }
278 #define set_D(DD, cur, p)								\
279 {														\
280 	if ((p)->M - gap_open > (p)->D - gap_ext) {			\
281 		(cur)->Dt = FROM_M;								\
282 		(DD) = (p)->M - gap_open;						\
283 	} else {											\
284 		(cur)->Dt = FROM_D;								\
285 		(DD) = (p)->D - gap_ext;						\
286 	}													\
287 }
288 #define set_end_D(DD, cur, p)							\
289 {														\
290 	if (gap_end >= 0) {									\
291 		if ((p)->M > (p)->D) {							\
292 			(cur)->Dt = FROM_M;							\
293 			(DD) = (p)->M - gap_end;					\
294 		} else {										\
295 			(cur)->Dt = FROM_D;							\
296 			(DD) = (p)->D - gap_end;					\
297 		}												\
298 	} else set_D(DD, cur, p);							\
299 }
300 
301 typedef struct
302 {
303 	unsigned char Mt:3, It:2, Dt:2;
304 } dpcell_t;
305 
306 typedef struct
307 {
308 	int M, I, D;
309 } dpscore_t;
310 
311 /* build score profile for accelerating alignment, in theory */
aln_init_score_array(unsigned char * seq,int len,int row,int * score_matrix,int ** s_array)312 void aln_init_score_array(unsigned char *seq, int len, int row, int *score_matrix, int **s_array)
313 {
314 	int *tmp, *tmp2, i, k;
315 	for (i = 0; i != row; ++i) {
316 		tmp = score_matrix + i * row;
317 		tmp2 = s_array[i];
318 		for (k = 0; k != len; ++k)
319 			tmp2[k] = tmp[seq[k]];
320 	}
321 }
322 /***************************
323  * banded global alignment *
324  ***************************/
aln_global_core(unsigned char * seq1,int len1,unsigned char * seq2,int len2,const AlnParam * ap,path_t * path,int * path_len)325 int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
326 		path_t *path, int *path_len)
327 {
328 	register int i, j;
329 	dpcell_t **dpcell, *q;
330 	dpscore_t *curr, *last, *s;
331 	path_t *p;
332 	int b1, b2, tmp_end;
333 	int *mat, end, max;
334 	unsigned char type, ctype;
335 
336 	int gap_open, gap_ext, gap_end, b;
337 	int *score_matrix, N_MATRIX_ROW;
338 
339 	/* initialize some align-related parameters. just for compatibility */
340 	gap_open = ap->gap_open;
341 	gap_ext = ap->gap_ext;
342 	gap_end = ap->gap_end;
343 	b = ap->band_width;
344 	score_matrix = ap->matrix;
345 	N_MATRIX_ROW = ap->row;
346 
347 	if (len1 == 0 || len2 == 0) {
348 		*path_len = 0;
349 		return 0;
350 	}
351 	/* calculate b1 and b2 */
352 	if (len1 > len2) {
353 		b1 = len1 - len2 + b;
354 		b2 = b;
355 	} else {
356 		b1 = b;
357 		b2 = len2 - len1 + b;
358 	}
359 	if (b1 > len1) b1 = len1;
360 	if (b2 > len2) b2 = len2;
361 	--seq1; --seq2;
362 
363 	/* allocate memory */
364 	end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
365 	dpcell = (dpcell_t**)MYALLOC(sizeof(dpcell_t*) * (len2 + 1));
366 	for (j = 0; j <= len2; ++j)
367 		dpcell[j] = (dpcell_t*)MYALLOC(sizeof(dpcell_t) * end);
368 	for (j = b2 + 1; j <= len2; ++j)
369 		dpcell[j] -= j - b2;
370 	curr = (dpscore_t*)MYALLOC(sizeof(dpscore_t) * (len1 + 1));
371 	last = (dpscore_t*)MYALLOC(sizeof(dpscore_t) * (len1 + 1));
372 
373 	/* set first row */
374 	SET_INF(*curr); curr->M = 0;
375 	for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
376 		SET_INF(*s);
377 		set_end_D(s->D, dpcell[0] + i, s - 1);
378 	}
379 	s = curr; curr = last; last = s;
380 
381 	/* core dynamic programming, part 1 */
382 	tmp_end = (b2 < len2)? b2 : len2 - 1;
383 	for (j = 1; j <= tmp_end; ++j) {
384 		q = dpcell[j]; s = curr; SET_INF(*s);
385 		set_end_I(s->I, q, last);
386 		end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
387 		mat = score_matrix + seq2[j] * N_MATRIX_ROW;
388 		++s; ++q;
389 		for (i = 1; i != end; ++i, ++s, ++q) {
390 			set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
391 			set_I(s->I, q, last + i);
392 			set_D(s->D, q, s - 1);
393 		}
394 		set_M(s->M, q, last + i - 1, mat[seq1[i]]);
395 		set_D(s->D, q, s - 1);
396 		if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
397 			set_end_I(s->I, q, last + i);
398 		} else s->I = MINOR_INF;
399 		s = curr; curr = last; last = s;
400 	}
401 	/* last row for part 1, use set_end_D() instead of set_D() */
402 	if (j == len2 && b2 != len2 - 1) {
403 		q = dpcell[j]; s = curr; SET_INF(*s);
404 		set_end_I(s->I, q, last);
405 		end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
406 		mat = score_matrix + seq2[j] * N_MATRIX_ROW;
407 		++s; ++q;
408 		for (i = 1; i != end; ++i, ++s, ++q) {
409 			set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
410 			set_I(s->I, q, last + i);
411 			set_end_D(s->D, q, s - 1);
412 		}
413 		set_M(s->M, q, last + i - 1, mat[seq1[i]]);
414 		set_end_D(s->D, q, s - 1);
415 		if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
416 			set_end_I(s->I, q, last + i);
417 		} else s->I = MINOR_INF;
418 		s = curr; curr = last; last = s;
419 		++j;
420 	}
421 
422 	/* core dynamic programming, part 2 */
423 	for (; j <= len2 - b2 + 1; ++j) {
424 		SET_INF(curr[j - b2]);
425 		mat = score_matrix + seq2[j] * N_MATRIX_ROW;
426 		end = j + b1 - 1;
427 		for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
428 			set_M(s->M, q, last + i - 1, mat[seq1[i]]);
429 			set_I(s->I, q, last + i);
430 			set_D(s->D, q, s - 1);
431 		}
432 		set_M(s->M, q, last + i - 1, mat[seq1[i]]);
433 		set_D(s->D, q, s - 1);
434 		s->I = MINOR_INF;
435 		s = curr; curr = last; last = s;
436 	}
437 
438 	/* core dynamic programming, part 3 */
439 	for (; j < len2; ++j) {
440 		SET_INF(curr[j - b2]);
441 		mat = score_matrix + seq2[j] * N_MATRIX_ROW;
442 		for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
443 			set_M(s->M, q, last + i - 1, mat[seq1[i]]);
444 			set_I(s->I, q, last + i);
445 			set_D(s->D, q, s - 1);
446 		}
447 		set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
448 		set_end_I(s->I, q, last + i);
449 		set_D(s->D, q, s - 1);
450 		s = curr; curr = last; last = s;
451 	}
452 	/* last row */
453 	if (j == len2) {
454 		SET_INF(curr[j - b2]);
455 		mat = score_matrix + seq2[j] * N_MATRIX_ROW;
456 		for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
457 			set_M(s->M, q, last + i - 1, mat[seq1[i]]);
458 			set_I(s->I, q, last + i);
459 			set_end_D(s->D, q, s - 1);
460 		}
461 		set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
462 		set_end_I(s->I, q, last + i);
463 		set_end_D(s->D, q, s - 1);
464 		s = curr; curr = last; last = s;
465 	}
466 
467 	/* backtrace */
468 	i = len1; j = len2;
469 	q = dpcell[j] + i;
470 	s = last + len1;
471 	max = s->M; type = q->Mt; ctype = FROM_M;
472 	if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
473 	if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
474 
475 	p = path;
476 	p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
477 	++p;
478 	do {
479 		switch (ctype) {
480 			case FROM_M: --i; --j; break;
481 			case FROM_I: --j; break;
482 			case FROM_D: --i; break;
483 		}
484 		q = dpcell[j] + i;
485 		ctype = type;
486 		switch (type) {
487 			case FROM_M: type = q->Mt; break;
488 			case FROM_I: type = q->It; break;
489 			case FROM_D: type = q->Dt; break;
490 		}
491 		p->ctype = ctype; p->i = i; p->j = j;
492 		++p;
493 	} while (i || j);
494 	*path_len = p - path - 1;
495 
496 	/* free memory */
497 	for (j = b2 + 1; j <= len2; ++j)
498 		dpcell[j] += j - b2;
499 	for (j = 0; j <= len2; ++j)
500 		MYFREE(dpcell[j]);
501 	MYFREE(dpcell);
502 	MYFREE(curr); MYFREE(last);
503 
504 	return max;
505 }
506 /*************************************************
507  * local alignment combined with banded strategy *
508  *************************************************/
aln_local_core(unsigned char * seq1,int len1,unsigned char * seq2,int len2,const AlnParam * ap,path_t * path,int * path_len,int do_align)509 int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
510 				   path_t *path, int *path_len, int do_align)
511 {
512 	register NT_LOCAL_SCORE *s;
513 	register int i;
514 	int q, r, qr, tmp_len, qr_shift;
515 	int **s_array, *score_array;
516 	int e, f;
517 	int is_overflow, of_base;
518 	NT_LOCAL_SCORE *eh, curr_h, last_h, curr_last_h;
519 	int j, start_i, start_j, end_i, end_j;
520 	path_t *p;
521 	int score_f, score_r, score_g;
522 	int start, end, max_score;
523 
524 	int gap_open, gap_ext;
525 	int *score_matrix, N_MATRIX_ROW;
526 
527 	/* initialize some align-related parameters. just for compatibility */
528 	gap_open = ap->gap_open;
529 	gap_ext = ap->gap_ext;
530 //	b = ap->band_width;
531 	score_matrix = ap->matrix;
532 	N_MATRIX_ROW = ap->row;
533 
534 	if (len1 == 0 || len2 == 0) return -1;
535 
536 	/* allocate memory */
537 	eh = (NT_LOCAL_SCORE*)MYALLOC(sizeof(NT_LOCAL_SCORE) * (len1 + 1));
538 	s_array = (int**)MYALLOC(sizeof(int*) * N_MATRIX_ROW);
539 	for (i = 0; i != N_MATRIX_ROW; ++i)
540 		s_array[i] = (int*)MYALLOC(sizeof(int) * len1);
541 	/* initialization */
542 	aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array);
543 	q = gap_open - gap_ext;
544 	r = gap_ext;
545 	qr = q + r;
546 	qr_shift = (qr+1) << NT_LOCAL_SHIFT;
547 	tmp_len = len1 + 1;
548 	start_i = start_j = end_i = end_j = 0;
549 	for (i = 0, max_score = 0; i != N_MATRIX_ROW * N_MATRIX_ROW; ++i)
550 		if (max_score < score_matrix[i]) max_score = score_matrix[i];
551 	/* convert the coordinate */
552 	--seq1; --seq2;
553 	for (i = 0; i != N_MATRIX_ROW; ++i) --s_array[i];
554 
555 	/* forward dynamic programming */
556 	for (i = 0, s = eh; i != tmp_len; ++i, ++s) *s = 0;
557 	score_f = 0;
558 	is_overflow = of_base = 0;
559 	for (j = 1; j <= len2; ++j) {
560 		last_h = f = 0;
561 		score_array = s_array[seq2[j]];
562 		if (is_overflow) { /* adjust eh[] array if overflow occurs. */
563 			/* If LOCAL_OVERFLOW_REDUCE is too small, optimal alignment might be missed.
564 			 * If it is too large, this block will be excuted frequently and therefore
565 			 * slow down the whole program.
566 			 * Acually, smaller LOCAL_OVERFLOW_REDUCE might also help to reduce the
567 			 * number of assignments because it sets some cells to zero when overflow
568 			 * happens. */
569 			int tmp, tmp2;
570 			score_f -= LOCAL_OVERFLOW_REDUCE;
571 			of_base += LOCAL_OVERFLOW_REDUCE;
572 			is_overflow = 0;
573 			for (i = 1, s = eh; i <= tmp_len; ++i, ++s) {
574 				tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK;
575 				if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0;
576 				else tmp2 -= LOCAL_OVERFLOW_REDUCE;
577 				if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0;
578 				else tmp -= LOCAL_OVERFLOW_REDUCE;
579 				*s = (tmp << NT_LOCAL_SHIFT) | tmp2;
580 			}
581 		}
582 		for (i = 1, s = eh; i != tmp_len; ++i, ++s) {
583 			/* prepare for calculate current h */
584 			curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i];
585 			if (curr_h < 0) curr_h = 0;
586 			if (last_h > qr) { /* initialize f */
587 				f = (f > last_h - q)? f - r : last_h - qr;
588 				if (curr_h < f) curr_h = f;
589 			}
590 			if (*(s+1) >= qr_shift) { /* initialize e */
591 				curr_last_h = *(s+1) >> NT_LOCAL_SHIFT;
592 				e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr;
593 				if (curr_h < e) curr_h = e;
594 				*s = (last_h << NT_LOCAL_SHIFT) | e;
595 			} else *s = last_h << NT_LOCAL_SHIFT; /* e = 0 */
596 			last_h = curr_h;
597 			if (score_f < curr_h) {
598 				score_f = curr_h; end_i = i; end_j = j;
599 				if (score_f > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1;
600 			}
601 		}
602 		*s = last_h << NT_LOCAL_SHIFT;
603 	}
604 	score_f += of_base;
605 
606 	if (path == 0) goto end_func; /* skip path-filling */
607 
608 	/* reverse dynamic programming */
609 	for (i = end_i, s = eh + end_i; i >= 0; --i, --s) *s = 0;
610 	if (end_i == 0 || end_j == 0) goto end_func; /* no local match */
611 	score_r = score_matrix[seq1[end_i] * N_MATRIX_ROW + seq2[end_j]];
612 	is_overflow = of_base = 0;
613 	start_i = end_i; start_j = end_j;
614 	eh[end_i] = ((NT_LOCAL_SCORE)(qr + score_r)) << NT_LOCAL_SHIFT; /* in order to initialize f and e, 040408 */
615 	start = end_i - 1;
616 	end = end_i - 3;
617 	if (end <= 0) end = 0;
618 
619 	/* second pass DP can be done in a band, speed will thus be enhanced */
620 	for (j = end_j - 1; j != 0; --j) {
621 		last_h = f = 0;
622 		score_array = s_array[seq2[j]];
623 		if (is_overflow) { /* adjust eh[] array if overflow occurs. */
624 			int tmp, tmp2;
625 			score_r -= LOCAL_OVERFLOW_REDUCE;
626 			of_base += LOCAL_OVERFLOW_REDUCE;
627 			is_overflow = 0;
628 			for (i = start, s = eh + start + 1; i >= end; --i, --s) {
629 				tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK;
630 				if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0;
631 				else tmp2 -= LOCAL_OVERFLOW_REDUCE;
632 				if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0;
633 				else tmp -= LOCAL_OVERFLOW_REDUCE;
634 				*s = (tmp << NT_LOCAL_SHIFT) | tmp2;
635 			}
636 		}
637 		for (i = start, s = eh + start + 1; i != end; --i, --s) {
638 			/* prepare for calculate current h */
639 			curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i];
640 			if (curr_h < 0) curr_h = 0;
641 			if (last_h > qr) { /* initialize f */
642 				f = (f > last_h - q)? f - r : last_h - qr;
643 				if (curr_h < f) curr_h = f;
644 			}
645 			if (*(s-1) >= qr_shift) { /* initialize e */
646 				curr_last_h = *(s-1) >> NT_LOCAL_SHIFT;
647 				e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr;
648 				if (curr_h < e) curr_h = e;
649 				*s = (last_h << NT_LOCAL_SHIFT) | e;
650 			} else *s = last_h << NT_LOCAL_SHIFT; /* e = 0 */
651 			last_h = curr_h;
652 			if (score_r < curr_h) {
653 				score_r = curr_h; start_i = i; start_j = j;
654 				if (score_r + of_base - qr == score_f) {
655 					j = 1; break;
656 				}
657 				if (score_r > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1;
658 			}
659 		}
660 		*s = last_h << NT_LOCAL_SHIFT;
661 		/* recalculate start and end, the boundaries of the band */
662 		if ((eh[start] >> NT_LOCAL_SHIFT) <= qr) --start;
663 		if (start <= 0) start = 0;
664 		end = start_i - (start_j - j) - (score_r + of_base + (start_j - j) * max_score) / r - 1;
665 		if (end <= 0) end = 0;
666 	}
667 
668 	if (path_len == 0) {
669 		path[0].i = start_i; path[0].j = start_j;
670 		path[1].i = end_i; path[1].j = end_j;
671 		goto end_func;
672 	}
673 
674 	score_r += of_base;
675 	score_r -= qr;
676 
677 #ifdef DEBUG
678 	/* this seems not a bug */
679 	if (score_f != score_r)
680 		fprintf(stderr, "[aln_local_core] unknown flaw occurs: score_f(%d) != score_r(%d)\n", score_f, score_r);
681 #endif
682 
683 	if (do_align) { /* call global alignment to fill the path */
684 		score_g = 0;
685 		j = (end_i - start_i > end_j - start_j)? end_i - start_i : end_j - start_j;
686 		++j; /* j is the maximum band_width */
687 		for (i = ap->band_width;; i <<= 1) {
688 			AlnParam ap_real = *ap;
689 			ap_real.gap_end = -1;
690 			ap_real.band_width = i;
691 			score_g = aln_global_core(seq1 + start_i, end_i - start_i + 1, seq2 + start_j,
692 									  end_j - start_j + 1, &ap_real, path, path_len);
693 			if (score_g == score_r || score_f == score_g) break;
694 			if (i > j) break;
695 		}
696 		if (score_r > score_g && score_f > score_g)
697 			fprintf(stderr, "[aln_local_core] Cannot find reasonable band width. Continue anyway.\n");
698 		score_f = score_g;
699 
700 		/* convert coordinate */
701 		for (p = path + *path_len - 1; p >= path; --p) {
702 			p->i += start_i - 1;
703 			p->j += start_j - 1;
704 		}
705 	} else { /* just store the start and end */
706 		*path_len = 2;
707 		path[1].i = start_i; path[1].j = start_j;
708 		path->i = end_i; path->j = end_j;
709 	}
710 
711 end_func:
712 	/* free */
713 	MYFREE(eh);
714 	for (i = 0; i != N_MATRIX_ROW; ++i) {
715 		++s_array[i];
716 		MYFREE(s_array[i]);
717 	}
718 	MYFREE(s_array);
719 	return score_f;
720 }
aln_stdaln_aux(const char * seq1,const char * seq2,const AlnParam * ap,int is_global,int do_align,int len1,int len2)721 AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap,
722 					   int is_global, int do_align, int len1, int len2)
723 {
724 	unsigned char *seq11, *seq22;
725 	int score;
726 	int i, j, l;
727 	path_t *p;
728 	char *out1, *out2, *outm;
729 	AlnAln *aa;
730 
731 	if (len1 < 0) len1 = strlen(seq1);
732 	if (len2 < 0) len2 = strlen(seq2);
733 
734 	aa = aln_init_AlnAln();
735 	seq11 = (unsigned char*)MYALLOC(sizeof(unsigned char) * len1);
736 	seq22 = (unsigned char*)MYALLOC(sizeof(unsigned char) * len2);
737 	aa->path = (path_t*)MYALLOC(sizeof(path_t) * (len1 + len2 + 1));
738 
739 	if (ap->row < 10) { /* 4-nucleotide alignment */
740 		for (i = 0; i < len1; ++i)
741 			seq11[i] = aln_nt4_table[(int)seq1[i]];
742 		for (j = 0; j < len2; ++j)
743 			seq22[j] = aln_nt4_table[(int)seq2[j]];
744 	} else if (ap->row < 20) { /* 16-nucleotide alignment */
745 		for (i = 0; i < len1; ++i)
746 			seq11[i] = aln_nt16_table[(int)seq1[i]];
747 		for (j = 0; j < len2; ++j)
748 			seq22[j] = aln_nt16_table[(int)seq2[j]];
749 	} else { /* amino acids */
750 		for (i = 0; i < len1; ++i)
751 			seq11[i] = aln_aa_table[(int)seq1[i]];
752 		for (j = 0; j < len2; ++j)
753 			seq22[j] = aln_aa_table[(int)seq2[j]];
754 	}
755 
756 	if (is_global) score = aln_global_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len);
757 	else score = aln_local_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len, do_align);
758 	aa->score = score;
759 
760 	if (do_align) {
761 		out1 = aa->out1 = (char*)MYALLOC(sizeof(char) * (aa->path_len + 1));
762 		out2 = aa->out2 = (char*)MYALLOC(sizeof(char) * (aa->path_len + 1));
763 		outm = aa->outm = (char*)MYALLOC(sizeof(char) * (aa->path_len + 1));
764 
765 		--seq1; --seq2;
766 		--seq11; --seq22;
767 
768 		p = aa->path + aa->path_len - 1;
769 
770 		for (l = 0; p >= aa->path; --p, ++l) {
771 			switch (p->ctype) {
772 			case FROM_M: out1[l] = seq1[p->i]; out2[l] = seq2[p->j];
773 				outm[l] = (seq11[p->i] == seq22[p->j] && seq11[p->i] != ap->row)? '|' : ' ';
774 				break;
775 			case FROM_I: out1[l] = '-'; out2[l] = seq2[p->j]; outm[l] = ' '; break;
776 			case FROM_D: out1[l] = seq1[p->i]; out2[l] = '-'; outm[l] = ' '; break;
777 			}
778 		}
779 		out1[l] = out2[l] = outm[l] = '\0';
780 		++seq11; ++seq22;
781 	}
782 
783 	MYFREE(seq11);
784 	MYFREE(seq22);
785 
786 	p = aa->path + aa->path_len - 1;
787 	aa->start1 = p->i;
788 	aa->end1 = aa->path->i;
789 	aa->start2 = p->j;
790 	aa->end2 = aa->path->j;
791 
792 	return aa;
793 }
aln_stdaln(const char * seq1,const char * seq2,const AlnParam * ap,int is_global,int do_align)794 AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int is_global, int do_align)
795 {
796 	return aln_stdaln_aux(seq1, seq2, ap, is_global, do_align, -1, -1);
797 }
798