1 /*
2 * stdaln.c -- standard alignment (local and banded global alignment)
3 *
4 * Copyright (c) 2003-2006, Li Heng <liheng@genomics.org.cn>
5 * <lh3lh3@gmail.com>
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 *
21 */
22
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <string.h>
26 #include "stdaln.h"
27
28 /* char -> 17 (=16+1) nucleotides */
29 unsigned char aln_nt16_table[256] = {
30 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
31 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
32 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,16 /*'-'*/,15,15,
33 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
34 15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15,
35 15,15, 3, 6, 8,15, 7, 9, 0,12,15,15, 15,15,15,15,
36 15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15,
37 15,15, 3, 6, 8,15, 7, 9, 0,12,15,15, 15,15,15,15,
38 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
39 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
40 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
41 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
42 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
43 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
44 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
45 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
46 };
47 char *aln_nt16_rev_table = "XAGRCMSVTWKDYHBN-";
48
49 /* char -> 5 (=4+1) nucleotides */
50 unsigned char aln_nt4_table[256] = {
51 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
52 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
53 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
54 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
55 4, 0, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4,
56 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
57 4, 0, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4,
58 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
59 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
60 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
61 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
62 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
63 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
64 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
65 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
66 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
67 };
68 char *aln_nt4_rev_table = "AGCTN-";
69
70 /* char -> 22 (=20+1+1) amino acids */
71 unsigned char aln_aa_table[256] = {
72 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
73 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
74 21,21,21,21, 21,21,21,21, 21,21,20,21, 21,22 /*'-'*/,21,21,
75 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
76 21, 0,21, 4, 3, 6,13, 7, 8, 9,21,11, 10,12, 2,21,
77 14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21,
78 21, 0,21, 4, 3, 6,13, 7, 8, 9,21,11, 10,12, 2,21,
79 14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21,
80 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
81 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
82 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
83 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
84 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
85 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
86 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
87 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21
88 };
89 char *aln_aa_rev_table = "ARNDCQEGHILKMFPSTWYV*X-";
90 /* 01234567890123456789012 */
91
92 /* translation table. They are useless in stdaln.c, but when you realize you need it, you need not write the table again. */
93 unsigned char aln_trans_table_eu[66] = {
94 11,11, 2, 2, 1, 1,15,15, 16,16,16,16, 9,12, 9, 9,
95 6, 6, 3, 3, 7, 7, 7, 7, 0, 0, 0, 0, 19,19,19,19,
96 5, 5, 8, 8, 1, 1, 1, 1, 14,14,14,14, 10,10,10,10,
97 20,20,18,18, 20,17, 4, 4, 15,15,15,15, 10,10,13,13, 21, 22
98 };
99 char *aln_trans_table_eu_char = "KKNNRRSSTTTTIMIIEEDDGGGGAAAAVVVVQQHHRRRRPPPPLLLL**YY*WCCSSSSLLFFX";
100 /* 01234567890123456789012345678901234567890123456789012345678901234 */
101 int aln_sm_blosum62[] = {
102 /* A R N D C Q E G H I L K M F P S T W Y V * X */
103 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
104 -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
105 -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
106 -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
107 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
108 -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
109 -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
110 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
111 -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
112 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
113 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
114 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
115 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
116 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
117 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
118 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
119 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
120 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
121 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
122 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
123 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
124 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
125 };
126
127 int aln_sm_blosum45[] = {
128 /* A R N D C Q E G H I L K M F P S T W Y V * X */
129 5,-2,-1,-2,-1,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-2,-2, 0,-5, 0,
130 -2, 7, 0,-1,-3, 1, 0,-2, 0,-3,-2, 3,-1,-2,-2,-1,-1,-2,-1,-2,-5,-1,
131 -1, 0, 6, 2,-2, 0, 0, 0, 1,-2,-3, 0,-2,-2,-2, 1, 0,-4,-2,-3,-5,-1,
132 -2,-1, 2, 7,-3, 0, 2,-1, 0,-4,-3, 0,-3,-4,-1, 0,-1,-4,-2,-3,-5,-1,
133 -1,-3,-2,-3,12,-3,-3,-3,-3,-3,-2,-3,-2,-2,-4,-1,-1,-5,-3,-1,-5,-2,
134 -1, 1, 0, 0,-3, 6, 2,-2, 1,-2,-2, 1, 0,-4,-1, 0,-1,-2,-1,-3,-5,-1,
135 -1, 0, 0, 2,-3, 2, 6,-2, 0,-3,-2, 1,-2,-3, 0, 0,-1,-3,-2,-3,-5,-1,
136 0,-2, 0,-1,-3,-2,-2, 7,-2,-4,-3,-2,-2,-3,-2, 0,-2,-2,-3,-3,-5,-1,
137 -2, 0, 1, 0,-3, 1, 0,-2,10,-3,-2,-1, 0,-2,-2,-1,-2,-3, 2,-3,-5,-1,
138 -1,-3,-2,-4,-3,-2,-3,-4,-3, 5, 2,-3, 2, 0,-2,-2,-1,-2, 0, 3,-5,-1,
139 -1,-2,-3,-3,-2,-2,-2,-3,-2, 2, 5,-3, 2, 1,-3,-3,-1,-2, 0, 1,-5,-1,
140 -1, 3, 0, 0,-3, 1, 1,-2,-1,-3,-3, 5,-1,-3,-1,-1,-1,-2,-1,-2,-5,-1,
141 -1,-1,-2,-3,-2, 0,-2,-2, 0, 2, 2,-1, 6, 0,-2,-2,-1,-2, 0, 1,-5,-1,
142 -2,-2,-2,-4,-2,-4,-3,-3,-2, 0, 1,-3, 0, 8,-3,-2,-1, 1, 3, 0,-5,-1,
143 -1,-2,-2,-1,-4,-1, 0,-2,-2,-2,-3,-1,-2,-3, 9,-1,-1,-3,-3,-3,-5,-1,
144 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-3,-1,-2,-2,-1, 4, 2,-4,-2,-1,-5, 0,
145 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-1,-1, 2, 5,-3,-1, 0,-5, 0,
146 -2,-2,-4,-4,-5,-2,-3,-2,-3,-2,-2,-2,-2, 1,-3,-4,-3,15, 3,-3,-5,-2,
147 -2,-1,-2,-2,-3,-1,-2,-3, 2, 0, 0,-1, 0, 3,-3,-2,-1, 3, 8,-1,-5,-1,
148 0,-2,-3,-3,-1,-3,-3,-3,-3, 3, 1,-2, 1, 0,-3,-1, 0,-3,-1, 5,-5,-1,
149 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5, 1,-5,
150 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0,-2,-1,-1,-5,-1
151 };
152
153 int aln_sm_nt[] = {
154 /* X A G R C M S V T W K D Y H B N */
155 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
156 -2, 2,-1, 1,-2, 1,-2, 0,-2, 1,-2, 0,-2, 0,-2, 0,
157 -2,-1, 2, 1,-2,-2, 1, 0,-2,-2, 1, 0,-2,-2, 0, 0,
158 -2, 1, 1, 1,-2,-1,-1, 0,-2,-1,-1, 0,-2, 0, 0, 0,
159 -2,-2,-2,-2, 2, 1, 1, 0,-1,-2,-2,-2, 1, 0, 0, 0,
160 -2, 1,-2,-1, 1, 1,-1, 0,-2,-1,-2, 0,-1, 0, 0, 0,
161 -2,-2, 1,-1, 1,-1, 1, 0,-2,-2,-1, 0,-1, 0, 0, 0,
162 -2, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0,
163 -2,-2,-2,-2,-1,-2,-2,-2, 2, 1, 1, 0, 1, 0, 0, 0,
164 -2, 1,-2,-1,-2,-1,-2, 0, 1, 1,-1, 0,-1, 0, 0, 0,
165 -2,-2, 1,-1,-2,-2,-1, 0, 1,-1, 1, 0,-1, 0, 0, 0,
166 -2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
167 -2,-2,-2,-2, 1,-1,-1, 0, 1,-1,-1, 0, 1, 0, 0, 0,
168 -2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
169 -2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
170 -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
171 };
172
173 int aln_sm_read[] = {
174 /* X A G R C M S V T W K D Y H B N */
175 -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,
176 -17, 2,-17, 1,-17, 1,-17, 0,-17, 1,-17, 0,-17, 0,-17, 0,
177 -17,-17, 2, 1,-17,-17, 1, 0,-17,-17, 1, 0,-17,-17, 0, 0,
178 -17, 1, 1, 1,-17,-17,-17, 0,-17,-17,-17, 0,-17, 0, 0, 0,
179 -17,-17,-17,-17, 2, 1, 1, 0,-17,-17,-17,-17, 1, 0, 0, 0,
180 -17, 1,-17,-17, 1, 1,-17, 0,-17,-17,-17, 0,-17, 0, 0, 0,
181 -17,-17, 1,-17, 1,-17, 1, 0,-17,-17,-17, 0,-17, 0, 0, 0,
182 -17, 0, 0, 0, 0, 0, 0, 0,-17, 0, 0, 0, 0, 0, 0, 0,
183 -17,-17,-17,-17,-17,-17,-17,-17, 2, 1, 1, 0, 1, 0, 0, 0,
184 -17, 1,-17,-17,-17,-17,-17, 0, 1, 1,-17, 0,-17, 0, 0, 0,
185 -17,-17, 1,-17,-17,-17,-17, 0, 1,-17, 1, 0,-17, 0, 0, 0,
186 -17, 0, 0, 0,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
187 -17,-17,-17,-17, 1,-17,-17, 0, 1,-17,-17, 0, 1, 0, 0, 0,
188 -17, 0,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189 -17,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190 -17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
191 };
192
193 int aln_sm_hs[] = {
194 /* A G C T N */
195 91, -31,-114,-123, -44,
196 -31, 100,-125,-114, -42,
197 -123,-125, 100, -31, -42,
198 -114,-114, -31, 91, -42,
199 -44, -42, -42, -42, -43
200 };
201
202 /********************/
203 /* START OF align.c */
204 /********************/
205
206 AlnParam aln_param_nt2nt = { 10, 2, 2, aln_sm_nt, 16, 75 };
207 AlnParam aln_param_rd2rd = { 20, 19, 19, aln_sm_read, 16, 75 };
208 AlnParam aln_param_aa2aa = { 12, 2, 2, aln_sm_blosum62, 22, 50 };
209
aln_init_AlnAln()210 AlnAln *aln_init_AlnAln()
211 {
212 AlnAln *aa;
213 aa = (AlnAln*)MYALLOC(sizeof(AlnAln));
214 aa->path = 0;
215 aa->out1 = aa->out2 = aa->outm = 0;
216 aa->path_len = 0;
217 return aa;
218 }
aln_free_AlnAln(AlnAln * aa)219 void aln_free_AlnAln(AlnAln *aa)
220 {
221 MYFREE(aa->path);
222 MYFREE(aa->out1);
223 MYFREE(aa->out2);
224 MYFREE(aa->outm);
225 MYFREE(aa);
226 }
227
228 /***************************/
229 /* START OF common_align.c */
230 /***************************/
231
232 #define LOCAL_OVERFLOW_THRESHOLD 32000
233 #define LOCAL_OVERFLOW_REDUCE 16000
234 #define NT_LOCAL_SCORE int
235 #define NT_LOCAL_SHIFT 16
236 #define NT_LOCAL_MASK 0xffff
237
238 #define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
239
240 #define set_M(MM, cur, p, sc) \
241 { \
242 if ((p)->M >= (p)->I) { \
243 if ((p)->M >= (p)->D) { \
244 (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \
245 } else { \
246 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
247 } \
248 } else { \
249 if ((p)->I > (p)->D) { \
250 (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \
251 } else { \
252 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
253 } \
254 } \
255 }
256 #define set_I(II, cur, p) \
257 { \
258 if ((p)->M - gap_open > (p)->I - gap_ext) { \
259 (cur)->It = FROM_M; \
260 (II) = (p)->M - gap_open; \
261 } else { \
262 (cur)->It = FROM_I; \
263 (II) = (p)->I - gap_ext; \
264 } \
265 }
266 #define set_end_I(II, cur, p) \
267 { \
268 if (gap_end >= 0) { \
269 if ((p)->M > (p)->I) { \
270 (cur)->It = FROM_M; \
271 (II) = (p)->M - gap_end; \
272 } else { \
273 (cur)->It = FROM_I; \
274 (II) = (p)->I - gap_end; \
275 } \
276 } else set_I(II, cur, p); \
277 }
278 #define set_D(DD, cur, p) \
279 { \
280 if ((p)->M - gap_open > (p)->D - gap_ext) { \
281 (cur)->Dt = FROM_M; \
282 (DD) = (p)->M - gap_open; \
283 } else { \
284 (cur)->Dt = FROM_D; \
285 (DD) = (p)->D - gap_ext; \
286 } \
287 }
288 #define set_end_D(DD, cur, p) \
289 { \
290 if (gap_end >= 0) { \
291 if ((p)->M > (p)->D) { \
292 (cur)->Dt = FROM_M; \
293 (DD) = (p)->M - gap_end; \
294 } else { \
295 (cur)->Dt = FROM_D; \
296 (DD) = (p)->D - gap_end; \
297 } \
298 } else set_D(DD, cur, p); \
299 }
300
301 typedef struct
302 {
303 unsigned char Mt:3, It:2, Dt:2;
304 } dpcell_t;
305
306 typedef struct
307 {
308 int M, I, D;
309 } dpscore_t;
310
311 /* build score profile for accelerating alignment, in theory */
aln_init_score_array(unsigned char * seq,int len,int row,int * score_matrix,int ** s_array)312 void aln_init_score_array(unsigned char *seq, int len, int row, int *score_matrix, int **s_array)
313 {
314 int *tmp, *tmp2, i, k;
315 for (i = 0; i != row; ++i) {
316 tmp = score_matrix + i * row;
317 tmp2 = s_array[i];
318 for (k = 0; k != len; ++k)
319 tmp2[k] = tmp[seq[k]];
320 }
321 }
322 /***************************
323 * banded global alignment *
324 ***************************/
aln_global_core(unsigned char * seq1,int len1,unsigned char * seq2,int len2,const AlnParam * ap,path_t * path,int * path_len)325 int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
326 path_t *path, int *path_len)
327 {
328 register int i, j;
329 dpcell_t **dpcell, *q;
330 dpscore_t *curr, *last, *s;
331 path_t *p;
332 int b1, b2, tmp_end;
333 int *mat, end, max;
334 unsigned char type, ctype;
335
336 int gap_open, gap_ext, gap_end, b;
337 int *score_matrix, N_MATRIX_ROW;
338
339 /* initialize some align-related parameters. just for compatibility */
340 gap_open = ap->gap_open;
341 gap_ext = ap->gap_ext;
342 gap_end = ap->gap_end;
343 b = ap->band_width;
344 score_matrix = ap->matrix;
345 N_MATRIX_ROW = ap->row;
346
347 if (len1 == 0 || len2 == 0) {
348 *path_len = 0;
349 return 0;
350 }
351 /* calculate b1 and b2 */
352 if (len1 > len2) {
353 b1 = len1 - len2 + b;
354 b2 = b;
355 } else {
356 b1 = b;
357 b2 = len2 - len1 + b;
358 }
359 if (b1 > len1) b1 = len1;
360 if (b2 > len2) b2 = len2;
361 --seq1; --seq2;
362
363 /* allocate memory */
364 end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
365 dpcell = (dpcell_t**)MYALLOC(sizeof(dpcell_t*) * (len2 + 1));
366 for (j = 0; j <= len2; ++j)
367 dpcell[j] = (dpcell_t*)MYALLOC(sizeof(dpcell_t) * end);
368 for (j = b2 + 1; j <= len2; ++j)
369 dpcell[j] -= j - b2;
370 curr = (dpscore_t*)MYALLOC(sizeof(dpscore_t) * (len1 + 1));
371 last = (dpscore_t*)MYALLOC(sizeof(dpscore_t) * (len1 + 1));
372
373 /* set first row */
374 SET_INF(*curr); curr->M = 0;
375 for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
376 SET_INF(*s);
377 set_end_D(s->D, dpcell[0] + i, s - 1);
378 }
379 s = curr; curr = last; last = s;
380
381 /* core dynamic programming, part 1 */
382 tmp_end = (b2 < len2)? b2 : len2 - 1;
383 for (j = 1; j <= tmp_end; ++j) {
384 q = dpcell[j]; s = curr; SET_INF(*s);
385 set_end_I(s->I, q, last);
386 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
387 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
388 ++s; ++q;
389 for (i = 1; i != end; ++i, ++s, ++q) {
390 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
391 set_I(s->I, q, last + i);
392 set_D(s->D, q, s - 1);
393 }
394 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
395 set_D(s->D, q, s - 1);
396 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
397 set_end_I(s->I, q, last + i);
398 } else s->I = MINOR_INF;
399 s = curr; curr = last; last = s;
400 }
401 /* last row for part 1, use set_end_D() instead of set_D() */
402 if (j == len2 && b2 != len2 - 1) {
403 q = dpcell[j]; s = curr; SET_INF(*s);
404 set_end_I(s->I, q, last);
405 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
406 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
407 ++s; ++q;
408 for (i = 1; i != end; ++i, ++s, ++q) {
409 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
410 set_I(s->I, q, last + i);
411 set_end_D(s->D, q, s - 1);
412 }
413 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
414 set_end_D(s->D, q, s - 1);
415 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
416 set_end_I(s->I, q, last + i);
417 } else s->I = MINOR_INF;
418 s = curr; curr = last; last = s;
419 ++j;
420 }
421
422 /* core dynamic programming, part 2 */
423 for (; j <= len2 - b2 + 1; ++j) {
424 SET_INF(curr[j - b2]);
425 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
426 end = j + b1 - 1;
427 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
428 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
429 set_I(s->I, q, last + i);
430 set_D(s->D, q, s - 1);
431 }
432 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
433 set_D(s->D, q, s - 1);
434 s->I = MINOR_INF;
435 s = curr; curr = last; last = s;
436 }
437
438 /* core dynamic programming, part 3 */
439 for (; j < len2; ++j) {
440 SET_INF(curr[j - b2]);
441 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
442 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
443 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
444 set_I(s->I, q, last + i);
445 set_D(s->D, q, s - 1);
446 }
447 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
448 set_end_I(s->I, q, last + i);
449 set_D(s->D, q, s - 1);
450 s = curr; curr = last; last = s;
451 }
452 /* last row */
453 if (j == len2) {
454 SET_INF(curr[j - b2]);
455 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
456 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
457 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
458 set_I(s->I, q, last + i);
459 set_end_D(s->D, q, s - 1);
460 }
461 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
462 set_end_I(s->I, q, last + i);
463 set_end_D(s->D, q, s - 1);
464 s = curr; curr = last; last = s;
465 }
466
467 /* backtrace */
468 i = len1; j = len2;
469 q = dpcell[j] + i;
470 s = last + len1;
471 max = s->M; type = q->Mt; ctype = FROM_M;
472 if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
473 if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
474
475 p = path;
476 p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
477 ++p;
478 do {
479 switch (ctype) {
480 case FROM_M: --i; --j; break;
481 case FROM_I: --j; break;
482 case FROM_D: --i; break;
483 }
484 q = dpcell[j] + i;
485 ctype = type;
486 switch (type) {
487 case FROM_M: type = q->Mt; break;
488 case FROM_I: type = q->It; break;
489 case FROM_D: type = q->Dt; break;
490 }
491 p->ctype = ctype; p->i = i; p->j = j;
492 ++p;
493 } while (i || j);
494 *path_len = p - path - 1;
495
496 /* free memory */
497 for (j = b2 + 1; j <= len2; ++j)
498 dpcell[j] += j - b2;
499 for (j = 0; j <= len2; ++j)
500 MYFREE(dpcell[j]);
501 MYFREE(dpcell);
502 MYFREE(curr); MYFREE(last);
503
504 return max;
505 }
506 /*************************************************
507 * local alignment combined with banded strategy *
508 *************************************************/
aln_local_core(unsigned char * seq1,int len1,unsigned char * seq2,int len2,const AlnParam * ap,path_t * path,int * path_len,int do_align)509 int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
510 path_t *path, int *path_len, int do_align)
511 {
512 register NT_LOCAL_SCORE *s;
513 register int i;
514 int q, r, qr, tmp_len, qr_shift;
515 int **s_array, *score_array;
516 int e, f;
517 int is_overflow, of_base;
518 NT_LOCAL_SCORE *eh, curr_h, last_h, curr_last_h;
519 int j, start_i, start_j, end_i, end_j;
520 path_t *p;
521 int score_f, score_r, score_g;
522 int start, end, max_score;
523
524 int gap_open, gap_ext;
525 int *score_matrix, N_MATRIX_ROW;
526
527 /* initialize some align-related parameters. just for compatibility */
528 gap_open = ap->gap_open;
529 gap_ext = ap->gap_ext;
530 // b = ap->band_width;
531 score_matrix = ap->matrix;
532 N_MATRIX_ROW = ap->row;
533
534 if (len1 == 0 || len2 == 0) return -1;
535
536 /* allocate memory */
537 eh = (NT_LOCAL_SCORE*)MYALLOC(sizeof(NT_LOCAL_SCORE) * (len1 + 1));
538 s_array = (int**)MYALLOC(sizeof(int*) * N_MATRIX_ROW);
539 for (i = 0; i != N_MATRIX_ROW; ++i)
540 s_array[i] = (int*)MYALLOC(sizeof(int) * len1);
541 /* initialization */
542 aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array);
543 q = gap_open - gap_ext;
544 r = gap_ext;
545 qr = q + r;
546 qr_shift = (qr+1) << NT_LOCAL_SHIFT;
547 tmp_len = len1 + 1;
548 start_i = start_j = end_i = end_j = 0;
549 for (i = 0, max_score = 0; i != N_MATRIX_ROW * N_MATRIX_ROW; ++i)
550 if (max_score < score_matrix[i]) max_score = score_matrix[i];
551 /* convert the coordinate */
552 --seq1; --seq2;
553 for (i = 0; i != N_MATRIX_ROW; ++i) --s_array[i];
554
555 /* forward dynamic programming */
556 for (i = 0, s = eh; i != tmp_len; ++i, ++s) *s = 0;
557 score_f = 0;
558 is_overflow = of_base = 0;
559 for (j = 1; j <= len2; ++j) {
560 last_h = f = 0;
561 score_array = s_array[seq2[j]];
562 if (is_overflow) { /* adjust eh[] array if overflow occurs. */
563 /* If LOCAL_OVERFLOW_REDUCE is too small, optimal alignment might be missed.
564 * If it is too large, this block will be excuted frequently and therefore
565 * slow down the whole program.
566 * Acually, smaller LOCAL_OVERFLOW_REDUCE might also help to reduce the
567 * number of assignments because it sets some cells to zero when overflow
568 * happens. */
569 int tmp, tmp2;
570 score_f -= LOCAL_OVERFLOW_REDUCE;
571 of_base += LOCAL_OVERFLOW_REDUCE;
572 is_overflow = 0;
573 for (i = 1, s = eh; i <= tmp_len; ++i, ++s) {
574 tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK;
575 if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0;
576 else tmp2 -= LOCAL_OVERFLOW_REDUCE;
577 if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0;
578 else tmp -= LOCAL_OVERFLOW_REDUCE;
579 *s = (tmp << NT_LOCAL_SHIFT) | tmp2;
580 }
581 }
582 for (i = 1, s = eh; i != tmp_len; ++i, ++s) {
583 /* prepare for calculate current h */
584 curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i];
585 if (curr_h < 0) curr_h = 0;
586 if (last_h > qr) { /* initialize f */
587 f = (f > last_h - q)? f - r : last_h - qr;
588 if (curr_h < f) curr_h = f;
589 }
590 if (*(s+1) >= qr_shift) { /* initialize e */
591 curr_last_h = *(s+1) >> NT_LOCAL_SHIFT;
592 e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr;
593 if (curr_h < e) curr_h = e;
594 *s = (last_h << NT_LOCAL_SHIFT) | e;
595 } else *s = last_h << NT_LOCAL_SHIFT; /* e = 0 */
596 last_h = curr_h;
597 if (score_f < curr_h) {
598 score_f = curr_h; end_i = i; end_j = j;
599 if (score_f > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1;
600 }
601 }
602 *s = last_h << NT_LOCAL_SHIFT;
603 }
604 score_f += of_base;
605
606 if (path == 0) goto end_func; /* skip path-filling */
607
608 /* reverse dynamic programming */
609 for (i = end_i, s = eh + end_i; i >= 0; --i, --s) *s = 0;
610 if (end_i == 0 || end_j == 0) goto end_func; /* no local match */
611 score_r = score_matrix[seq1[end_i] * N_MATRIX_ROW + seq2[end_j]];
612 is_overflow = of_base = 0;
613 start_i = end_i; start_j = end_j;
614 eh[end_i] = ((NT_LOCAL_SCORE)(qr + score_r)) << NT_LOCAL_SHIFT; /* in order to initialize f and e, 040408 */
615 start = end_i - 1;
616 end = end_i - 3;
617 if (end <= 0) end = 0;
618
619 /* second pass DP can be done in a band, speed will thus be enhanced */
620 for (j = end_j - 1; j != 0; --j) {
621 last_h = f = 0;
622 score_array = s_array[seq2[j]];
623 if (is_overflow) { /* adjust eh[] array if overflow occurs. */
624 int tmp, tmp2;
625 score_r -= LOCAL_OVERFLOW_REDUCE;
626 of_base += LOCAL_OVERFLOW_REDUCE;
627 is_overflow = 0;
628 for (i = start, s = eh + start + 1; i >= end; --i, --s) {
629 tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK;
630 if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0;
631 else tmp2 -= LOCAL_OVERFLOW_REDUCE;
632 if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0;
633 else tmp -= LOCAL_OVERFLOW_REDUCE;
634 *s = (tmp << NT_LOCAL_SHIFT) | tmp2;
635 }
636 }
637 for (i = start, s = eh + start + 1; i != end; --i, --s) {
638 /* prepare for calculate current h */
639 curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i];
640 if (curr_h < 0) curr_h = 0;
641 if (last_h > qr) { /* initialize f */
642 f = (f > last_h - q)? f - r : last_h - qr;
643 if (curr_h < f) curr_h = f;
644 }
645 if (*(s-1) >= qr_shift) { /* initialize e */
646 curr_last_h = *(s-1) >> NT_LOCAL_SHIFT;
647 e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr;
648 if (curr_h < e) curr_h = e;
649 *s = (last_h << NT_LOCAL_SHIFT) | e;
650 } else *s = last_h << NT_LOCAL_SHIFT; /* e = 0 */
651 last_h = curr_h;
652 if (score_r < curr_h) {
653 score_r = curr_h; start_i = i; start_j = j;
654 if (score_r + of_base - qr == score_f) {
655 j = 1; break;
656 }
657 if (score_r > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1;
658 }
659 }
660 *s = last_h << NT_LOCAL_SHIFT;
661 /* recalculate start and end, the boundaries of the band */
662 if ((eh[start] >> NT_LOCAL_SHIFT) <= qr) --start;
663 if (start <= 0) start = 0;
664 end = start_i - (start_j - j) - (score_r + of_base + (start_j - j) * max_score) / r - 1;
665 if (end <= 0) end = 0;
666 }
667
668 if (path_len == 0) {
669 path[0].i = start_i; path[0].j = start_j;
670 path[1].i = end_i; path[1].j = end_j;
671 goto end_func;
672 }
673
674 score_r += of_base;
675 score_r -= qr;
676
677 #ifdef DEBUG
678 /* this seems not a bug */
679 if (score_f != score_r)
680 fprintf(stderr, "[aln_local_core] unknown flaw occurs: score_f(%d) != score_r(%d)\n", score_f, score_r);
681 #endif
682
683 if (do_align) { /* call global alignment to fill the path */
684 score_g = 0;
685 j = (end_i - start_i > end_j - start_j)? end_i - start_i : end_j - start_j;
686 ++j; /* j is the maximum band_width */
687 for (i = ap->band_width;; i <<= 1) {
688 AlnParam ap_real = *ap;
689 ap_real.gap_end = -1;
690 ap_real.band_width = i;
691 score_g = aln_global_core(seq1 + start_i, end_i - start_i + 1, seq2 + start_j,
692 end_j - start_j + 1, &ap_real, path, path_len);
693 if (score_g == score_r || score_f == score_g) break;
694 if (i > j) break;
695 }
696 if (score_r > score_g && score_f > score_g)
697 fprintf(stderr, "[aln_local_core] Cannot find reasonable band width. Continue anyway.\n");
698 score_f = score_g;
699
700 /* convert coordinate */
701 for (p = path + *path_len - 1; p >= path; --p) {
702 p->i += start_i - 1;
703 p->j += start_j - 1;
704 }
705 } else { /* just store the start and end */
706 *path_len = 2;
707 path[1].i = start_i; path[1].j = start_j;
708 path->i = end_i; path->j = end_j;
709 }
710
711 end_func:
712 /* free */
713 MYFREE(eh);
714 for (i = 0; i != N_MATRIX_ROW; ++i) {
715 ++s_array[i];
716 MYFREE(s_array[i]);
717 }
718 MYFREE(s_array);
719 return score_f;
720 }
aln_stdaln_aux(const char * seq1,const char * seq2,const AlnParam * ap,int is_global,int do_align,int len1,int len2)721 AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap,
722 int is_global, int do_align, int len1, int len2)
723 {
724 unsigned char *seq11, *seq22;
725 int score;
726 int i, j, l;
727 path_t *p;
728 char *out1, *out2, *outm;
729 AlnAln *aa;
730
731 if (len1 < 0) len1 = strlen(seq1);
732 if (len2 < 0) len2 = strlen(seq2);
733
734 aa = aln_init_AlnAln();
735 seq11 = (unsigned char*)MYALLOC(sizeof(unsigned char) * len1);
736 seq22 = (unsigned char*)MYALLOC(sizeof(unsigned char) * len2);
737 aa->path = (path_t*)MYALLOC(sizeof(path_t) * (len1 + len2 + 1));
738
739 if (ap->row < 10) { /* 4-nucleotide alignment */
740 for (i = 0; i < len1; ++i)
741 seq11[i] = aln_nt4_table[(int)seq1[i]];
742 for (j = 0; j < len2; ++j)
743 seq22[j] = aln_nt4_table[(int)seq2[j]];
744 } else if (ap->row < 20) { /* 16-nucleotide alignment */
745 for (i = 0; i < len1; ++i)
746 seq11[i] = aln_nt16_table[(int)seq1[i]];
747 for (j = 0; j < len2; ++j)
748 seq22[j] = aln_nt16_table[(int)seq2[j]];
749 } else { /* amino acids */
750 for (i = 0; i < len1; ++i)
751 seq11[i] = aln_aa_table[(int)seq1[i]];
752 for (j = 0; j < len2; ++j)
753 seq22[j] = aln_aa_table[(int)seq2[j]];
754 }
755
756 if (is_global) score = aln_global_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len);
757 else score = aln_local_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len, do_align);
758 aa->score = score;
759
760 if (do_align) {
761 out1 = aa->out1 = (char*)MYALLOC(sizeof(char) * (aa->path_len + 1));
762 out2 = aa->out2 = (char*)MYALLOC(sizeof(char) * (aa->path_len + 1));
763 outm = aa->outm = (char*)MYALLOC(sizeof(char) * (aa->path_len + 1));
764
765 --seq1; --seq2;
766 --seq11; --seq22;
767
768 p = aa->path + aa->path_len - 1;
769
770 for (l = 0; p >= aa->path; --p, ++l) {
771 switch (p->ctype) {
772 case FROM_M: out1[l] = seq1[p->i]; out2[l] = seq2[p->j];
773 outm[l] = (seq11[p->i] == seq22[p->j] && seq11[p->i] != ap->row)? '|' : ' ';
774 break;
775 case FROM_I: out1[l] = '-'; out2[l] = seq2[p->j]; outm[l] = ' '; break;
776 case FROM_D: out1[l] = seq1[p->i]; out2[l] = '-'; outm[l] = ' '; break;
777 }
778 }
779 out1[l] = out2[l] = outm[l] = '\0';
780 ++seq11; ++seq22;
781 }
782
783 MYFREE(seq11);
784 MYFREE(seq22);
785
786 p = aa->path + aa->path_len - 1;
787 aa->start1 = p->i;
788 aa->end1 = aa->path->i;
789 aa->start2 = p->j;
790 aa->end2 = aa->path->j;
791
792 return aa;
793 }
aln_stdaln(const char * seq1,const char * seq2,const AlnParam * ap,int is_global,int do_align)794 AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int is_global, int do_align)
795 {
796 return aln_stdaln_aux(seq1, seq2, ap, is_global, do_align, -1, -1);
797 }
798