1 /* The MIT License
2
3 Copyright (c) 2011 by Attractive Chaos <attractor@live.co.uk>
4
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
12
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE.
24 */
25
26 #include <stdlib.h>
27 #include <stdint.h>
28 #include <assert.h>
29 // #include <emmintrin.h>
30 #define SIMDE_ENABLE_NATIVE_ALIASES
31 #include <simde/x86/sse2.h>
32 #include "ksw.h"
33
34 #ifdef USE_MALLOC_WRAPPERS
35 # include "malloc_wrap.h"
36 #endif
37
38 #ifdef __GNUC__
39 #define LIKELY(x) __builtin_expect((x),1)
40 #define UNLIKELY(x) __builtin_expect((x),0)
41 #else
42 #define LIKELY(x) (x)
43 #define UNLIKELY(x) (x)
44 #endif
45
46 const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
47
48 struct _kswq_t {
49 int qlen, slen;
50 uint8_t shift, mdiff, max, size;
51 __m128i *qp, *H0, *H1, *E, *Hmax;
52 };
53
54 /**
55 * Initialize the query data structure
56 *
57 * @param size Number of bytes used to store a score; valid valures are 1 or 2
58 * @param qlen Length of the query sequence
59 * @param query Query sequence
60 * @param m Size of the alphabet
61 * @param mat Scoring matrix in a one-dimension array
62 *
63 * @return Query data structure
64 */
ksw_qinit(int size,int qlen,const uint8_t * query,int m,const int8_t * mat)65 kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
66 {
67 kswq_t *q;
68 int slen, a, tmp, p;
69
70 size = size > 1? 2 : 1;
71 p = 8 * (3 - size); // # values per __m128i
72 slen = (qlen + p - 1) / p; // segmented length
73 q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
74 q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
75 q->H0 = q->qp + slen * m;
76 q->H1 = q->H0 + slen;
77 q->E = q->H1 + slen;
78 q->Hmax = q->E + slen;
79 q->slen = slen; q->qlen = qlen; q->size = size;
80 // compute shift
81 tmp = m * m;
82 for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
83 if (mat[a] < (int8_t)q->shift) q->shift = mat[a];
84 if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a];
85 }
86 q->max = q->mdiff;
87 q->shift = 256 - q->shift; // NB: q->shift is uint8_t
88 q->mdiff += q->shift; // this is the difference between the min and max scores
89 // An example: p=8, qlen=19, slen=3 and segmentation:
90 // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
91 if (size == 1) {
92 int8_t *t = (int8_t*)q->qp;
93 for (a = 0; a < m; ++a) {
94 int i, k, nlen = slen * p;
95 const int8_t *ma = mat + a * m;
96 for (i = 0; i < slen; ++i)
97 for (k = i; k < nlen; k += slen) // p iterations
98 *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift;
99 }
100 } else {
101 int16_t *t = (int16_t*)q->qp;
102 for (a = 0; a < m; ++a) {
103 int i, k, nlen = slen * p;
104 const int8_t *ma = mat + a * m;
105 for (i = 0; i < slen; ++i)
106 for (k = i; k < nlen; k += slen) // p iterations
107 *t++ = (k >= qlen? 0 : ma[query[k]]);
108 }
109 }
110 return q;
111 }
112
ksw_u8(kswq_t * q,int tlen,const uint8_t * target,int _o_del,int _e_del,int _o_ins,int _e_ins,int xtra)113 kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
114 {
115 int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
116 uint64_t *b;
117 __m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax;
118 kswr_t r;
119
120 #define __max_16(ret, xx) do { \
121 (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
122 (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
123 (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \
124 (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \
125 (ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
126 } while (0)
127
128 // initialization
129 r = g_defr;
130 minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
131 endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
132 m_b = n_b = 0; b = 0;
133 zero = _mm_set1_epi32(0);
134 oe_del = _mm_set1_epi8(_o_del + _e_del);
135 e_del = _mm_set1_epi8(_e_del);
136 oe_ins = _mm_set1_epi8(_o_ins + _e_ins);
137 e_ins = _mm_set1_epi8(_e_ins);
138 shift = _mm_set1_epi8(q->shift);
139 H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
140 slen = q->slen;
141 for (i = 0; i < slen; ++i) {
142 _mm_store_si128(E + i, zero);
143 _mm_store_si128(H0 + i, zero);
144 _mm_store_si128(Hmax + i, zero);
145 }
146 // the core loop
147 for (i = 0; i < tlen; ++i) {
148 int j, k, cmp, imax;
149 __m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
150 h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
151 h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
152 for (j = 0; LIKELY(j < slen); ++j) {
153 /* SW cells are computed in the following order:
154 * H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
155 * E(i+1,j) = max{H(i,j)-q, E(i,j)-r}
156 * F(i,j+1) = max{H(i,j)-q, F(i,j)-r}
157 */
158 // compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
159 h = _mm_adds_epu8(h, _mm_load_si128(S + j));
160 h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j)
161 e = _mm_load_si128(E + j); // e=E'(i,j)
162 h = _mm_max_epu8(h, e);
163 h = _mm_max_epu8(h, f); // h=H'(i,j)
164 max = _mm_max_epu8(max, h); // set max
165 _mm_store_si128(H1 + j, h); // save to H'(i,j)
166 // now compute E'(i+1,j)
167 e = _mm_subs_epu8(e, e_del); // e=E'(i,j) - e_del
168 t = _mm_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del
169 e = _mm_max_epu8(e, t); // e=E'(i+1,j)
170 _mm_store_si128(E + j, e); // save to E'(i+1,j)
171 // now compute F'(i,j+1)
172 f = _mm_subs_epu8(f, e_ins);
173 t = _mm_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins
174 f = _mm_max_epu8(f, t);
175 // get H'(i-1,j) and prepare for the next j
176 h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
177 }
178 // NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion
179 for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
180 f = _mm_slli_si128(f, 1);
181 for (j = 0; LIKELY(j < slen); ++j) {
182 h = _mm_load_si128(H1 + j);
183 h = _mm_max_epu8(h, f); // h=H'(i,j)
184 _mm_store_si128(H1 + j, h);
185 h = _mm_subs_epu8(h, oe_ins);
186 f = _mm_subs_epu8(f, e_ins);
187 cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero));
188 if (UNLIKELY(cmp == 0xffff)) goto end_loop16;
189 }
190 }
191 end_loop16:
192 //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n");
193 __max_16(imax, max); // imax is the maximum number in max
194 if (imax >= minsc) { // write the b array; this condition adds branching unfornately
195 if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append
196 if (n_b == m_b) {
197 m_b = m_b? m_b<<1 : 8;
198 b = (uint64_t*)realloc(b, 8 * m_b);
199 }
200 b[n_b++] = (uint64_t)imax<<32 | i;
201 } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
202 }
203 if (imax > gmax) {
204 gmax = imax; te = i; // te is the end position on the target
205 for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
206 _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
207 if (gmax + q->shift >= 255 || gmax >= endsc) break;
208 }
209 S = H1; H1 = H0; H0 = S; // swap H0 and H1
210 }
211 r.score = gmax + q->shift < 255? gmax : 255;
212 r.te = te;
213 if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score
214 int max = -1, tmp, low, high, qlen = slen * 16;
215 uint8_t *t = (uint8_t*)Hmax;
216 for (i = 0; i < qlen; ++i, ++t)
217 if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen;
218 else if ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp;
219 //printf("%d,%d\n", max, gmax);
220 if (b) {
221 i = (r.score + q->max - 1) / q->max;
222 low = te - i; high = te + i;
223 for (i = 0; i < n_b; ++i) {
224 int e = (int32_t)b[i];
225 if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
226 r.score2 = b[i]>>32, r.te2 = e;
227 }
228 }
229 }
230 free(b);
231 return r;
232 }
233
ksw_i16(kswq_t * q,int tlen,const uint8_t * target,int _o_del,int _e_del,int _o_ins,int _e_ins,int xtra)234 kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
235 {
236 int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
237 uint64_t *b;
238 __m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax;
239 kswr_t r;
240
241 #define __max_8(ret, xx) do { \
242 (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
243 (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
244 (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
245 (ret) = _mm_extract_epi16((xx), 0); \
246 } while (0)
247
248 // initialization
249 r = g_defr;
250 minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
251 endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
252 m_b = n_b = 0; b = 0;
253 zero = _mm_set1_epi32(0);
254 oe_del = _mm_set1_epi16(_o_del + _e_del);
255 e_del = _mm_set1_epi16(_e_del);
256 oe_ins = _mm_set1_epi16(_o_ins + _e_ins);
257 e_ins = _mm_set1_epi16(_e_ins);
258 H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
259 slen = q->slen;
260 for (i = 0; i < slen; ++i) {
261 _mm_store_si128(E + i, zero);
262 _mm_store_si128(H0 + i, zero);
263 _mm_store_si128(Hmax + i, zero);
264 }
265 // the core loop
266 for (i = 0; i < tlen; ++i) {
267 int j, k, imax;
268 __m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
269 h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
270 h = _mm_slli_si128(h, 2);
271 for (j = 0; LIKELY(j < slen); ++j) {
272 h = _mm_adds_epi16(h, *S++);
273 e = _mm_load_si128(E + j);
274 h = _mm_max_epi16(h, e);
275 h = _mm_max_epi16(h, f);
276 max = _mm_max_epi16(max, h);
277 _mm_store_si128(H1 + j, h);
278 e = _mm_subs_epu16(e, e_del);
279 t = _mm_subs_epu16(h, oe_del);
280 e = _mm_max_epi16(e, t);
281 _mm_store_si128(E + j, e);
282 f = _mm_subs_epu16(f, e_ins);
283 t = _mm_subs_epu16(h, oe_ins);
284 f = _mm_max_epi16(f, t);
285 h = _mm_load_si128(H0 + j);
286 }
287 for (k = 0; LIKELY(k < 16); ++k) {
288 f = _mm_slli_si128(f, 2);
289 for (j = 0; LIKELY(j < slen); ++j) {
290 h = _mm_load_si128(H1 + j);
291 h = _mm_max_epi16(h, f);
292 _mm_store_si128(H1 + j, h);
293 h = _mm_subs_epu16(h, oe_ins);
294 f = _mm_subs_epu16(f, e_ins);
295 if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
296 }
297 }
298 end_loop8:
299 __max_8(imax, max);
300 if (imax >= minsc) {
301 if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) {
302 if (n_b == m_b) {
303 m_b = m_b? m_b<<1 : 8;
304 b = (uint64_t*)realloc(b, 8 * m_b);
305 }
306 b[n_b++] = (uint64_t)imax<<32 | i;
307 } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
308 }
309 if (imax > gmax) {
310 gmax = imax; te = i;
311 for (j = 0; LIKELY(j < slen); ++j)
312 _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
313 if (gmax >= endsc) break;
314 }
315 S = H1; H1 = H0; H0 = S;
316 }
317 r.score = gmax; r.te = te;
318 {
319 int max = -1, tmp, low, high, qlen = slen * 8;
320 uint16_t *t = (uint16_t*)Hmax;
321 for (i = 0, r.qe = -1; i < qlen; ++i, ++t)
322 if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen;
323 else if ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp;
324 if (b) {
325 i = (r.score + q->max - 1) / q->max;
326 low = te - i; high = te + i;
327 for (i = 0; i < n_b; ++i) {
328 int e = (int32_t)b[i];
329 if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
330 r.score2 = b[i]>>32, r.te2 = e;
331 }
332 }
333 }
334 free(b);
335 return r;
336 }
337
revseq(int l,uint8_t * s)338 static inline void revseq(int l, uint8_t *s)
339 {
340 int i, t;
341 for (i = 0; i < l>>1; ++i)
342 t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t;
343 }
344
ksw_align2(int qlen,uint8_t * query,int tlen,uint8_t * target,int m,const int8_t * mat,int o_del,int e_del,int o_ins,int e_ins,int xtra,kswq_t ** qry)345 kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry)
346 {
347 int size;
348 kswq_t *q;
349 kswr_t r, rr;
350 kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int, int, int);
351
352 q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat);
353 if (qry && *qry == 0) *qry = q;
354 func = q->size == 2? ksw_i16 : ksw_u8;
355 size = q->size;
356 r = func(q, tlen, target, o_del, e_del, o_ins, e_ins, xtra);
357 if (qry == 0) free(q);
358 if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r;
359 revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end
360 q = ksw_qinit(size, r.qe + 1, query, m, mat);
361 rr = func(q, tlen, target, o_del, e_del, o_ins, e_ins, KSW_XSTOP | r.score);
362 revseq(r.qe + 1, query); revseq(r.te + 1, target);
363 free(q);
364 if (r.score == rr.score)
365 r.tb = r.te - rr.te, r.qb = r.qe - rr.qe;
366 return r;
367 }
368
ksw_align(int qlen,uint8_t * query,int tlen,uint8_t * target,int m,const int8_t * mat,int gapo,int gape,int xtra,kswq_t ** qry)369 kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry)
370 {
371 return ksw_align2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, xtra, qry);
372 }
373
374 /********************
375 *** SW extension ***
376 ********************/
377
378 typedef struct {
379 int32_t h, e;
380 } eh_t;
381
ksw_extend2(int qlen,const uint8_t * query,int tlen,const uint8_t * target,int m,const int8_t * mat,int o_del,int e_del,int o_ins,int e_ins,int w,int end_bonus,int zdrop,int h0,int * _qle,int * _tle,int * _gtle,int * _gscore,int * _max_off)382 int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
383 {
384 eh_t *eh; // score array
385 int8_t *qp; // query profile
386 int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
387 assert(h0 > 0);
388 // allocate memory
389 qp = malloc(qlen * m);
390 eh = calloc(qlen + 1, 8);
391 // generate the query profile
392 for (k = i = 0; k < m; ++k) {
393 const int8_t *p = &mat[k * m];
394 for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
395 }
396 // fill the first row
397 eh[0].h = h0; eh[1].h = h0 > oe_ins? h0 - oe_ins : 0;
398 for (j = 2; j <= qlen && eh[j-1].h > e_ins; ++j)
399 eh[j].h = eh[j-1].h - e_ins;
400 // adjust $w if it is too large
401 k = m * m;
402 for (i = 0, max = 0; i < k; ++i) // get the max score
403 max = max > mat[i]? max : mat[i];
404 max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.);
405 max_ins = max_ins > 1? max_ins : 1;
406 w = w < max_ins? w : max_ins;
407 max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
408 max_del = max_del > 1? max_del : 1;
409 w = w < max_del? w : max_del; // TODO: is this necessary?
410 // DP loop
411 max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
412 max_off = 0;
413 beg = 0, end = qlen;
414 for (i = 0; LIKELY(i < tlen); ++i) {
415 int t, f = 0, h1, m = 0, mj = -1;
416 int8_t *q = &qp[target[i] * qlen];
417 // apply the band and the constraint (if provided)
418 if (beg < i - w) beg = i - w;
419 if (end > i + w + 1) end = i + w + 1;
420 if (end > qlen) end = qlen;
421 // compute the first column
422 if (beg == 0) {
423 h1 = h0 - (o_del + e_del * (i + 1));
424 if (h1 < 0) h1 = 0;
425 } else h1 = 0;
426 for (j = beg; LIKELY(j < end); ++j) {
427 // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
428 // Similar to SSE2-SW, cells are computed in the following order:
429 // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
430 // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape
431 // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape
432 eh_t *p = &eh[j];
433 int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j)
434 p->h = h1; // set H(i,j-1) for the next row
435 M = M? M + q[j] : 0;// separating H and M to disallow a cigar like "100M3I3D20M"
436 h = M > e? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0
437 h = h > f? h : f;
438 h1 = h; // save H(i,j) to h1 for the next column
439 mj = m > h? mj : j; // record the position where max score is achieved
440 m = m > h? m : h; // m is stored at eh[mj+1]
441 t = M - oe_del;
442 t = t > 0? t : 0;
443 e -= e_del;
444 e = e > t? e : t; // computed E(i+1,j)
445 p->e = e; // save E(i+1,j) for the next row
446 t = M - oe_ins;
447 t = t > 0? t : 0;
448 f -= e_ins;
449 f = f > t? f : t; // computed F(i,j+1)
450 }
451 eh[end].h = h1; eh[end].e = 0;
452 if (j == qlen) {
453 max_ie = gscore > h1? max_ie : i;
454 gscore = gscore > h1? gscore : h1;
455 }
456 if (m == 0) break;
457 if (m > max) {
458 max = m, max_i = i, max_j = mj;
459 max_off = max_off > abs(mj - i)? max_off : abs(mj - i);
460 } else if (zdrop > 0) {
461 if (i - max_i > mj - max_j) {
462 if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) break;
463 } else {
464 if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) break;
465 }
466 }
467 // update beg and end for the next round
468 for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j);
469 beg = j;
470 for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j);
471 end = j + 2 < qlen? j + 2 : qlen;
472 //beg = 0; end = qlen; // uncomment this line for debugging
473 }
474 free(eh); free(qp);
475 if (_qle) *_qle = max_j + 1;
476 if (_tle) *_tle = max_i + 1;
477 if (_gtle) *_gtle = max_ie + 1;
478 if (_gscore) *_gscore = gscore;
479 if (_max_off) *_max_off = max_off;
480 return max;
481 }
482
ksw_extend(int qlen,const uint8_t * query,int tlen,const uint8_t * target,int m,const int8_t * mat,int gapo,int gape,int w,int end_bonus,int zdrop,int h0,int * qle,int * tle,int * gtle,int * gscore,int * max_off)483 int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off)
484 {
485 return ksw_extend2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, end_bonus, zdrop, h0, qle, tle, gtle, gscore, max_off);
486 }
487
488 /********************
489 * Global alignment *
490 ********************/
491
492 #define MINUS_INF -0x40000000
493
push_cigar(int * n_cigar,int * m_cigar,uint32_t * cigar,int op,int len)494 static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len)
495 {
496 if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) {
497 if (*n_cigar == *m_cigar) {
498 *m_cigar = *m_cigar? (*m_cigar)<<1 : 4;
499 cigar = realloc(cigar, (*m_cigar) << 2);
500 }
501 cigar[(*n_cigar)++] = len<<4 | op;
502 } else cigar[(*n_cigar)-1] += len<<4;
503 return cigar;
504 }
505
ksw_global2(int qlen,const uint8_t * query,int tlen,const uint8_t * target,int m,const int8_t * mat,int o_del,int e_del,int o_ins,int e_ins,int w,int * n_cigar_,uint32_t ** cigar_)506 int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar_, uint32_t **cigar_)
507 {
508 eh_t *eh;
509 int8_t *qp; // query profile
510 int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, score, n_col;
511 uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex
512 if (n_cigar_) *n_cigar_ = 0;
513 // allocate memory
514 n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix
515 z = n_cigar_ && cigar_? malloc((long)n_col * tlen) : 0;
516 qp = malloc(qlen * m);
517 eh = calloc(qlen + 1, 8);
518 // generate the query profile
519 for (k = i = 0; k < m; ++k) {
520 const int8_t *p = &mat[k * m];
521 for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
522 }
523 // fill the first row
524 eh[0].h = 0; eh[0].e = MINUS_INF;
525 for (j = 1; j <= qlen && j <= w; ++j)
526 eh[j].h = -(o_ins + e_ins * j), eh[j].e = MINUS_INF;
527 for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band
528 // DP loop
529 for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop
530 int32_t f = MINUS_INF, h1, beg, end, t;
531 int8_t *q = &qp[target[i] * qlen];
532 beg = i > w? i - w : 0;
533 end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence
534 h1 = beg == 0? -(o_del + e_del * (i + 1)) : MINUS_INF;
535 if (n_cigar_ && cigar_) {
536 uint8_t *zi = &z[(long)i * n_col];
537 for (j = beg; LIKELY(j < end); ++j) {
538 // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
539 // Cells are computed in the following order:
540 // M(i,j) = H(i-1,j-1) + S(i,j)
541 // H(i,j) = max{M(i,j), E(i,j), F(i,j)}
542 // E(i+1,j) = max{M(i,j)-gapo, E(i,j)} - gape
543 // F(i,j+1) = max{M(i,j)-gapo, F(i,j)} - gape
544 // We have to separate M(i,j); otherwise the direction may not be recorded correctly.
545 // However, a CIGAR like "10M3I3D10M" allowed by local() is disallowed by global().
546 // Such a CIGAR may occur, in theory, if mismatch_penalty > 2*gap_ext_penalty + 2*gap_open_penalty/k.
547 // In practice, this should happen very rarely given a reasonable scoring system.
548 eh_t *p = &eh[j];
549 int32_t h, m = p->h, e = p->e;
550 uint8_t d; // direction
551 p->h = h1;
552 m += q[j];
553 d = m >= e? 0 : 1;
554 h = m >= e? m : e;
555 d = h >= f? d : 2;
556 h = h >= f? h : f;
557 h1 = h;
558 t = m - oe_del;
559 e -= e_del;
560 d |= e > t? 1<<2 : 0;
561 e = e > t? e : t;
562 p->e = e;
563 t = m - oe_ins;
564 f -= e_ins;
565 d |= f > t? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two
566 f = f > t? f : t;
567 zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell
568 }
569 } else {
570 for (j = beg; LIKELY(j < end); ++j) {
571 eh_t *p = &eh[j];
572 int32_t h, m = p->h, e = p->e;
573 p->h = h1;
574 m += q[j];
575 h = m >= e? m : e;
576 h = h >= f? h : f;
577 h1 = h;
578 t = m - oe_del;
579 e -= e_del;
580 e = e > t? e : t;
581 p->e = e;
582 t = m - oe_ins;
583 f -= e_ins;
584 f = f > t? f : t;
585 }
586 }
587 eh[end].h = h1; eh[end].e = MINUS_INF;
588 }
589 score = eh[qlen].h;
590 if (n_cigar_ && cigar_) { // backtrack
591 int n_cigar = 0, m_cigar = 0, which = 0;
592 uint32_t *cigar = 0, tmp;
593 i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell
594 while (i >= 0 && k >= 0) {
595 which = z[(long)i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3;
596 if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k;
597 else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i;
598 else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k;
599 }
600 if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1);
601 if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1);
602 for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
603 tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
604 *n_cigar_ = n_cigar, *cigar_ = cigar;
605 }
606 free(eh); free(qp); free(z);
607 return score;
608 }
609
ksw_global(int qlen,const uint8_t * query,int tlen,const uint8_t * target,int m,const int8_t * mat,int gapo,int gape,int w,int * n_cigar_,uint32_t ** cigar_)610 int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_)
611 {
612 return ksw_global2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, n_cigar_, cigar_);
613 }
614
615 /*******************************************
616 * Main function (not compiled by default) *
617 *******************************************/
618
619 #ifdef _KSW_MAIN
620
621 #include <unistd.h>
622 #include <stdio.h>
623 #include <zlib.h>
624 #include "kseq.h"
625 KSEQ_INIT(gzFile, err_gzread)
626
627 unsigned char seq_nt4_table[256] = {
628 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
629 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
630 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
631 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
632 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
633 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
634 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
635 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
636 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
637 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
638 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
639 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
640 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
641 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
642 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
643 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
644 };
645
main(int argc,char * argv[])646 int main(int argc, char *argv[])
647 {
648 int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0;
649 int8_t mat[25];
650 int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART;
651 uint8_t *rseq = 0;
652 gzFile fpt, fpq;
653 kseq_t *kst, *ksq;
654
655 // parse command line
656 while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) {
657 switch (c) {
658 case 'a': sa = atoi(optarg); break;
659 case 'b': sb = atoi(optarg); break;
660 case 'q': gapo = atoi(optarg); break;
661 case 'r': gape = atoi(optarg); break;
662 case 't': minsc = atoi(optarg); break;
663 case 'f': forward_only = 1; break;
664 case '1': xtra |= KSW_XBYTE; break;
665 }
666 }
667 if (optind + 2 > argc) {
668 fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] <target.fa> <query.fa>\n", sa, sb, gapo, gape, minsc);
669 return 1;
670 }
671 if (minsc > 0xffff) minsc = 0xffff;
672 xtra |= KSW_XSUBO | minsc;
673 // initialize scoring matrix
674 for (i = k = 0; i < 4; ++i) {
675 for (j = 0; j < 4; ++j)
676 mat[k++] = i == j? sa : -sb;
677 mat[k++] = 0; // ambiguous base
678 }
679 for (j = 0; j < 5; ++j) mat[k++] = 0;
680 // open file
681 fpt = xzopen(argv[optind], "r"); kst = kseq_init(fpt);
682 fpq = xzopen(argv[optind+1], "r"); ksq = kseq_init(fpq);
683 // all-pair alignment
684 while (kseq_read(ksq) > 0) {
685 kswq_t *q[2] = {0, 0};
686 kswr_t r;
687 for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]];
688 if (!forward_only) { // reverse
689 if ((int)ksq->seq.m > max_rseq) {
690 max_rseq = ksq->seq.m;
691 rseq = (uint8_t*)realloc(rseq, max_rseq);
692 }
693 for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j)
694 rseq[j] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i];
695 }
696 gzrewind(fpt); kseq_rewind(kst);
697 while (kseq_read(kst) > 0) {
698 for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]];
699 r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]);
700 if (r.score >= minsc)
701 err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2);
702 if (rseq) {
703 r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]);
704 if (r.score >= minsc)
705 err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2);
706 }
707 }
708 free(q[0]); free(q[1]);
709 }
710 free(rseq);
711 kseq_destroy(kst); err_gzclose(fpt);
712 kseq_destroy(ksq); err_gzclose(fpq);
713 return 0;
714 }
715 #endif
716