1 /* The MIT License
2 
3    Copyright (c) 2011 by Attractive Chaos <attractor@live.co.uk>
4 
5    Permission is hereby granted, free of charge, to any person obtaining
6    a copy of this software and associated documentation files (the
7    "Software"), to deal in the Software without restriction, including
8    without limitation the rights to use, copy, modify, merge, publish,
9    distribute, sublicense, and/or sell copies of the Software, and to
10    permit persons to whom the Software is furnished to do so, subject to
11    the following conditions:
12 
13    The above copyright notice and this permission notice shall be
14    included in all copies or substantial portions of the Software.
15 
16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23    SOFTWARE.
24 */
25 
26 #include <stdlib.h>
27 #include <stdint.h>
28 #include <assert.h>
29 // #include <emmintrin.h>
30 #define SIMDE_ENABLE_NATIVE_ALIASES
31 #include <simde/x86/sse2.h>
32 #include "ksw.h"
33 
34 #ifdef USE_MALLOC_WRAPPERS
35 #  include "malloc_wrap.h"
36 #endif
37 
38 #ifdef __GNUC__
39 #define LIKELY(x) __builtin_expect((x),1)
40 #define UNLIKELY(x) __builtin_expect((x),0)
41 #else
42 #define LIKELY(x) (x)
43 #define UNLIKELY(x) (x)
44 #endif
45 
46 const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
47 
48 struct _kswq_t {
49 	int qlen, slen;
50 	uint8_t shift, mdiff, max, size;
51 	__m128i *qp, *H0, *H1, *E, *Hmax;
52 };
53 
54 /**
55  * Initialize the query data structure
56  *
57  * @param size   Number of bytes used to store a score; valid valures are 1 or 2
58  * @param qlen   Length of the query sequence
59  * @param query  Query sequence
60  * @param m      Size of the alphabet
61  * @param mat    Scoring matrix in a one-dimension array
62  *
63  * @return       Query data structure
64  */
ksw_qinit(int size,int qlen,const uint8_t * query,int m,const int8_t * mat)65 kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
66 {
67 	kswq_t *q;
68 	int slen, a, tmp, p;
69 
70 	size = size > 1? 2 : 1;
71 	p = 8 * (3 - size); // # values per __m128i
72 	slen = (qlen + p - 1) / p; // segmented length
73 	q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
74 	q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
75 	q->H0 = q->qp + slen * m;
76 	q->H1 = q->H0 + slen;
77 	q->E  = q->H1 + slen;
78 	q->Hmax = q->E + slen;
79 	q->slen = slen; q->qlen = qlen; q->size = size;
80 	// compute shift
81 	tmp = m * m;
82 	for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
83 		if (mat[a] < (int8_t)q->shift) q->shift = mat[a];
84 		if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a];
85 	}
86 	q->max = q->mdiff;
87 	q->shift = 256 - q->shift; // NB: q->shift is uint8_t
88 	q->mdiff += q->shift; // this is the difference between the min and max scores
89 	// An example: p=8, qlen=19, slen=3 and segmentation:
90 	//  {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
91 	if (size == 1) {
92 		int8_t *t = (int8_t*)q->qp;
93 		for (a = 0; a < m; ++a) {
94 			int i, k, nlen = slen * p;
95 			const int8_t *ma = mat + a * m;
96 			for (i = 0; i < slen; ++i)
97 				for (k = i; k < nlen; k += slen) // p iterations
98 					*t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift;
99 		}
100 	} else {
101 		int16_t *t = (int16_t*)q->qp;
102 		for (a = 0; a < m; ++a) {
103 			int i, k, nlen = slen * p;
104 			const int8_t *ma = mat + a * m;
105 			for (i = 0; i < slen; ++i)
106 				for (k = i; k < nlen; k += slen) // p iterations
107 					*t++ = (k >= qlen? 0 : ma[query[k]]);
108 		}
109 	}
110 	return q;
111 }
112 
ksw_u8(kswq_t * q,int tlen,const uint8_t * target,int _o_del,int _e_del,int _o_ins,int _e_ins,int xtra)113 kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
114 {
115 	int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
116 	uint64_t *b;
117 	__m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax;
118 	kswr_t r;
119 
120 #define __max_16(ret, xx) do { \
121 		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
122 		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
123 		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \
124 		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \
125     	(ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
126 	} while (0)
127 
128 	// initialization
129 	r = g_defr;
130 	minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
131 	endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
132 	m_b = n_b = 0; b = 0;
133 	zero = _mm_set1_epi32(0);
134 	oe_del = _mm_set1_epi8(_o_del + _e_del);
135 	e_del = _mm_set1_epi8(_e_del);
136 	oe_ins = _mm_set1_epi8(_o_ins + _e_ins);
137 	e_ins = _mm_set1_epi8(_e_ins);
138 	shift = _mm_set1_epi8(q->shift);
139 	H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
140 	slen = q->slen;
141 	for (i = 0; i < slen; ++i) {
142 		_mm_store_si128(E + i, zero);
143 		_mm_store_si128(H0 + i, zero);
144 		_mm_store_si128(Hmax + i, zero);
145 	}
146 	// the core loop
147 	for (i = 0; i < tlen; ++i) {
148 		int j, k, cmp, imax;
149 		__m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
150 		h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
151 		h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
152 		for (j = 0; LIKELY(j < slen); ++j) {
153 			/* SW cells are computed in the following order:
154 			 *   H(i,j)   = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
155 			 *   E(i+1,j) = max{H(i,j)-q, E(i,j)-r}
156 			 *   F(i,j+1) = max{H(i,j)-q, F(i,j)-r}
157 			 */
158 			// compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
159 			h = _mm_adds_epu8(h, _mm_load_si128(S + j));
160 			h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j)
161 			e = _mm_load_si128(E + j); // e=E'(i,j)
162 			h = _mm_max_epu8(h, e);
163 			h = _mm_max_epu8(h, f); // h=H'(i,j)
164 			max = _mm_max_epu8(max, h); // set max
165 			_mm_store_si128(H1 + j, h); // save to H'(i,j)
166 			// now compute E'(i+1,j)
167 			e = _mm_subs_epu8(e, e_del); // e=E'(i,j) - e_del
168 			t = _mm_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del
169 			e = _mm_max_epu8(e, t); // e=E'(i+1,j)
170 			_mm_store_si128(E + j, e); // save to E'(i+1,j)
171 			// now compute F'(i,j+1)
172 			f = _mm_subs_epu8(f, e_ins);
173 			t = _mm_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins
174 			f = _mm_max_epu8(f, t);
175 			// get H'(i-1,j) and prepare for the next j
176 			h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
177 		}
178 		// NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion
179 		for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
180 			f = _mm_slli_si128(f, 1);
181 			for (j = 0; LIKELY(j < slen); ++j) {
182 				h = _mm_load_si128(H1 + j);
183 				h = _mm_max_epu8(h, f); // h=H'(i,j)
184 				_mm_store_si128(H1 + j, h);
185 				h = _mm_subs_epu8(h, oe_ins);
186 				f = _mm_subs_epu8(f, e_ins);
187 				cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero));
188 				if (UNLIKELY(cmp == 0xffff)) goto end_loop16;
189 			}
190 		}
191 end_loop16:
192 		//int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n");
193 		__max_16(imax, max); // imax is the maximum number in max
194 		if (imax >= minsc) { // write the b array; this condition adds branching unfornately
195 			if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append
196 				if (n_b == m_b) {
197 					m_b = m_b? m_b<<1 : 8;
198 					b = (uint64_t*)realloc(b, 8 * m_b);
199 				}
200 				b[n_b++] = (uint64_t)imax<<32 | i;
201 			} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
202 		}
203 		if (imax > gmax) {
204 			gmax = imax; te = i; // te is the end position on the target
205 			for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
206 				_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
207 			if (gmax + q->shift >= 255 || gmax >= endsc) break;
208 		}
209 		S = H1; H1 = H0; H0 = S; // swap H0 and H1
210 	}
211 	r.score = gmax + q->shift < 255? gmax : 255;
212 	r.te = te;
213 	if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score
214 		int max = -1, tmp, low, high, qlen = slen * 16;
215 		uint8_t *t = (uint8_t*)Hmax;
216 		for (i = 0; i < qlen; ++i, ++t)
217 			if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen;
218 			else if ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp;
219 		//printf("%d,%d\n", max, gmax);
220 		if (b) {
221 			i = (r.score + q->max - 1) / q->max;
222 			low = te - i; high = te + i;
223 			for (i = 0; i < n_b; ++i) {
224 				int e = (int32_t)b[i];
225 				if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
226 					r.score2 = b[i]>>32, r.te2 = e;
227 			}
228 		}
229 	}
230 	free(b);
231 	return r;
232 }
233 
ksw_i16(kswq_t * q,int tlen,const uint8_t * target,int _o_del,int _e_del,int _o_ins,int _e_ins,int xtra)234 kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
235 {
236 	int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
237 	uint64_t *b;
238 	__m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax;
239 	kswr_t r;
240 
241 #define __max_8(ret, xx) do { \
242 		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
243 		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
244 		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
245     	(ret) = _mm_extract_epi16((xx), 0); \
246 	} while (0)
247 
248 	// initialization
249 	r = g_defr;
250 	minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
251 	endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
252 	m_b = n_b = 0; b = 0;
253 	zero = _mm_set1_epi32(0);
254 	oe_del = _mm_set1_epi16(_o_del + _e_del);
255 	e_del = _mm_set1_epi16(_e_del);
256 	oe_ins = _mm_set1_epi16(_o_ins + _e_ins);
257 	e_ins = _mm_set1_epi16(_e_ins);
258 	H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
259 	slen = q->slen;
260 	for (i = 0; i < slen; ++i) {
261 		_mm_store_si128(E + i, zero);
262 		_mm_store_si128(H0 + i, zero);
263 		_mm_store_si128(Hmax + i, zero);
264 	}
265 	// the core loop
266 	for (i = 0; i < tlen; ++i) {
267 		int j, k, imax;
268 		__m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
269 		h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
270 		h = _mm_slli_si128(h, 2);
271 		for (j = 0; LIKELY(j < slen); ++j) {
272 			h = _mm_adds_epi16(h, *S++);
273 			e = _mm_load_si128(E + j);
274 			h = _mm_max_epi16(h, e);
275 			h = _mm_max_epi16(h, f);
276 			max = _mm_max_epi16(max, h);
277 			_mm_store_si128(H1 + j, h);
278 			e = _mm_subs_epu16(e, e_del);
279 			t = _mm_subs_epu16(h, oe_del);
280 			e = _mm_max_epi16(e, t);
281 			_mm_store_si128(E + j, e);
282 			f = _mm_subs_epu16(f, e_ins);
283 			t = _mm_subs_epu16(h, oe_ins);
284 			f = _mm_max_epi16(f, t);
285 			h = _mm_load_si128(H0 + j);
286 		}
287 		for (k = 0; LIKELY(k < 16); ++k) {
288 			f = _mm_slli_si128(f, 2);
289 			for (j = 0; LIKELY(j < slen); ++j) {
290 				h = _mm_load_si128(H1 + j);
291 				h = _mm_max_epi16(h, f);
292 				_mm_store_si128(H1 + j, h);
293 				h = _mm_subs_epu16(h, oe_ins);
294 				f = _mm_subs_epu16(f, e_ins);
295 				if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
296 			}
297 		}
298 end_loop8:
299 		__max_8(imax, max);
300 		if (imax >= minsc) {
301 			if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) {
302 				if (n_b == m_b) {
303 					m_b = m_b? m_b<<1 : 8;
304 					b = (uint64_t*)realloc(b, 8 * m_b);
305 				}
306 				b[n_b++] = (uint64_t)imax<<32 | i;
307 			} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
308 		}
309 		if (imax > gmax) {
310 			gmax = imax; te = i;
311 			for (j = 0; LIKELY(j < slen); ++j)
312 				_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
313 			if (gmax >= endsc) break;
314 		}
315 		S = H1; H1 = H0; H0 = S;
316 	}
317 	r.score = gmax; r.te = te;
318 	{
319 		int max = -1, tmp, low, high, qlen = slen * 8;
320 		uint16_t *t = (uint16_t*)Hmax;
321 		for (i = 0, r.qe = -1; i < qlen; ++i, ++t)
322 			if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen;
323 			else if ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp;
324 		if (b) {
325 			i = (r.score + q->max - 1) / q->max;
326 			low = te - i; high = te + i;
327 			for (i = 0; i < n_b; ++i) {
328 				int e = (int32_t)b[i];
329 				if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
330 					r.score2 = b[i]>>32, r.te2 = e;
331 			}
332 		}
333 	}
334 	free(b);
335 	return r;
336 }
337 
revseq(int l,uint8_t * s)338 static inline void revseq(int l, uint8_t *s)
339 {
340 	int i, t;
341 	for (i = 0; i < l>>1; ++i)
342 		t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t;
343 }
344 
ksw_align2(int qlen,uint8_t * query,int tlen,uint8_t * target,int m,const int8_t * mat,int o_del,int e_del,int o_ins,int e_ins,int xtra,kswq_t ** qry)345 kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry)
346 {
347 	int size;
348 	kswq_t *q;
349 	kswr_t r, rr;
350 	kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int, int, int);
351 
352 	q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat);
353 	if (qry && *qry == 0) *qry = q;
354 	func = q->size == 2? ksw_i16 : ksw_u8;
355 	size = q->size;
356 	r = func(q, tlen, target, o_del, e_del, o_ins, e_ins, xtra);
357 	if (qry == 0) free(q);
358 	if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r;
359 	revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end
360 	q = ksw_qinit(size, r.qe + 1, query, m, mat);
361 	rr = func(q, tlen, target, o_del, e_del, o_ins, e_ins, KSW_XSTOP | r.score);
362 	revseq(r.qe + 1, query); revseq(r.te + 1, target);
363 	free(q);
364 	if (r.score == rr.score)
365 		r.tb = r.te - rr.te, r.qb = r.qe - rr.qe;
366 	return r;
367 }
368 
ksw_align(int qlen,uint8_t * query,int tlen,uint8_t * target,int m,const int8_t * mat,int gapo,int gape,int xtra,kswq_t ** qry)369 kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry)
370 {
371 	return ksw_align2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, xtra, qry);
372 }
373 
374 /********************
375  *** SW extension ***
376  ********************/
377 
378 typedef struct {
379 	int32_t h, e;
380 } eh_t;
381 
ksw_extend2(int qlen,const uint8_t * query,int tlen,const uint8_t * target,int m,const int8_t * mat,int o_del,int e_del,int o_ins,int e_ins,int w,int end_bonus,int zdrop,int h0,int * _qle,int * _tle,int * _gtle,int * _gscore,int * _max_off)382 int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
383 {
384 	eh_t *eh; // score array
385 	int8_t *qp; // query profile
386 	int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
387 	assert(h0 > 0);
388 	// allocate memory
389 	qp = malloc(qlen * m);
390 	eh = calloc(qlen + 1, 8);
391 	// generate the query profile
392 	for (k = i = 0; k < m; ++k) {
393 		const int8_t *p = &mat[k * m];
394 		for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
395 	}
396 	// fill the first row
397 	eh[0].h = h0; eh[1].h = h0 > oe_ins? h0 - oe_ins : 0;
398 	for (j = 2; j <= qlen && eh[j-1].h > e_ins; ++j)
399 		eh[j].h = eh[j-1].h - e_ins;
400 	// adjust $w if it is too large
401 	k = m * m;
402 	for (i = 0, max = 0; i < k; ++i) // get the max score
403 		max = max > mat[i]? max : mat[i];
404 	max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.);
405 	max_ins = max_ins > 1? max_ins : 1;
406 	w = w < max_ins? w : max_ins;
407 	max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
408 	max_del = max_del > 1? max_del : 1;
409 	w = w < max_del? w : max_del; // TODO: is this necessary?
410 	// DP loop
411 	max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
412 	max_off = 0;
413 	beg = 0, end = qlen;
414 	for (i = 0; LIKELY(i < tlen); ++i) {
415 		int t, f = 0, h1, m = 0, mj = -1;
416 		int8_t *q = &qp[target[i] * qlen];
417 		// apply the band and the constraint (if provided)
418 		if (beg < i - w) beg = i - w;
419 		if (end > i + w + 1) end = i + w + 1;
420 		if (end > qlen) end = qlen;
421 		// compute the first column
422 		if (beg == 0) {
423 			h1 = h0 - (o_del + e_del * (i + 1));
424 			if (h1 < 0) h1 = 0;
425 		} else h1 = 0;
426 		for (j = beg; LIKELY(j < end); ++j) {
427 			// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
428 			// Similar to SSE2-SW, cells are computed in the following order:
429 			//   H(i,j)   = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
430 			//   E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape
431 			//   F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape
432 			eh_t *p = &eh[j];
433 			int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j)
434 			p->h = h1;          // set H(i,j-1) for the next row
435 			M = M? M + q[j] : 0;// separating H and M to disallow a cigar like "100M3I3D20M"
436 			h = M > e? M : e;   // e and f are guaranteed to be non-negative, so h>=0 even if M<0
437 			h = h > f? h : f;
438 			h1 = h;             // save H(i,j) to h1 for the next column
439 			mj = m > h? mj : j; // record the position where max score is achieved
440 			m = m > h? m : h;   // m is stored at eh[mj+1]
441 			t = M - oe_del;
442 			t = t > 0? t : 0;
443 			e -= e_del;
444 			e = e > t? e : t;   // computed E(i+1,j)
445 			p->e = e;           // save E(i+1,j) for the next row
446 			t = M - oe_ins;
447 			t = t > 0? t : 0;
448 			f -= e_ins;
449 			f = f > t? f : t;   // computed F(i,j+1)
450 		}
451 		eh[end].h = h1; eh[end].e = 0;
452 		if (j == qlen) {
453 			max_ie = gscore > h1? max_ie : i;
454 			gscore = gscore > h1? gscore : h1;
455 		}
456 		if (m == 0) break;
457 		if (m > max) {
458 			max = m, max_i = i, max_j = mj;
459 			max_off = max_off > abs(mj - i)? max_off : abs(mj - i);
460 		} else if (zdrop > 0) {
461 			if (i - max_i > mj - max_j) {
462 				if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) break;
463 			} else {
464 				if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) break;
465 			}
466 		}
467 		// update beg and end for the next round
468 		for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j);
469 		beg = j;
470 		for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j);
471 		end = j + 2 < qlen? j + 2 : qlen;
472 		//beg = 0; end = qlen; // uncomment this line for debugging
473 	}
474 	free(eh); free(qp);
475 	if (_qle) *_qle = max_j + 1;
476 	if (_tle) *_tle = max_i + 1;
477 	if (_gtle) *_gtle = max_ie + 1;
478 	if (_gscore) *_gscore = gscore;
479 	if (_max_off) *_max_off = max_off;
480 	return max;
481 }
482 
ksw_extend(int qlen,const uint8_t * query,int tlen,const uint8_t * target,int m,const int8_t * mat,int gapo,int gape,int w,int end_bonus,int zdrop,int h0,int * qle,int * tle,int * gtle,int * gscore,int * max_off)483 int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off)
484 {
485 	return ksw_extend2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, end_bonus, zdrop, h0, qle, tle, gtle, gscore, max_off);
486 }
487 
488 /********************
489  * Global alignment *
490  ********************/
491 
492 #define MINUS_INF -0x40000000
493 
push_cigar(int * n_cigar,int * m_cigar,uint32_t * cigar,int op,int len)494 static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len)
495 {
496 	if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) {
497 		if (*n_cigar == *m_cigar) {
498 			*m_cigar = *m_cigar? (*m_cigar)<<1 : 4;
499 			cigar = realloc(cigar, (*m_cigar) << 2);
500 		}
501 		cigar[(*n_cigar)++] = len<<4 | op;
502 	} else cigar[(*n_cigar)-1] += len<<4;
503 	return cigar;
504 }
505 
ksw_global2(int qlen,const uint8_t * query,int tlen,const uint8_t * target,int m,const int8_t * mat,int o_del,int e_del,int o_ins,int e_ins,int w,int * n_cigar_,uint32_t ** cigar_)506 int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar_, uint32_t **cigar_)
507 {
508 	eh_t *eh;
509 	int8_t *qp; // query profile
510 	int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, score, n_col;
511 	uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex
512 	if (n_cigar_) *n_cigar_ = 0;
513 	// allocate memory
514 	n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix
515 	z = n_cigar_ && cigar_? malloc((long)n_col * tlen) : 0;
516 	qp = malloc(qlen * m);
517 	eh = calloc(qlen + 1, 8);
518 	// generate the query profile
519 	for (k = i = 0; k < m; ++k) {
520 		const int8_t *p = &mat[k * m];
521 		for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
522 	}
523 	// fill the first row
524 	eh[0].h = 0; eh[0].e = MINUS_INF;
525 	for (j = 1; j <= qlen && j <= w; ++j)
526 		eh[j].h = -(o_ins + e_ins * j), eh[j].e = MINUS_INF;
527 	for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band
528 	// DP loop
529 	for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop
530 		int32_t f = MINUS_INF, h1, beg, end, t;
531 		int8_t *q = &qp[target[i] * qlen];
532 		beg = i > w? i - w : 0;
533 		end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence
534 		h1 = beg == 0? -(o_del + e_del * (i + 1)) : MINUS_INF;
535 		if (n_cigar_ && cigar_) {
536 			uint8_t *zi = &z[(long)i * n_col];
537 			for (j = beg; LIKELY(j < end); ++j) {
538 				// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
539 				// Cells are computed in the following order:
540 				//   M(i,j)   = H(i-1,j-1) + S(i,j)
541 				//   H(i,j)   = max{M(i,j), E(i,j), F(i,j)}
542 				//   E(i+1,j) = max{M(i,j)-gapo, E(i,j)} - gape
543 				//   F(i,j+1) = max{M(i,j)-gapo, F(i,j)} - gape
544 				// We have to separate M(i,j); otherwise the direction may not be recorded correctly.
545 				// However, a CIGAR like "10M3I3D10M" allowed by local() is disallowed by global().
546 				// Such a CIGAR may occur, in theory, if mismatch_penalty > 2*gap_ext_penalty + 2*gap_open_penalty/k.
547 				// In practice, this should happen very rarely given a reasonable scoring system.
548 				eh_t *p = &eh[j];
549 				int32_t h, m = p->h, e = p->e;
550 				uint8_t d; // direction
551 				p->h = h1;
552 				m += q[j];
553 				d = m >= e? 0 : 1;
554 				h = m >= e? m : e;
555 				d = h >= f? d : 2;
556 				h = h >= f? h : f;
557 				h1 = h;
558 				t = m - oe_del;
559 				e -= e_del;
560 				d |= e > t? 1<<2 : 0;
561 				e  = e > t? e    : t;
562 				p->e = e;
563 				t = m - oe_ins;
564 				f -= e_ins;
565 				d |= f > t? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two
566 				f  = f > t? f    : t;
567 				zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell
568 			}
569 		} else {
570 			for (j = beg; LIKELY(j < end); ++j) {
571 				eh_t *p = &eh[j];
572 				int32_t h, m = p->h, e = p->e;
573 				p->h = h1;
574 				m += q[j];
575 				h = m >= e? m : e;
576 				h = h >= f? h : f;
577 				h1 = h;
578 				t = m - oe_del;
579 				e -= e_del;
580 				e  = e > t? e : t;
581 				p->e = e;
582 				t = m - oe_ins;
583 				f -= e_ins;
584 				f  = f > t? f : t;
585 			}
586 		}
587 		eh[end].h = h1; eh[end].e = MINUS_INF;
588 	}
589 	score = eh[qlen].h;
590 	if (n_cigar_ && cigar_) { // backtrack
591 		int n_cigar = 0, m_cigar = 0, which = 0;
592 		uint32_t *cigar = 0, tmp;
593 		i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell
594 		while (i >= 0 && k >= 0) {
595 			which = z[(long)i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3;
596 			if (which == 0)      cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k;
597 			else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i;
598 			else                 cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k;
599 		}
600 		if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1);
601 		if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1);
602 		for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
603 			tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
604 		*n_cigar_ = n_cigar, *cigar_ = cigar;
605 	}
606 	free(eh); free(qp); free(z);
607 	return score;
608 }
609 
ksw_global(int qlen,const uint8_t * query,int tlen,const uint8_t * target,int m,const int8_t * mat,int gapo,int gape,int w,int * n_cigar_,uint32_t ** cigar_)610 int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_)
611 {
612 	return ksw_global2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, n_cigar_, cigar_);
613 }
614 
615 /*******************************************
616  * Main function (not compiled by default) *
617  *******************************************/
618 
619 #ifdef _KSW_MAIN
620 
621 #include <unistd.h>
622 #include <stdio.h>
623 #include <zlib.h>
624 #include "kseq.h"
625 KSEQ_INIT(gzFile, err_gzread)
626 
627 unsigned char seq_nt4_table[256] = {
628 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
629 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
630 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
631 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
632 	4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
633 	4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
634 	4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
635 	4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
636 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
637 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
638 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
639 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
640 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
641 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
642 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
643 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
644 };
645 
main(int argc,char * argv[])646 int main(int argc, char *argv[])
647 {
648 	int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0;
649 	int8_t mat[25];
650 	int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART;
651 	uint8_t *rseq = 0;
652 	gzFile fpt, fpq;
653 	kseq_t *kst, *ksq;
654 
655 	// parse command line
656 	while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) {
657 		switch (c) {
658 			case 'a': sa = atoi(optarg); break;
659 			case 'b': sb = atoi(optarg); break;
660 			case 'q': gapo = atoi(optarg); break;
661 			case 'r': gape = atoi(optarg); break;
662 			case 't': minsc = atoi(optarg); break;
663 			case 'f': forward_only = 1; break;
664 			case '1': xtra |= KSW_XBYTE; break;
665 		}
666 	}
667 	if (optind + 2 > argc) {
668 		fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] <target.fa> <query.fa>\n", sa, sb, gapo, gape, minsc);
669 		return 1;
670 	}
671 	if (minsc > 0xffff) minsc = 0xffff;
672 	xtra |= KSW_XSUBO | minsc;
673 	// initialize scoring matrix
674 	for (i = k = 0; i < 4; ++i) {
675 		for (j = 0; j < 4; ++j)
676 			mat[k++] = i == j? sa : -sb;
677 		mat[k++] = 0; // ambiguous base
678 	}
679 	for (j = 0; j < 5; ++j) mat[k++] = 0;
680 	// open file
681 	fpt = xzopen(argv[optind],   "r"); kst = kseq_init(fpt);
682 	fpq = xzopen(argv[optind+1], "r"); ksq = kseq_init(fpq);
683 	// all-pair alignment
684 	while (kseq_read(ksq) > 0) {
685 		kswq_t *q[2] = {0, 0};
686 		kswr_t r;
687 		for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]];
688 		if (!forward_only) { // reverse
689 			if ((int)ksq->seq.m > max_rseq) {
690 				max_rseq = ksq->seq.m;
691 				rseq = (uint8_t*)realloc(rseq, max_rseq);
692 			}
693 			for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j)
694 				rseq[j] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i];
695 		}
696 		gzrewind(fpt); kseq_rewind(kst);
697 		while (kseq_read(kst) > 0) {
698 			for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]];
699 			r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]);
700 			if (r.score >= minsc)
701 				err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2);
702 			if (rseq) {
703 				r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]);
704 				if (r.score >= minsc)
705 					err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2);
706 			}
707 		}
708 		free(q[0]); free(q[1]);
709 	}
710 	free(rseq);
711 	kseq_destroy(kst); err_gzclose(fpt);
712 	kseq_destroy(ksq); err_gzclose(fpq);
713 	return 0;
714 }
715 #endif
716