1 /*
2
3 VSEARCH: a versatile open source tool for metagenomics
4
5 Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6 All rights reserved.
7
8 Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9 Department of Informatics, University of Oslo,
10 PO Box 1080 Blindern, NO-0316 Oslo, Norway
11
12 This software is dual-licensed and available under a choice
13 of one of two licenses, either under the terms of the GNU
14 General Public License version 3 or the BSD 2-Clause License.
15
16
17 GNU General Public License version 3
18
19 This program is free software: you can redistribute it and/or modify
20 it under the terms of the GNU General Public License as published by
21 the Free Software Foundation, either version 3 of the License, or
22 (at your option) any later version.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License for more details.
28
29 You should have received a copy of the GNU General Public License
30 along with this program. If not, see <http://www.gnu.org/licenses/>.
31
32
33 The BSD 2-Clause License
34
35 Redistribution and use in source and binary forms, with or without
36 modification, are permitted provided that the following conditions
37 are met:
38
39 1. Redistributions of source code must retain the above copyright
40 notice, this list of conditions and the following disclaimer.
41
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
45
46 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50 COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57 POSSIBILITY OF SUCH DAMAGE.
58
59 */
60
61 #include "vsearch.h"
62
63 /*
64 Using 16-bit signed values, from -32768 to +32767.
65 match: positive
66 mismatch: negative
67 gap penalties: positive (open, extend, query/target, left/interior/right)
68 optimal global alignment (NW)
69 maximize score
70 */
71
72 #define CHANNELS 8
73 #define CDEPTH 4
74
75 /*
76 Due to memory usage, limit the product of the length of the sequences.
77 If the product of the query length and any target sequence length
78 is above the limit, the alignment will not be computed and a score
79 of SHRT_MAX will be returned as the score.
80 If an overflow occurs during alignment computation, a score of
81 SHRT_MAX will also be returned.
82
83 The limit is set to 5 000 * 5 000 = 25 000 000. This will allocate up to
84 200 MB per thread. It will align pairs of sequences less than 5000 nt long
85 using the SIMD implementation, larger alignments will be performed with
86 the linear memory aligner.
87 */
88
89 #define MAXSEQLENPRODUCT 25000000
90
91 static int64_t scorematrix[16][16];
92
93 /*
94 The macros below usually operate on 128-bit vectors of 8 signed
95 short 16-bit integers. Additions and subtractions should be
96 saturated. The shift operation should shift left by 2 bytes (one
97 short int) and shift in zeros. The v_mask_gt operation should
98 compare two vectors of signed shorts and return a 16-bit bitmask
99 with pairs of 2 bits set for each element greater in the first than
100 in the second argument.
101 */
102
103 #ifdef __PPC__
104
105 typedef vector signed short VECTOR_SHORT;
106
107 const vector unsigned char perm_merge_long_low =
108 {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
109 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
110
111 const vector unsigned char perm_merge_long_high =
112 {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
113 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
114
115 #define v_init(a,b,c,d,e,f,g,h) (const VECTOR_SHORT){a,b,c,d,e,f,g,h}
116 #define v_load(a) vec_ld(0, (VECTOR_SHORT *)(a))
117 #define v_store(a, b) vec_st((vector unsigned char)(b), 0, \
118 (vector unsigned char *)(a))
119 #define v_add(a, b) vec_adds((a), (b))
120 #define v_sub(a, b) vec_subs((a), (b))
121 #define v_sub_unsigned(a, b) ((VECTOR_SHORT) \
122 vec_subs((vector unsigned short) (a), \
123 (vector unsigned short) (b)))
124 #define v_max(a, b) vec_max((a), (b))
125 #define v_min(a, b) vec_min((a), (b))
126 #define v_dup(a) vec_splat((VECTOR_SHORT){(short)(a), 0, 0, 0, 0, 0, 0, 0}, 0);
127 #define v_zero vec_splat_s16(0)
128 #define v_and(a, b) vec_and((a), (b))
129 #define v_xor(a, b) vec_xor((a), (b))
130 #define v_shift_left(a) vec_sld((a), v_zero, 2)
131
132 #elif defined __aarch64__
133
134 typedef int16x8_t VECTOR_SHORT;
135
136 const uint16x8_t neon_mask =
137 {0x0003, 0x000c, 0x0030, 0x00c0, 0x0300, 0x0c00, 0x3000, 0xc000};
138
139 #define v_init(a,b,c,d,e,f,g,h) (const VECTOR_SHORT){a,b,c,d,e,f,g,h}
140 #define v_load(a) vld1q_s16((const int16_t *)(a))
141 #define v_store(a, b) vst1q_s16((int16_t *)(a), (b))
142 #define v_merge_lo_16(a, b) vzip1q_s16((a),(b))
143 #define v_merge_hi_16(a, b) vzip2q_s16((a),(b))
144 #define v_merge_lo_32(a, b) vreinterpretq_s16_s32(vzip1q_s32(vreinterpretq_s32_s16(a), vreinterpretq_s32_s16(b)))
145 #define v_merge_hi_32(a, b) vreinterpretq_s16_s32(vzip2q_s32(vreinterpretq_s32_s16(a), vreinterpretq_s32_s16(b)))
146 #define v_merge_lo_64(a, b) vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s16(a)), vget_low_s64(vreinterpretq_s64_s16(b))))
147 #define v_merge_hi_64(a, b) vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s16(a)), vget_high_s64(vreinterpretq_s64_s16(b))))
148 #define v_add(a, b) vqaddq_s16((a), (b))
149 #define v_sub(a, b) vqsubq_s16((a), (b))
150 #define v_sub_unsigned(a, b) vreinterpretq_s16_u16(vqsubq_u16(vreinterpretq_u16_s16(a), vreinterpretq_u16_s16(b)))
151 #define v_max(a, b) vmaxq_s16((a), (b))
152 #define v_min(a, b) vminq_s16((a), (b))
153 #define v_dup(a) vdupq_n_s16(a)
154 #define v_zero v_dup(0)
155 #define v_and(a, b) vandq_s16((a), (b))
156 #define v_xor(a, b) veorq_s16((a), (b))
157 #define v_shift_left(a) vextq_s16((v_zero), (a), 7)
158 #define v_mask_gt(a, b) vaddvq_u16(vandq_u16((vcgtq_s16((a), (b))), neon_mask))
159
160 #elif __x86_64__
161
162 typedef __m128i VECTOR_SHORT;
163
164 #define v_init(a,b,c,d,e,f,g,h) _mm_set_epi16(h,g,f,e,d,c,b,a)
165 #define v_load(a) _mm_load_si128((VECTOR_SHORT *)(a))
166 #define v_store(a, b) _mm_store_si128((VECTOR_SHORT *)(a), (b))
167 #define v_merge_lo_16(a, b) _mm_unpacklo_epi16((a),(b))
168 #define v_merge_hi_16(a, b) _mm_unpackhi_epi16((a),(b))
169 #define v_merge_lo_32(a, b) _mm_unpacklo_epi32((a),(b))
170 #define v_merge_hi_32(a, b) _mm_unpackhi_epi32((a),(b))
171 #define v_merge_lo_64(a, b) _mm_unpacklo_epi64((a),(b))
172 #define v_merge_hi_64(a, b) _mm_unpackhi_epi64((a),(b))
173 #define v_add(a, b) _mm_adds_epi16((a), (b))
174 #define v_sub(a, b) _mm_subs_epi16((a), (b))
175 #define v_sub_unsigned(a, b) _mm_subs_epu16((a), (b))
176 #define v_max(a, b) _mm_max_epi16((a), (b))
177 #define v_min(a, b) _mm_min_epi16((a), (b))
178 #define v_dup(a) _mm_set1_epi16(a)
179 #define v_zero v_dup(0)
180 #define v_and(a, b) _mm_and_si128((a), (b))
181 #define v_xor(a, b) _mm_xor_si128((a), (b))
182 #define v_shift_left(a) _mm_slli_si128((a), 2)
183 #define v_mask_gt(a, b) _mm_movemask_epi8(_mm_cmpgt_epi16((a), (b)))
184
185 #else
186
187 #error Unknown Architecture
188
189 #endif
190
/* Per-thread state of the 16-bit, 8-channel SIMD aligner. */
struct s16info_s
{
  VECTOR_SHORT matrix[32];   /* 16x16 score matrix (256 CELLs, see search16_init) */
  VECTOR_SHORT * hearray;    /* H and E values along the query, 2 vectors per position */
  VECTOR_SHORT * dprofile;   /* query profile for the current 4-residue target chunk */
  VECTOR_SHORT ** qtable;    /* per-query-position pointers into dprofile */
  unsigned short * dir;      /* direction bits for backtracking */
  char * qseq;               /* query sequence */
  uint64_t diralloc;         /* allocated size of the dir buffer */

  char * cigar;              /* cigar string buffer */
  char * cigarend;           /* current write position; cigar is built backwards */
  int64_t cigaralloc;        /* allocated size of the cigar buffer */
  int opcount;               /* run length of the current cigar operation */
  char op;                   /* current cigar operation ('M', 'D', 'I') or 0 */

  int qlen;                  /* query length */
  int maxdlen;               /* longest target length in the current chunk */

  /* gap penalties, split by sequence (query/target), position
     (left end / interior / right end) and kind (open / extension) */
  CELL penalty_gap_open_query_left;
  CELL penalty_gap_open_target_left;
  CELL penalty_gap_open_query_interior;
  CELL penalty_gap_open_target_interior;
  CELL penalty_gap_open_query_right;
  CELL penalty_gap_open_target_right;
  CELL penalty_gap_extension_query_left;
  CELL penalty_gap_extension_target_left;
  CELL penalty_gap_extension_query_interior;
  CELL penalty_gap_extension_target_interior;
  CELL penalty_gap_extension_query_right;
  CELL penalty_gap_extension_target_right;
};
222
_mm_print(VECTOR_SHORT x)223 void _mm_print(VECTOR_SHORT x)
224 {
225 auto * y = (unsigned short*)&x;
226 for (int i=0; i<8; i++) {
227 printf("%s%6d", (i>0?" ":""), y[7-i]);
228
229 }
230 }
231
_mm_print2(VECTOR_SHORT x)232 void _mm_print2(VECTOR_SHORT x)
233 {
234 auto * y = (signed short*)&x;
235 for (int i=0; i<8; i++) {
236 printf("%s%2d", (i>0?" ":""), y[7-i]);
237
238 }
239 }
240
dprofile_dump16(CELL * dprofile)241 void dprofile_dump16(CELL * dprofile)
242 {
243 char * s = sym_nt_4bit;
244 printf("\ndprofile:\n");
245 for(int i=0; i<16; i++)
246 {
247 printf("%c: ",s[i]);
248 for(int k=0; k<CDEPTH; k++)
249 {
250 printf("[");
251 for(int j=0; j<CHANNELS; j++) {
252 printf(" %3d", dprofile[CHANNELS*CDEPTH*i + CHANNELS*k + j]);
253
254 }
255 printf("]");
256 }
257 printf("\n");
258 }
259 }
260
dumpscorematrix(CELL * m)261 void dumpscorematrix(CELL * m)
262 {
263 for(int i=0; i<16; i++)
264 {
265 printf("%2d %c", i, sym_nt_4bit[i]);
266 for(int j=0; j<16; j++) {
267 printf(" %2d", m[16*i+j]);
268
269 }
270 printf("\n");
271 }
272 }
273
/* Fill the query profile (dprofile_word) for the next CDEPTH (4) target
   residues in each of the CHANNELS (8) parallel channels.  For each of
   the 16 possible query symbols, the profile receives the scores of that
   symbol against the 4x8 target residues in dseq.  This is done by
   gathering 8 score matrix rows at a time and transposing them with
   vector merge operations (an 8x8 transpose of 16-bit elements). */
void dprofile_fill16(CELL * dprofile_word,
                     CELL * score_matrix_word,
                     BYTE * dseq)
{
#if 0
  dumpscorematrix(score_matrix_word);

  for (int j=0; j<CDEPTH; j++)
    {
      for(int z=0; z<CHANNELS; z++)
        fprintf(stderr, " [%c]", sym_nt_4bit[dseq[j*CHANNELS+z]]);
      fprintf(stderr, "\n");
    }
#endif

  for (int j=0; j<CDEPTH; j++)
    {
      /* d[z]: offset of the score matrix row for the target residue in
         channel z (each row holds 16 CELL entries, hence << 4). */
      int d[CHANNELS];
      for(int z=0; z<CHANNELS; z++) {
        d[z] = dseq[j*CHANNELS+z] << 4;
      }

      /* handle the 16 query symbols in two batches of 8 */
      for(int i=0; i<16; i += 8)
        {

#ifdef __PPC__
          vector signed short reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
          vector signed int reg8, reg9, reg10,reg11,reg12,reg13,reg14,reg15;
          vector signed long long reg16,reg17,reg18,reg19,reg20,reg21,reg22,reg23;
          vector signed long long reg24,reg25,reg26,reg27,reg28,reg29,reg30,reg31;
#else
          VECTOR_SHORT reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
          VECTOR_SHORT reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15;
          VECTOR_SHORT reg16, reg17, reg18, reg19, reg20, reg21, reg22, reg23;
          VECTOR_SHORT reg24, reg25, reg26, reg27, reg28, reg29, reg30, reg31;
#endif

          /* load 8 consecutive symbols (i..i+7) from the matrix row of
             each channel's target residue */
          reg0 = v_load(score_matrix_word + d[0] + i);
          reg1 = v_load(score_matrix_word + d[1] + i);
          reg2 = v_load(score_matrix_word + d[2] + i);
          reg3 = v_load(score_matrix_word + d[3] + i);
          reg4 = v_load(score_matrix_word + d[4] + i);
          reg5 = v_load(score_matrix_word + d[5] + i);
          reg6 = v_load(score_matrix_word + d[6] + i);
          reg7 = v_load(score_matrix_word + d[7] + i);

          /* transpose the 8x8 block of shorts: interleave at 16-bit,
             then 32-bit, then 64-bit granularity */
#ifdef __PPC__
          reg8  = (vector signed int) vec_mergeh(reg0, reg1);
          reg9  = (vector signed int) vec_mergel(reg0, reg1);
          reg10 = (vector signed int) vec_mergeh(reg2, reg3);
          reg11 = (vector signed int) vec_mergel(reg2, reg3);
          reg12 = (vector signed int) vec_mergeh(reg4, reg5);
          reg13 = (vector signed int) vec_mergel(reg4, reg5);
          reg14 = (vector signed int) vec_mergeh(reg6, reg7);
          reg15 = (vector signed int) vec_mergel(reg6, reg7);

          reg16 = (vector signed long long) vec_mergeh(reg8, reg10);
          reg17 = (vector signed long long) vec_mergel(reg8, reg10);
          reg18 = (vector signed long long) vec_mergeh(reg12, reg14);
          reg19 = (vector signed long long) vec_mergel(reg12, reg14);
          reg20 = (vector signed long long) vec_mergeh(reg9, reg11);
          reg21 = (vector signed long long) vec_mergel(reg9, reg11);
          reg22 = (vector signed long long) vec_mergeh(reg13, reg15);
          reg23 = (vector signed long long) vec_mergel(reg13, reg15);

          reg24 = (vector signed long long) vec_perm
            (reg16, reg18, perm_merge_long_low);
          reg25 = (vector signed long long) vec_perm
            (reg16, reg18, perm_merge_long_high);
          reg26 = (vector signed long long) vec_perm
            (reg17, reg19, perm_merge_long_low);
          reg27 = (vector signed long long) vec_perm
            (reg17, reg19, perm_merge_long_high);
          reg28 = (vector signed long long) vec_perm
            (reg20, reg22, perm_merge_long_low);
          reg29 = (vector signed long long) vec_perm
            (reg20, reg22, perm_merge_long_high);
          reg30 = (vector signed long long) vec_perm
            (reg21, reg23, perm_merge_long_low);
          reg31 = (vector signed long long) vec_perm
            (reg21, reg23, perm_merge_long_high);
#else
          reg8  = v_merge_lo_16(reg0, reg1);
          reg9  = v_merge_hi_16(reg0, reg1);
          reg10 = v_merge_lo_16(reg2, reg3);
          reg11 = v_merge_hi_16(reg2, reg3);
          reg12 = v_merge_lo_16(reg4, reg5);
          reg13 = v_merge_hi_16(reg4, reg5);
          reg14 = v_merge_lo_16(reg6, reg7);
          reg15 = v_merge_hi_16(reg6, reg7);

          reg16 = v_merge_lo_32(reg8, reg10);
          reg17 = v_merge_hi_32(reg8, reg10);
          reg18 = v_merge_lo_32(reg12, reg14);
          reg19 = v_merge_hi_32(reg12, reg14);
          reg20 = v_merge_lo_32(reg9, reg11);
          reg21 = v_merge_hi_32(reg9, reg11);
          reg22 = v_merge_lo_32(reg13, reg15);
          reg23 = v_merge_hi_32(reg13, reg15);

          reg24 = v_merge_lo_64(reg16, reg18);
          reg25 = v_merge_hi_64(reg16, reg18);
          reg26 = v_merge_lo_64(reg17, reg19);
          reg27 = v_merge_hi_64(reg17, reg19);
          reg28 = v_merge_lo_64(reg20, reg22);
          reg29 = v_merge_hi_64(reg20, reg22);
          reg30 = v_merge_lo_64(reg21, reg23);
          reg31 = v_merge_hi_64(reg21, reg23);
#endif

          /* store one transposed vector per query symbol */
          v_store(dprofile_word + CDEPTH*CHANNELS*(i+0) + CHANNELS*j, reg24);
          v_store(dprofile_word + CDEPTH*CHANNELS*(i+1) + CHANNELS*j, reg25);
          v_store(dprofile_word + CDEPTH*CHANNELS*(i+2) + CHANNELS*j, reg26);
          v_store(dprofile_word + CDEPTH*CHANNELS*(i+3) + CHANNELS*j, reg27);
          v_store(dprofile_word + CDEPTH*CHANNELS*(i+4) + CHANNELS*j, reg28);
          v_store(dprofile_word + CDEPTH*CHANNELS*(i+5) + CHANNELS*j, reg29);
          v_store(dprofile_word + CDEPTH*CHANNELS*(i+6) + CHANNELS*j, reg30);
          v_store(dprofile_word + CDEPTH*CHANNELS*(i+7) + CHANNELS*j, reg31);
        }
    }
#if 0
  dprofile_dump16(dprofile_word);
#endif
}
399
400 /*
401 The direction bits are set as follows:
402 in DIR[0..1] if F>H initially (must go up) (4th pri)
403 in DIR[2..3] if E>max(H,F) (must go left) (3rd pri)
404 in DIR[4..5] if new F>H (must extend up) (2nd pri)
405 in DIR[6..7] if new E>H (must extend left) (1st pri)
406 no bits set: go diagonally
407 */
408
409 /*
410 On PPC the fifth parameter is a vector for the result in the lower 64 bits.
411 On x86_64 the fifth parameter is the address to write the result to.
412 */
413
#ifdef __PPC__

/* Handle differences between GNU and IBM compilers */

#ifdef __IBMCPP__
#define VECTORBYTEPERMUTE vec_bperm
#else
#define VECTORBYTEPERMUTE vec_vbpermq
#endif

/* The VSX vec_bperm instruction puts the 16 selected bits of the first
   source into bits 48-63 of the destination. */

const vector unsigned char perm = { 120, 112, 104, 96, 88, 80, 72, 64,
                                    56, 48, 40, 32, 24, 16,  8,  0 };

/* One cell update of the alignment recurrence for 8 channels in parallel.
   H: score with diagonal move applied; N: new H saved for the next row;
   F: vertical gap score; V: address of the profile score vector;
   RES: receives the four direction bitmasks (W,X,Y,Z merged);
   QR_q/R_q and QR_t/R_t: gap open+extend / extend penalties for
   query and target gaps; H_MIN/H_MAX: running min/max of H.
   Note: also reads and writes E, HE and HF from the calling scope. */
#define ALIGNCORE(H, N, F, V, RES, QR_q, R_q, QR_t, R_t, H_MIN, H_MAX) \
  { \
  vector unsigned short W, X, Y, Z; \
  vector unsigned int WX, YZ; \
  vector short VV; \
  VV = v_load(&V); \
  H = v_add(H, VV); \
  W = (vector unsigned short) VECTORBYTEPERMUTE \
    ((vector unsigned char) vec_cmpgt(F, H), perm); \
  H = v_max(H, F); \
  X = (vector unsigned short) VECTORBYTEPERMUTE \
    ((vector unsigned char) vec_cmpgt(E, H), perm); \
  H = v_max(H, E); \
  H_MIN = v_min(H_MIN, H); \
  H_MAX = v_max(H_MAX, H); \
  N = H; \
  HF = v_sub(H, QR_t); \
  F = v_sub(F, R_t); \
  Y = (vector unsigned short) VECTORBYTEPERMUTE \
    ((vector unsigned char) vec_cmpgt(F, HF), perm); \
  F = v_max(F, HF); \
  HE = v_sub(H, QR_q); \
  E = v_sub(E, R_q); \
  Z = (vector unsigned short) VECTORBYTEPERMUTE \
    ((vector unsigned char) vec_cmpgt(E, HE), perm); \
  E = v_max(E, HE); \
  WX = (vector unsigned int) vec_mergel(W, X); \
  YZ = (vector unsigned int) vec_mergel(Y, Z); \
  RES = (vector unsigned long long) vec_mergeh(WX, YZ); \
  }

#else

/* x86_64 & aarch64 */

/* Same recurrence as the PPC version above, but the four direction
   bitmasks are written directly to the four shorts at PATH (computed
   with v_mask_gt).  Also reads and writes E, HE and HF from the
   calling scope. */
#define ALIGNCORE(H, N, F, V, PATH, QR_q, R_q, QR_t, R_t, H_MIN, H_MAX) \
  H = v_add(H, V); \
  *(PATH+0) = v_mask_gt(F, H); \
  H = v_max(H, F); \
  *(PATH+1) = v_mask_gt(E, H); \
  H = v_max(H, E); \
  H_MIN = v_min(H_MIN, H); \
  H_MAX = v_max(H_MAX, H); \
  N = H; \
  HF = v_sub(H, QR_t); \
  F = v_sub(F, R_t); \
  *(PATH+2) = v_mask_gt(F, HF); \
  F = v_max(F, HF); \
  HE = v_sub(H, QR_q); \
  E = v_sub(E, R_q); \
  *(PATH+3) = v_mask_gt(E, HE); \
  E = v_max(E, HE);

#endif
484
/* Compute a 4-column-wide strip of the dynamic programming matrix over
   the whole query, for all 8 channels in parallel.  This "first" variant
   is used when at least one channel has just started a new target
   sequence: such channels (flagged with all-ones in the mask Mm) get
   their saved H and E values re-initialized on the fly using the
   target-left gap penalties.  Direction bits for backtracking are
   written to dir (16 shorts per query position).  The last query row
   uses the query right-end gap penalties.  Running minimum and maximum
   H values are returned via _h_min/_h_max (presumably for overflow
   detection by the caller -- see the note at the top of the file). */
void aligncolumns_first(VECTOR_SHORT * Sm,
                        VECTOR_SHORT * hep,
                        VECTOR_SHORT ** qp,
                        VECTOR_SHORT QR_q_i,
                        VECTOR_SHORT R_q_i,
                        VECTOR_SHORT QR_q_r,
                        VECTOR_SHORT R_q_r,
                        VECTOR_SHORT QR_t_0,
                        VECTOR_SHORT R_t_0,
                        VECTOR_SHORT QR_t_1,
                        VECTOR_SHORT R_t_1,
                        VECTOR_SHORT QR_t_2,
                        VECTOR_SHORT R_t_2,
                        VECTOR_SHORT QR_t_3,
                        VECTOR_SHORT R_t_3,
                        VECTOR_SHORT h0,
                        VECTOR_SHORT h1,
                        VECTOR_SHORT h2,
                        VECTOR_SHORT h3,
                        VECTOR_SHORT f0,
                        VECTOR_SHORT f1,
                        VECTOR_SHORT f2,
                        VECTOR_SHORT f3,
                        VECTOR_SHORT * _h_min,
                        VECTOR_SHORT * _h_max,
                        VECTOR_SHORT Mm,
                        VECTOR_SHORT M_QR_t_left,
                        VECTOR_SHORT M_R_t_left,
                        VECTOR_SHORT M_QR_q_interior,
                        VECTOR_SHORT M_QR_q_right,
                        int64_t ql,
                        unsigned short * dir)
{

  VECTOR_SHORT h4, h5, h6, h7, h8, E, HE, HF;
  VECTOR_SHORT * vp;

  VECTOR_SHORT h_min = v_zero;
  VECTOR_SHORT h_max = v_zero;

#ifdef __PPC__
  vector unsigned long long RES1, RES2, RES;
#endif

  int64_t i;

  /* apply gap open+extend penalty to the incoming F values */
  f0 = v_sub(f0, QR_t_0);
  f1 = v_sub(f1, QR_t_1);
  f2 = v_sub(f2, QR_t_2);
  f3 = v_sub(f3, QR_t_3);


  for(i=0; i < ql - 1; i++)
    {
      vp = qp[i+0];

      h4 = hep[2*i+0];

      E  = hep[2*i+1];

      /*
        Initialize selected h and e values for next/this round.
        First zero those cells where a new sequence starts
        by using an unsigned saturated subtraction of a huge value to
        set it to zero.
        Then use signed subtraction to obtain the correct value.
      */

      h4 = v_sub_unsigned(h4, Mm);
      h4 = v_sub(h4, M_QR_t_left);

      E  = v_sub_unsigned(E, Mm);
      E  = v_sub(E, M_QR_t_left);
      E  = v_sub(E, M_QR_q_interior);

      /* grow the accumulated left-end penalty for the next row */
      M_QR_t_left = v_add(M_QR_t_left, M_R_t_left);

#ifdef __PPC__
      ALIGNCORE(h0, h5, f0, vp[0], RES1,
                QR_q_i, R_q_i, QR_t_0, R_t_0, h_min, h_max);
      ALIGNCORE(h1, h6, f1, vp[1], RES2,
                QR_q_i, R_q_i, QR_t_1, R_t_1, h_min, h_max);
      RES = vec_perm(RES1, RES2, perm_merge_long_low);
      v_store((dir + 16*i + 0), RES);
      ALIGNCORE(h2, h7, f2, vp[2], RES1,
                QR_q_i, R_q_i, QR_t_2, R_t_2, h_min, h_max);
      ALIGNCORE(h3, h8, f3, vp[3], RES2,
                QR_q_i, R_q_i, QR_t_3, R_t_3, h_min, h_max);
      RES = vec_perm(RES1, RES2, perm_merge_long_low);
      v_store((dir + 16*i + 8), RES);
#else
      ALIGNCORE(h0, h5, f0, vp[0], dir+16*i+0,
                QR_q_i, R_q_i, QR_t_0, R_t_0, h_min, h_max);
      ALIGNCORE(h1, h6, f1, vp[1], dir+16*i+4,
                QR_q_i, R_q_i, QR_t_1, R_t_1, h_min, h_max);
      ALIGNCORE(h2, h7, f2, vp[2], dir+16*i+8,
                QR_q_i, R_q_i, QR_t_2, R_t_2, h_min, h_max);
      ALIGNCORE(h3, h8, f3, vp[3], dir+16*i+12,
                QR_q_i, R_q_i, QR_t_3, R_t_3, h_min, h_max);
#endif

      /* save H and E for this query position for the next strip */
      hep[2*i+0] = h8;
      hep[2*i+1] = E;

      h0 = h4;
      h1 = h5;
      h2 = h6;
      h3 = h7;
    }

  /* the final round - using query gap penalties for right end */

  vp = qp[i+0];

  E  = hep[2*i+1];

  E  = v_sub_unsigned(E, Mm);
  E  = v_sub(E, M_QR_t_left);
  E  = v_sub(E, M_QR_q_right);


#ifdef __PPC__
  ALIGNCORE(h0, h5, f0, vp[0], RES1,
            QR_q_r, R_q_r, QR_t_0, R_t_0, h_min, h_max);
  ALIGNCORE(h1, h6, f1, vp[1], RES2,
            QR_q_r, R_q_r, QR_t_1, R_t_1, h_min, h_max);
  RES = vec_perm(RES1, RES2, perm_merge_long_low);
  v_store((dir + 16*i + 0), RES);
  ALIGNCORE(h2, h7, f2, vp[2], RES1,
            QR_q_r, R_q_r, QR_t_2, R_t_2, h_min, h_max);
  ALIGNCORE(h3, h8, f3, vp[3], RES2,
            QR_q_r, R_q_r, QR_t_3, R_t_3, h_min, h_max);
  RES = vec_perm(RES1, RES2, perm_merge_long_low);
  v_store((dir + 16*i + 8), RES);
#else
  ALIGNCORE(h0, h5, f0, vp[0], dir+16*i+ 0,
            QR_q_r, R_q_r, QR_t_0, R_t_0, h_min, h_max);
  ALIGNCORE(h1, h6, f1, vp[1], dir+16*i+ 4,
            QR_q_r, R_q_r, QR_t_1, R_t_1, h_min, h_max);
  ALIGNCORE(h2, h7, f2, vp[2], dir+16*i+ 8,
            QR_q_r, R_q_r, QR_t_2, R_t_2, h_min, h_max);
  ALIGNCORE(h3, h8, f3, vp[3], dir+16*i+12,
            QR_q_r, R_q_r, QR_t_3, R_t_3, h_min, h_max);
#endif


  hep[2*i+0] = h8;
  hep[2*i+1] = E;

  /* scores of the last query row for the 4 target residues */
  Sm[0] = h5;
  Sm[1] = h6;
  Sm[2] = h7;
  Sm[3] = h8;

  *_h_min = h_min;
  *_h_max = h_max;
}
642
/* Compute a 4-column-wide strip of the dynamic programming matrix over
   the whole query, for all 8 channels in parallel.  This "rest" variant
   is used when no channel starts a new target sequence in this strip,
   so no per-channel re-initialization is needed (compare with
   aligncolumns_first).  Direction bits are written to dir; the last
   query row uses the query right-end gap penalties; running min/max H
   values are returned via _h_min/_h_max. */
void aligncolumns_rest(VECTOR_SHORT * Sm,
                       VECTOR_SHORT * hep,
                       VECTOR_SHORT ** qp,
                       VECTOR_SHORT QR_q_i,
                       VECTOR_SHORT R_q_i,
                       VECTOR_SHORT QR_q_r,
                       VECTOR_SHORT R_q_r,
                       VECTOR_SHORT QR_t_0,
                       VECTOR_SHORT R_t_0,
                       VECTOR_SHORT QR_t_1,
                       VECTOR_SHORT R_t_1,
                       VECTOR_SHORT QR_t_2,
                       VECTOR_SHORT R_t_2,
                       VECTOR_SHORT QR_t_3,
                       VECTOR_SHORT R_t_3,
                       VECTOR_SHORT h0,
                       VECTOR_SHORT h1,
                       VECTOR_SHORT h2,
                       VECTOR_SHORT h3,
                       VECTOR_SHORT f0,
                       VECTOR_SHORT f1,
                       VECTOR_SHORT f2,
                       VECTOR_SHORT f3,
                       VECTOR_SHORT * _h_min,
                       VECTOR_SHORT * _h_max,
                       int64_t ql,
                       unsigned short * dir)
{
  VECTOR_SHORT h4, h5, h6, h7, h8, E, HE, HF;
  VECTOR_SHORT * vp;

  VECTOR_SHORT h_min = v_zero;
  VECTOR_SHORT h_max = v_zero;

#ifdef __PPC__
  vector unsigned long long RES1, RES2, RES;
#endif

  int64_t i;

  /* apply gap open+extend penalty to the incoming F values */
  f0 = v_sub(f0, QR_t_0);
  f1 = v_sub(f1, QR_t_1);
  f2 = v_sub(f2, QR_t_2);
  f3 = v_sub(f3, QR_t_3);

  for(i=0; i < ql - 1; i++)
    {
      vp = qp[i+0];

      h4 = hep[2*i+0];

      E  = hep[2*i+1];

#ifdef __PPC__
      ALIGNCORE(h0, h5, f0, vp[0], RES1,
                QR_q_i, R_q_i, QR_t_0, R_t_0, h_min, h_max);
      ALIGNCORE(h1, h6, f1, vp[1], RES2,
                QR_q_i, R_q_i, QR_t_1, R_t_1, h_min, h_max);
      RES = vec_perm(RES1, RES2, perm_merge_long_low);
      v_store((dir + 16*i + 0), RES);
      ALIGNCORE(h2, h7, f2, vp[2], RES1,
                QR_q_i, R_q_i, QR_t_2, R_t_2, h_min, h_max);
      ALIGNCORE(h3, h8, f3, vp[3], RES2,
                QR_q_i, R_q_i, QR_t_3, R_t_3, h_min, h_max);
      RES = vec_perm(RES1, RES2, perm_merge_long_low);
      v_store((dir + 16*i + 8), RES);
#else
      ALIGNCORE(h0, h5, f0, vp[0], dir+16*i+ 0,
                QR_q_i, R_q_i, QR_t_0, R_t_0, h_min, h_max);
      ALIGNCORE(h1, h6, f1, vp[1], dir+16*i+ 4,
                QR_q_i, R_q_i, QR_t_1, R_t_1, h_min, h_max);
      ALIGNCORE(h2, h7, f2, vp[2], dir+16*i+ 8,
                QR_q_i, R_q_i, QR_t_2, R_t_2, h_min, h_max);
      ALIGNCORE(h3, h8, f3, vp[3], dir+16*i+12,
                QR_q_i, R_q_i, QR_t_3, R_t_3, h_min, h_max);
#endif

      /* save H and E for this query position for the next strip */
      hep[2*i+0] = h8;
      hep[2*i+1] = E;

      h0 = h4;
      h1 = h5;
      h2 = h6;
      h3 = h7;
    }

  /* the final round - using query gap penalties for right end */

  vp = qp[i+0];

  E  = hep[2*i+1];

#ifdef __PPC__
  ALIGNCORE(h0, h5, f0, vp[0], RES1,
            QR_q_r, R_q_r, QR_t_0, R_t_0, h_min, h_max);
  ALIGNCORE(h1, h6, f1, vp[1], RES2,
            QR_q_r, R_q_r, QR_t_1, R_t_1, h_min, h_max);
  RES = vec_perm(RES1, RES2, perm_merge_long_low);
  v_store((dir + 16*i + 0), RES);
  ALIGNCORE(h2, h7, f2, vp[2], RES1,
            QR_q_r, R_q_r, QR_t_2, R_t_2, h_min, h_max);
  ALIGNCORE(h3, h8, f3, vp[3], RES2,
            QR_q_r, R_q_r, QR_t_3, R_t_3, h_min, h_max);
  RES = vec_perm(RES1, RES2, perm_merge_long_low);
  v_store((dir + 16*i + 8), RES);
#else
  ALIGNCORE(h0, h5, f0, vp[0], dir+16*i+ 0,
            QR_q_r, R_q_r, QR_t_0, R_t_0, h_min, h_max);
  ALIGNCORE(h1, h6, f1, vp[1], dir+16*i+ 4,
            QR_q_r, R_q_r, QR_t_1, R_t_1, h_min, h_max);
  ALIGNCORE(h2, h7, f2, vp[2], dir+16*i+ 8,
            QR_q_r, R_q_r, QR_t_2, R_t_2, h_min, h_max);
  ALIGNCORE(h3, h8, f3, vp[3], dir+16*i+12,
            QR_q_r, R_q_r, QR_t_3, R_t_3, h_min, h_max);
#endif

  hep[2*i+0] = h8;
  hep[2*i+1] = E;

  /* scores of the last query row for the 4 target residues */
  Sm[0] = h5;
  Sm[1] = h6;
  Sm[2] = h7;
  Sm[3] = h8;

  *_h_min = h_min;
  *_h_max = h_max;
}
770
pushop(s16info_s * s,char newop)771 inline void pushop(s16info_s * s, char newop)
772 {
773 if (newop == s->op) {
774 s->opcount++;
775 } else
776 {
777 *--s->cigarend = s->op;
778 if (s->opcount > 1)
779 {
780 char buf[11];
781 int len = sprintf(buf, "%d", s->opcount);
782 s->cigarend -= len;
783 memcpy(s->cigarend, buf, len);
784 }
785 s->op = newop;
786 s->opcount = 1;
787 }
788 }
789
finishop(s16info_s * s)790 inline void finishop(s16info_s * s)
791 {
792 if (s->op && s->opcount)
793 {
794 *--s->cigarend = s->op;
795 if (s->opcount > 1)
796 {
797 char buf[11];
798 int len = sprintf(buf, "%d", s->opcount);
799 s->cigarend -= len;
800 memcpy(s->cigarend, buf, len);
801 }
802 s->op = 0;
803 s->opcount = 0;
804 }
805 }
806
/* Reconstruct the alignment for one channel from the direction bits
   saved during the DP computation.  Walks from the lower-right corner
   of the matrix back to the upper-left, preferring (in order): extend
   left gap, extend up gap, open left gap, open up gap, else diagonal
   (see the direction bit layout documented before ALIGNCORE).  Builds
   the cigar backwards via pushop/finishop, then moves it to the start
   of s->cigar.  Alignment statistics are returned through the
   paligned/pmatches/pmismatches/pgaps output parameters. */
void backtrack16(s16info_s * s,
                 char * dseq,
                 uint64_t dlen,
                 uint64_t offset,
                 uint64_t channel,
                 unsigned short * paligned,
                 unsigned short * pmatches,
                 unsigned short * pmismatches,
                 unsigned short * pgaps)
{
  unsigned short * dirbuffer = s->dir;
  uint64_t dirbuffersize = s->qlen * s->maxdlen * 4;
  uint64_t qlen = s->qlen;
  char * qseq = s->qseq;

  /* Each 64-bit word of the direction buffer packs 4 flag groups for
     all 8 channels; these masks select this channel's 2-bit fields. */
  uint64_t maskup      = 3ULL << (2*channel+ 0);
  uint64_t maskleft    = 3ULL << (2*channel+16);
  uint64_t maskextup   = 3ULL << (2*channel+32);
  uint64_t maskextleft = 3ULL << (2*channel+48);

#if 0

  printf("Dumping backtracking array\n");

  for(uint64_t i=0; i<qlen; i++)
    {
      for(uint64_t j=0; j<dlen; j++)
        {
          uint64_t d = *((uint64_t *) (dirbuffer +
                                       (offset + 16*s->qlen*(j/4) +
                                        16*i + 4*(j&3)) % dirbuffersize));
          if (d & maskup)
            {
              if (d & maskleft)
                printf("+");
              else
                printf("^");
            }
          else if (d & maskleft)
            {
              printf("<");
            }
          else
            {
              printf("\\");
            }
        }
      printf("\n");
    }

  printf("Dumping gap extension array\n");

  for(uint64_t i=0; i<qlen; i++)
    {
      for(uint64_t j=0; j<dlen; j++)
        {
          uint64_t d = *((uint64_t *) (dirbuffer +
                                       (offset + 16*s->qlen*(j/4) +
                                        16*i + 4*(j&3)) % dirbuffersize));
          if (d & maskextup)
            {
              if (d & maskextleft)
                printf("+");
              else
                printf("^");
            }
          else if (d & maskextleft)
            {
              printf("<");
            }
          else
            {
              printf("\\");
            }
        }
      printf("\n");
    }

#endif

  unsigned short aligned = 0;
  unsigned short matches = 0;
  unsigned short mismatches = 0;
  unsigned short gaps = 0;

  /* start in the lower-right corner of the DP matrix */
  int64_t i = qlen - 1;
  int64_t j = dlen - 1;

  /* cigar is written backwards from the end of its buffer */
  s->cigarend = s->cigar + s->qlen + s->maxdlen + 1;
  s->op = 0;
  s->opcount = 1;

  while ((i>=0) && (j>=0))
    {
      aligned++;

      /* fetch the 64-bit direction word for cell (i, j); the buffer is
         laid out in 4-column strips and used circularly */
      uint64_t d = *((uint64_t *) (dirbuffer +
                                   (offset + 16*s->qlen*(j/4) +
                                    16*i + 4*(j&3)) % dirbuffersize));

      if ((s->op == 'I') && (d & maskextleft))
        {
          /* continue an existing gap in the query (insertion) */
          j--;
          pushop(s, 'I');
        }
      else if ((s->op == 'D') && (d & maskextup))
        {
          /* continue an existing gap in the target (deletion) */
          i--;
          pushop(s, 'D');
        }
      else if (d & maskleft)
        {
          /* open (or restart) a gap in the query */
          if (s->op != 'I') {
            gaps++;
          }
          j--;
          pushop(s, 'I');
        }
      else if (d & maskup)
        {
          /* open (or restart) a gap in the target */
          if (s->op != 'D') {
            gaps++;
          }
          i--;
          pushop(s, 'D');
        }
      else
        {
          /* diagonal move: match if the 4-bit symbol sets intersect */
          if (chrmap_4bit[(int)(qseq[i])] & chrmap_4bit[(int)(dseq[j])]) {
            matches++;
          } else {
            mismatches++;
          }
          i--;
          j--;
          pushop(s, 'M');
        }
    }

  /* consume any remaining query prefix as a terminal gap */
  while(i>=0)
    {
      aligned++;
      if (s->op != 'D') {
        gaps++;
      }
      i--;
      pushop(s, 'D');
    }

  /* consume any remaining target prefix as a terminal gap */
  while(j>=0)
    {
      aligned++;
      if (s->op != 'I') {
        gaps++;
      }
      j--;
      pushop(s, 'I');
    }

  finishop(s);

  /* move cigar to beginning of allocated memory area */
  int cigarlen = s->cigar + s->qlen + s->maxdlen - s->cigarend;
  memmove(s->cigar, s->cigarend, cigarlen + 1);

  * paligned    = aligned;
  * pmatches    = matches;
  * pmismatches = mismatches;
  * pgaps       = gaps;
}
982
/* Allocate and initialize a state object for the 16-bit SIMD aligner.
   NOTE(review): the score_match and score_mismatch parameters are
   currently unused (see the (void) casts); the score matrix is filled
   from the global opt_match/opt_mismatch options instead.  Pairs
   involving an ambiguous nucleotide symbol score zero.  Buffers are
   allocated lazily later (search16_qprep and the search itself). */
struct s16info_s * search16_init(CELL score_match,
                                 CELL score_mismatch,
                                 CELL penalty_gap_open_query_left,
                                 CELL penalty_gap_open_target_left,
                                 CELL penalty_gap_open_query_interior,
                                 CELL penalty_gap_open_target_interior,
                                 CELL penalty_gap_open_query_right,
                                 CELL penalty_gap_open_target_right,
                                 CELL penalty_gap_extension_query_left,
                                 CELL penalty_gap_extension_target_left,
                                 CELL penalty_gap_extension_query_interior,
                                 CELL penalty_gap_extension_target_interior,
                                 CELL penalty_gap_extension_query_right,
                                 CELL penalty_gap_extension_target_right)
{
  (void) score_match;
  (void) score_mismatch;

  /* prepare alloc of qtable, dprofile, hearray, dir */
  auto * s = (struct s16info_s *)
    xmalloc(sizeof(struct s16info_s));

  /* profile: 16 symbols x 4 deep x 8 channels x 2 bytes per cell */
  s->dprofile = (VECTOR_SHORT *) xmalloc(2*4*8*16);
  s->qlen = 0;
  s->qseq = nullptr;
  s->maxdlen = 0;
  s->dir = nullptr;
  s->diralloc = 0;
  s->hearray = nullptr;
  s->qtable = nullptr;
  s->cigar = nullptr;
  s->cigarend = nullptr;
  s->cigaralloc = 0;

  /* fill both the vectorized matrix in s and the global scorematrix */
  for(int i=0; i<16; i++) {
    for(int j=0; j<16; j++)
      {
        CELL value;
        if (ambiguous_4bit[i] || ambiguous_4bit[j]) {
          value = 0;
        } else if (i == j) {
          value = opt_match;
        } else {
          value = opt_mismatch;
        }
        ((CELL*)(&s->matrix))[16*i+j] = value;
        scorematrix[i][j] = value;
      }
  }


  s->penalty_gap_open_query_left = penalty_gap_open_query_left;
  s->penalty_gap_open_query_interior = penalty_gap_open_query_interior;
  s->penalty_gap_open_query_right = penalty_gap_open_query_right;

  s->penalty_gap_open_target_left = penalty_gap_open_target_left;
  s->penalty_gap_open_target_interior = penalty_gap_open_target_interior;
  s->penalty_gap_open_target_right = penalty_gap_open_target_right;

  s->penalty_gap_extension_query_left = penalty_gap_extension_query_left;
  s->penalty_gap_extension_query_interior = penalty_gap_extension_query_interior;
  s->penalty_gap_extension_query_right = penalty_gap_extension_query_right;

  s->penalty_gap_extension_target_left = penalty_gap_extension_target_left;
  s->penalty_gap_extension_target_interior = penalty_gap_extension_target_interior;
  s->penalty_gap_extension_target_right = penalty_gap_extension_target_right;

  return s;
}
1054
search16_exit(s16info_s * s)1055 void search16_exit(s16info_s * s)
1056 {
1057 /* free mem for dprofile, hearray, dir, qtable */
1058 if (s->dir) {
1059 xfree(s->dir);
1060
1061 }
1062 if (s->hearray) {
1063 xfree(s->hearray);
1064
1065 }
1066 if (s->dprofile) {
1067 xfree(s->dprofile);
1068
1069 }
1070 if (s->qtable) {
1071 xfree(s->qtable);
1072
1073 }
1074 if (s->cigar) {
1075 xfree(s->cigar);
1076
1077 }
1078 xfree(s);
1079 }
1080
search16_qprep(s16info_s * s,char * qseq,int qlen)1081 void search16_qprep(s16info_s * s, char * qseq, int qlen)
1082 {
1083 s->qlen = qlen;
1084 s->qseq = qseq;
1085
1086 if (s->hearray) {
1087 xfree(s->hearray);
1088
1089 }
1090 s->hearray = (VECTOR_SHORT *) xmalloc(2 * s->qlen * sizeof(VECTOR_SHORT));
1091 memset(s->hearray, 0, 2 * s->qlen * sizeof(VECTOR_SHORT));
1092
1093 if (s->qtable) {
1094 xfree(s->qtable);
1095
1096 }
1097 s->qtable = (VECTOR_SHORT **) xmalloc(s->qlen * sizeof(VECTOR_SHORT*));
1098
1099 for(int i = 0; i < qlen; i++) {
1100 s->qtable[i] = s->dprofile + 4 * chrmap_4bit[(int)(qseq[i])];
1101
1102 }
1103 }
1104
/*
  Align the query prepared with search16_qprep() against a set of
  database sequences, several sequences in parallel (one per 16-bit
  SIMD lane, CHANNELS lanes, CDEPTH target columns per block).

  s           aligner state (query, penalties, buffers)
  sequences   number of database sequences to align
  seqnos      database sequence numbers, one per candidate
  pscores     out: score per candidate; SHRT_MAX when the 16-bit score
              range overflowed or the sequence was skipped as too long
  paligned    out: number of alignment columns per candidate
  pmatches    out: number of matching columns per candidate
  pmismatches out: number of mismatching columns per candidate
  pgaps       out: gap count per candidate (as computed by backtrack16)
  pcigar      out: newly allocated cigar string per candidate;
              ownership passes to the caller

  NOTE(review): candidates reported with SHRT_MAX get empty statistics
  and an empty cigar - presumably the caller realigns those with a
  wider-score kernel; confirm against the call sites.
*/
void search16(s16info_s * s,
              unsigned int sequences,
              unsigned int * seqnos,
              CELL * pscores,
              unsigned short * paligned,
              unsigned short * pmatches,
              unsigned short * pmismatches,
              unsigned short * pgaps,
              char ** pcigar)
{
  CELL ** q_start = (CELL**) s->qtable;
  CELL * dprofile = (CELL*) s->dprofile;
  CELL * hearray = (CELL*) s->hearray;
  uint64_t qlen = s->qlen;

  /* Special case: empty query.  Every non-empty target aligns as one
     long insertion; the score is the better (max) of aligning that
     insertion with left-end or right-end target gap penalties. */
  if (qlen == 0)
    {
      for (unsigned int cand_id = 0; cand_id < sequences; cand_id++)
        {
          unsigned int seqno = seqnos[cand_id];
          int64_t length = db_getsequencelen(seqno);

          paligned[cand_id] = length;
          pmatches[cand_id] = 0;
          pmismatches[cand_id] = 0;
          pgaps[cand_id] = length;

          if (length == 0) {
            pscores[cand_id] = 0;
          } else {
            pscores[cand_id] =
              MAX(- s->penalty_gap_open_target_left -
                  length * s->penalty_gap_extension_target_left,
                  - s->penalty_gap_open_target_right -
                  length * s->penalty_gap_extension_target_right);
          }

          /* cigar is a single insertion run, e.g. "123I"; empty
             targets get an empty string */
          char * cigar = nullptr;
          if (length > 0)
            {
              int ret = xsprintf(&cigar, "%ldI", length);
              if ((ret < 2) || !cigar) {
                fatal("Unable to allocate enough memory.");
              }
            }
          else
            {
              cigar = (char *) xmalloc(1);
              cigar[0] = 0;
            }
          pcigar[cand_id] = cigar;
        }
      return;
    }

  /* find longest target sequence and reallocate direction buffer */
  uint64_t maxdlen = 0;
  for(int64_t i = 0; i < sequences; i++)
    {
      uint64_t dlen = db_getsequencelen(seqnos[i]);
      /* skip the very long sequences */
      if ((int64_t)(s->qlen) * dlen <= MAXSEQLENPRODUCT)
        {
          if (dlen > maxdlen) {
            maxdlen = dlen;
          }
        }
    }
  /* round the longest length up to a multiple of 4 (CDEPTH columns
     are processed per block) */
  maxdlen = 4 * ((maxdlen + 3) / 4);
  s->maxdlen = maxdlen;
  uint64_t dirbuffersize = s->qlen * s->maxdlen * 4;

  if (dirbuffersize > s->diralloc)
    {
      s->diralloc = dirbuffersize;
      if (s->dir) {
        xfree(s->dir);
      }
      s->dir = (unsigned short*) xmalloc(dirbuffersize *
                                         sizeof(unsigned short));
    }

  unsigned short * dirbuffer = s->dir;

  /* a cigar can be at most one op per query plus target position */
  if (s->qlen + s->maxdlen + 1 > s->cigaralloc)
    {
      s->cigaralloc = s->qlen + s->maxdlen + 1;
      if (s->cigar) {
        xfree(s->cigar);
      }
      s->cigar = (char *) xmalloc(s->cigaralloc);
    }

  VECTOR_SHORT M, T0;

  VECTOR_SHORT M_QR_target_left, M_R_target_left;
  VECTOR_SHORT M_QR_query_interior;
  VECTOR_SHORT M_QR_query_right;

  VECTOR_SHORT R_query_left;
  VECTOR_SHORT QR_query_interior, R_query_interior;
  VECTOR_SHORT QR_query_right, R_query_right;
  VECTOR_SHORT QR_target_left, R_target_left;
  VECTOR_SHORT QR_target_interior, R_target_interior;
  VECTOR_SHORT QR_target_right, R_target_right;
  VECTOR_SHORT QR_target[4], R_target[4];

  VECTOR_SHORT *hep, **qp;

  /* per-channel bookkeeping: current read window into the target
     sequence, its start address/length, the candidate id assigned to
     the channel, and whether its score overflowed */
  BYTE * d_begin[CHANNELS];
  BYTE * d_end[CHANNELS];
  uint64_t d_offset[CHANNELS];
  BYTE * d_address[CHANNELS];
  uint64_t d_length[CHANNELS];
  int64_t seq_id[CHANNELS];
  bool overflow[CHANNELS];

  VECTOR_SHORT dseqalloc[CDEPTH];
  VECTOR_SHORT S[4];

  BYTE * dseq = (BYTE*) & dseqalloc;
  BYTE zero = 0;

  uint64_t next_id = 0;   /* next candidate to assign to a free channel */
  uint64_t done = 0;      /* candidates fully processed so far */

  /* T0 has only lane 0 set; shifting it left walks a mask across
     the channels */
  T0 = v_init(-1, 0, 0, 0, 0, 0, 0, 0);

  /* QR_* = gap open + extension (cost of starting a gap),
     R_*  = extension only (cost of continuing one) */
  R_query_left = v_dup(s->penalty_gap_extension_query_left);

  QR_query_interior = v_dup((s->penalty_gap_open_query_interior +
                             s->penalty_gap_extension_query_interior));
  R_query_interior = v_dup(s->penalty_gap_extension_query_interior);

  QR_query_right = v_dup((s->penalty_gap_open_query_right +
                          s->penalty_gap_extension_query_right));
  R_query_right = v_dup(s->penalty_gap_extension_query_right);

  QR_target_left = v_dup((s->penalty_gap_open_target_left +
                          s->penalty_gap_extension_target_left));
  R_target_left = v_dup(s->penalty_gap_extension_target_left);

  QR_target_interior = v_dup((s->penalty_gap_open_target_interior +
                              s->penalty_gap_extension_target_interior));
  R_target_interior = v_dup(s->penalty_gap_extension_target_interior);

  QR_target_right = v_dup((s->penalty_gap_open_target_right +
                           s->penalty_gap_extension_target_right));
  R_target_right = v_dup(s->penalty_gap_extension_target_right);

  hep = (VECTOR_SHORT*) hearray;
  qp = (VECTOR_SHORT**) q_start;

  /* start with all channels empty */
  for (int c=0; c<CHANNELS; c++)
    {
      d_begin[c] = &zero;
      d_end[c] = d_begin[c];
      d_address[c] = nullptr;
      d_offset[c] = 0;
      d_length[c] = 0;
      seq_id[c] = -1;
      overflow[c] = false;
    }

  /* the largest single-step gap cost determines how close to the
     16-bit limits a score may get before we must flag overflow */
  short gap_penalty_max = 0;

  gap_penalty_max = MAX(gap_penalty_max,
                        s->penalty_gap_open_query_left +
                        s->penalty_gap_extension_query_left);
  gap_penalty_max = MAX(gap_penalty_max,
                        s->penalty_gap_open_query_interior +
                        s->penalty_gap_extension_query_interior);
  gap_penalty_max = MAX(gap_penalty_max,
                        s->penalty_gap_open_query_right +
                        s->penalty_gap_extension_query_right);
  gap_penalty_max = MAX(gap_penalty_max,
                        s->penalty_gap_open_target_left +
                        s->penalty_gap_extension_target_left);
  gap_penalty_max = MAX(gap_penalty_max,
                        s->penalty_gap_open_target_interior +
                        s->penalty_gap_extension_target_interior);
  gap_penalty_max = MAX(gap_penalty_max,
                        s->penalty_gap_open_target_right +
                        s->penalty_gap_extension_target_right);

  short score_min = SHRT_MIN + gap_penalty_max;
  short score_max = SHRT_MAX;

  for(int i=0; i<4; i++)
    {
      S[i] = v_zero;
      dseqalloc[i] = v_zero;
    }

  /* H0..H3 / F0..F3 hold the top-edge scores for the next four
     target columns */
  VECTOR_SHORT H0 = v_zero;
  VECTOR_SHORT H1 = v_zero;
  VECTOR_SHORT H2 = v_zero;
  VECTOR_SHORT H3 = v_zero;

  VECTOR_SHORT F0 = v_zero;
  VECTOR_SHORT F1 = v_zero;
  VECTOR_SHORT F2 = v_zero;
  VECTOR_SHORT F3 = v_zero;

  /* easy != 0 means no channel ran out of sequence in the previous
     block, so the fast path (no channel switching) can be used */
  int easy = 0;

  unsigned short * dir = dirbuffer;

  while(true)
    {
      if (easy)
        {
          /* fill all channels with symbols from the database sequences */

          for(int c=0; c<CHANNELS; c++)
            {
              for(int j=0; j<CDEPTH; j++)
                {
                  if (d_begin[c] < d_end[c]) {
                    dseq[CHANNELS*j+c] = chrmap_4bit[*(d_begin[c]++)];
                  } else {
                    dseq[CHANNELS*j+c] = 0;
                  }
                }
              if (d_begin[c] == d_end[c]) {
                easy = 0;
              }
            }

          dprofile_fill16(dprofile, (CELL*) s->matrix, dseq);

          /* create vectors of gap penalties for target depending on whether
             any of the database sequences ended in these four columns */

          if (easy)
            {
              for(unsigned int j=0; j<CDEPTH; j++)
                {
                  QR_target[j] = QR_target_interior;
                  R_target[j] = R_target_interior;
                }
            }
          else
            {
              /* one or more sequences ended */

              VECTOR_SHORT QR_diff = v_sub(QR_target_right,
                                           QR_target_interior);
              VECTOR_SHORT R_diff = v_sub(R_target_right,
                                          R_target_interior);
              for(unsigned int j=0; j<CDEPTH; j++)
                {
                  /* build a mask (MM) selecting the channels whose
                     sequence has ended at or before column j, so they
                     get right-end penalties instead of interior ones */
                  VECTOR_SHORT MM = v_zero;
                  VECTOR_SHORT TT = T0;
                  for(int c=0; c<CHANNELS; c++)
                    {
                      if ((d_begin[c] == d_end[c]) &&
                          (j >= ((d_length[c]+3) % 4)))
                        {
                          MM = v_xor(MM, TT);
                        }
                      TT = v_shift_left(TT);
                    }
                  QR_target[j] = v_add(QR_target_interior,
                                       v_and(QR_diff, MM));
                  R_target[j] = v_add(R_target_interior,
                                      v_and(R_diff, MM));
                }
            }

          VECTOR_SHORT h_min, h_max;

          aligncolumns_rest(S, hep, qp,
                            QR_query_interior, R_query_interior,
                            QR_query_right, R_query_right,
                            QR_target[0], R_target[0],
                            QR_target[1], R_target[1],
                            QR_target[2], R_target[2],
                            QR_target[3], R_target[3],
                            H0, H1, H2, H3,
                            F0, F1, F2, F3,
                            & h_min, & h_max,
                            qlen, dir);

          /* flag channels whose scores came too close to the 16-bit
             limits; their results are unreliable */
          VECTOR_SHORT h_min_vector;
          VECTOR_SHORT h_max_vector;
          v_store(& h_min_vector, h_min);
          v_store(& h_max_vector, h_max);
          for(int c=0; c<CHANNELS; c++)
            {
              if (! overflow[c])
                {
                  signed short h_min_c = ((signed short *)(& h_min_vector))[c];
                  signed short h_max_c = ((signed short *)(& h_max_vector))[c];
                  if ((h_min_c <= score_min) ||
                      (h_max_c >= score_max)) {
                    overflow[c] = true;
                  }
                }
            }
        }
      else
        {
          /* One or more sequences ended in the previous block.
             We have to switch over to a new sequence */

          easy = 1;

          /* M masks the channels that start a new sequence in this
             block (their scores must be reinitialised) */
          M = v_zero;

          VECTOR_SHORT T = T0;
          for (int c=0; c<CHANNELS; c++)
            {
              if (d_begin[c] < d_end[c])
                {
                  /* this channel has more sequence */

                  for(int j=0; j<CDEPTH; j++)
                    {
                      if (d_begin[c] < d_end[c]) {
                        dseq[CHANNELS*j+c] = chrmap_4bit[*(d_begin[c]++)];
                      } else {
                        dseq[CHANNELS*j+c] = 0;
                      }
                    }
                  if (d_begin[c] == d_end[c]) {
                    easy = 0;
                  }
                }
              else
                {
                  /* sequence in channel c ended. change of sequence */

                  M = v_xor(M, T);

                  int64_t cand_id = seq_id[c];

                  if (cand_id >= 0)
                    {
                      /* save score */

                      char * dbseq = (char*) d_address[c];
                      int64_t dbseqlen = d_length[c];
                      /* z: row (0..3) of the block where the sequence's
                         final column landed */
                      int64_t z = (dbseqlen+3) % 4;
                      int64_t score = ((CELL*)S)[z*CHANNELS+c];

                      if (overflow[c])
                        {
                          /* score overflowed 16 bits: report sentinel
                             values and an empty cigar */
                          pscores[cand_id] = SHRT_MAX;
                          paligned[cand_id] = 0;
                          pmatches[cand_id] = 0;
                          pmismatches[cand_id] = 0;
                          pgaps[cand_id] = 0;
                          pcigar[cand_id] = xstrdup("");
                        }
                      else
                        {
                          pscores[cand_id] = score;
                          backtrack16(s, dbseq, dbseqlen, d_offset[c], c,
                                      paligned + cand_id,
                                      pmatches + cand_id,
                                      pmismatches + cand_id,
                                      pgaps + cand_id);
                          pcigar[cand_id] = (char *) xmalloc(strlen(s->cigar)+1);
                          strcpy(pcigar[cand_id], s->cigar);
                        }

                      done++;
                    }

                  /* get next sequence of reasonable length */

                  int64_t length = 0;

                  while ((length == 0) && (next_id < sequences))
                    {
                      cand_id = next_id++;
                      length = db_getsequencelen(seqnos[cand_id]);
                      if ((length==0) || (s->qlen * length > MAXSEQLENPRODUCT))
                        {
                          /* empty or too-long targets are reported
                             immediately with sentinel values */
                          pscores[cand_id] = SHRT_MAX;
                          paligned[cand_id] = 0;
                          pmatches[cand_id] = 0;
                          pmismatches[cand_id] = 0;
                          pgaps[cand_id] = 0;
                          pcigar[cand_id] = xstrdup("");
                          length = 0;
                          done++;
                        }
                    }

                  if (length > 0)
                    {
                      /* assign the new sequence to this channel and
                         seed the top-edge scores with left-end query
                         gap penalties */
                      seq_id[c] = cand_id;
                      char * address = db_getsequence(seqnos[cand_id]);
                      d_address[c] = (BYTE*) address;
                      d_length[c] = length;
                      d_begin[c] = (unsigned char*) address;
                      d_end[c] = (unsigned char*) address + length;
                      d_offset[c] = dir - dirbuffer;
                      overflow[c] = false;

                      ((CELL*)&H0)[c] = 0;
                      ((CELL*)&H1)[c] = - s->penalty_gap_open_query_left
                        - 1*s->penalty_gap_extension_query_left;
                      ((CELL*)&H2)[c] = - s->penalty_gap_open_query_left
                        - 2*s->penalty_gap_extension_query_left;
                      ((CELL*)&H3)[c] = - s->penalty_gap_open_query_left
                        - 3*s->penalty_gap_extension_query_left;

                      ((CELL*)&F0)[c] = - s->penalty_gap_open_query_left
                        - 1*s->penalty_gap_extension_query_left;
                      ((CELL*)&F1)[c] = - s->penalty_gap_open_query_left
                        - 2*s->penalty_gap_extension_query_left;
                      ((CELL*)&F2)[c] = - s->penalty_gap_open_query_left
                        - 3*s->penalty_gap_extension_query_left;
                      ((CELL*)&F3)[c] = - s->penalty_gap_open_query_left
                        - 4*s->penalty_gap_extension_query_left;

                      /* fill channel */

                      for(int j=0; j<CDEPTH; j++)
                        {
                          if (d_begin[c] < d_end[c]) {
                            dseq[CHANNELS*j+c] = chrmap_4bit[*(d_begin[c]++)];
                          } else {
                            dseq[CHANNELS*j+c] = 0;
                          }
                        }
                      if (d_begin[c] == d_end[c]) {
                        easy = 0;
                      }
                    }
                  else
                    {
                      /* no more sequences, empty channel */

                      seq_id[c] = -1;
                      d_address[c] = nullptr;
                      d_begin[c] = &zero;
                      d_end[c] = d_begin[c];
                      d_length[c] = 0;
                      d_offset[c] = 0;
                      for (int j=0; j<CDEPTH; j++) {
                        dseq[CHANNELS*j+c] = 0;
                      }
                    }
                }
              T = v_shift_left(T);
            }

      if (done == sequences) {
        break;
      }

      /* make masked versions of QR and R for gaps in target */

      M_QR_target_left = v_and(M, QR_target_left);
      M_R_target_left = v_and(M, R_target_left);

      /* make masked versions of QR for gaps in query at target left end */

      M_QR_query_interior = v_and(M, QR_query_interior);
      M_QR_query_right = v_and(M, QR_query_right);

      dprofile_fill16(dprofile, (CELL*) s->matrix, dseq);

      /* create vectors of gap penalties for target depending on whether
         any of the database sequences ended in these four columns */

      if (easy)
        {
          for(unsigned int j=0; j<CDEPTH; j++)
            {
              QR_target[j] = QR_target_interior;
              R_target[j] = R_target_interior;
            }
        }
      else
        {
          /* one or more sequences ended */

          VECTOR_SHORT QR_diff = v_sub(QR_target_right,
                                       QR_target_interior);
          VECTOR_SHORT R_diff = v_sub(R_target_right,
                                      R_target_interior);
          for(unsigned int j=0; j<CDEPTH; j++)
            {
              /* same end-of-sequence masking as in the easy path */
              VECTOR_SHORT MM = v_zero;
              VECTOR_SHORT TT = T0;
              for(int c=0; c<CHANNELS; c++)
                {
                  if ((d_begin[c] == d_end[c]) &&
                      (j >= ((d_length[c]+3) % 4)))
                    {
                      MM = v_xor(MM, TT);
                    }
                  TT = v_shift_left(TT);
                }
              QR_target[j] = v_add(QR_target_interior,
                                   v_and(QR_diff, MM));
              R_target[j] = v_add(R_target_interior,
                                  v_and(R_diff, MM));
            }
        }

      VECTOR_SHORT h_min, h_max;

      /* aligncolumns_first additionally reinitialises the channels
         masked by M for their freshly assigned sequences */
      aligncolumns_first(S, hep, qp,
                         QR_query_interior, R_query_interior,
                         QR_query_right, R_query_right,
                         QR_target[0], R_target[0],
                         QR_target[1], R_target[1],
                         QR_target[2], R_target[2],
                         QR_target[3], R_target[3],
                         H0, H1, H2, H3,
                         F0, F1, F2, F3,
                         & h_min, & h_max,
                         M,
                         M_QR_target_left, M_R_target_left,
                         M_QR_query_interior,
                         M_QR_query_right,
                         qlen, dir);

      /* overflow detection, as in the easy path */
      VECTOR_SHORT h_min_vector;
      VECTOR_SHORT h_max_vector;
      v_store(& h_min_vector, h_min);
      v_store(& h_max_vector, h_max);
      for(int c=0; c<CHANNELS; c++)
        {
          if (! overflow[c])
            {
              signed short h_min_c = ((signed short *)(& h_min_vector))[c];
              signed short h_max_c = ((signed short *)(& h_max_vector))[c];
              if ((h_min_c <= score_min) ||
                  (h_max_c >= score_max)) {
                overflow[c] = true;
              }
            }
        }
    }

      /* extend the top-edge scores down the query-left gap for the
         next block of four target columns */
      H0 = v_sub(H3, R_query_left);
      H1 = v_sub(H0, R_query_left);
      H2 = v_sub(H1, R_query_left);
      H3 = v_sub(H2, R_query_left);

      F0 = v_sub(F3, R_query_left);
      F1 = v_sub(F0, R_query_left);
      F2 = v_sub(F1, R_query_left);
      F3 = v_sub(F2, R_query_left);

      /* advance in the direction buffer; it is used as a ring buffer
         and wraps around for very long targets */
      dir += 4 * 4 * s->qlen;

      if (dir >= dirbuffer + dirbuffersize) {
        dir -= dirbuffersize;
      }
    }
}
1680