1 /*
2 
3   VSEARCH: a versatile open source tool for metagenomics
4 
5   Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6   All rights reserved.
7 
8   Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9   Department of Informatics, University of Oslo,
10   PO Box 1080 Blindern, NO-0316 Oslo, Norway
11 
12   This software is dual-licensed and available under a choice
13   of one of two licenses, either under the terms of the GNU
14   General Public License version 3 or the BSD 2-Clause License.
15 
16 
17   GNU General Public License version 3
18 
19   This program is free software: you can redistribute it and/or modify
20   it under the terms of the GNU General Public License as published by
21   the Free Software Foundation, either version 3 of the License, or
22   (at your option) any later version.
23 
24   This program is distributed in the hope that it will be useful,
25   but WITHOUT ANY WARRANTY; without even the implied warranty of
26   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27   GNU General Public License for more details.
28 
29   You should have received a copy of the GNU General Public License
30   along with this program.  If not, see <http://www.gnu.org/licenses/>.
31 
32 
33   The BSD 2-Clause License
34 
35   Redistribution and use in source and binary forms, with or without
36   modification, are permitted provided that the following conditions
37   are met:
38 
39   1. Redistributions of source code must retain the above copyright
40   notice, this list of conditions and the following disclaimer.
41 
42   2. Redistributions in binary form must reproduce the above copyright
43   notice, this list of conditions and the following disclaimer in the
44   documentation and/or other materials provided with the distribution.
45 
46   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52   BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57   POSSIBILITY OF SUCH DAMAGE.
58 
59 */
60 
61 #include "vsearch.h"
62 
63 /*
64   Using 16-bit signed values, from -32768 to +32767.
65   match: positive
66   mismatch: negative
67   gap penalties: positive (open, extend, query/target, left/interior/right)
68   optimal global alignment (NW)
69   maximize score
70 */
71 
/* Number of alignments computed in parallel: one per 16-bit lane of a
   128-bit vector. */
#define CHANNELS 8
/* Number of target-sequence symbols processed per profile fill / column
   block. */
#define CDEPTH 4

/*
   Due to memory usage, limit the product of the length of the sequences.
   If the product of the query length and any target sequence length
   is above the limit, the alignment will not be computed and a score
   of SHRT_MAX will be returned as the score.
   If an overflow occurs during alignment computation, a score of
   SHRT_MAX will also be returned.

   The limit is set to 5 000 * 5 000 = 25 000 000. This will allocate up to
   200 MB per thread. It will align pairs of sequences less than 5000 nt long
   using the SIMD implementation, larger alignments will be performed with
   the linear memory aligner.
*/

#define MAXSEQLENPRODUCT 25000000

/* Full 16x16 nucleotide score matrix in 64-bit form; presumably the
   source from which the 16-bit SIMD profile is derived -- its filling
   is not visible in this chunk. */
static int64_t scorematrix[16][16];
92 
93 /*
94    The macros below usually operate on 128-bit vectors of 8 signed
95    short 16-bit integers. Additions and subtractions should be
96    saturated.  The shift operation should shift left by 2 bytes (one
97    short int) and shift in zeros. The v_mask_gt operation should
98    compare two vectors of signed shorts and return a 16-bit bitmask
99    with pairs of 2 bits set for each element greater in the first than
100    in the second argument.
101 */
102 
#ifdef __PPC__

typedef vector signed short VECTOR_SHORT;

/* Byte-permute patterns for vec_perm: indexes 0x00-0x0f select bytes of
   the first source vector, 0x10-0x1f bytes of the second.  These merge
   the low (resp. high) 64-bit halves of two vectors. */

const vector unsigned char perm_merge_long_low =
  {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
   0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};

const vector unsigned char perm_merge_long_high =
  {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
   0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};

#define v_init(a,b,c,d,e,f,g,h) (const VECTOR_SHORT){a,b,c,d,e,f,g,h}
#define v_load(a) vec_ld(0, (VECTOR_SHORT *)(a))
#define v_store(a, b) vec_st((vector unsigned char)(b), 0,              \
                             (vector unsigned char *)(a))
/* saturated signed add/sub, as required by the comment above */
#define v_add(a, b) vec_adds((a), (b))
#define v_sub(a, b) vec_subs((a), (b))
#define v_sub_unsigned(a, b) ((VECTOR_SHORT)                            \
                              vec_subs((vector unsigned short) (a),     \
                                       (vector unsigned short) (b)))
#define v_max(a, b) vec_max((a), (b))
#define v_min(a, b) vec_min((a), (b))
/* NOTE(review): the trailing ';' inside this macro means v_dup can only
   be used on PPC where a following empty statement is harmless, unlike
   the aarch64/x86_64 variants -- confirm no expression-context uses. */
#define v_dup(a) vec_splat((VECTOR_SHORT){(short)(a), 0, 0, 0, 0, 0, 0, 0}, 0);
#define v_zero vec_splat_s16(0)
#define v_and(a, b) vec_and((a), (b))
#define v_xor(a, b) vec_xor((a), (b))
#define v_shift_left(a) vec_sld((a), v_zero, 2)

#elif defined __aarch64__

typedef int16x8_t VECTOR_SHORT;

/* Per-lane bit masks (two bits per 16-bit lane) so that v_mask_gt
   produces the same 16-bit layout as x86 _mm_movemask_epi8. */
const uint16x8_t neon_mask =
  {0x0003, 0x000c, 0x0030, 0x00c0, 0x0300, 0x0c00, 0x3000, 0xc000};

#define v_init(a,b,c,d,e,f,g,h) (const VECTOR_SHORT){a,b,c,d,e,f,g,h}
#define v_load(a) vld1q_s16((const int16_t *)(a))
#define v_store(a, b) vst1q_s16((int16_t *)(a), (b))
#define v_merge_lo_16(a, b) vzip1q_s16((a),(b))
#define v_merge_hi_16(a, b) vzip2q_s16((a),(b))
#define v_merge_lo_32(a, b) vreinterpretq_s16_s32(vzip1q_s32(vreinterpretq_s32_s16(a), vreinterpretq_s32_s16(b)))
#define v_merge_hi_32(a, b) vreinterpretq_s16_s32(vzip2q_s32(vreinterpretq_s32_s16(a), vreinterpretq_s32_s16(b)))
#define v_merge_lo_64(a, b) vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s16(a)), vget_low_s64(vreinterpretq_s64_s16(b))))
#define v_merge_hi_64(a, b) vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s16(a)), vget_high_s64(vreinterpretq_s64_s16(b))))
#define v_add(a, b) vqaddq_s16((a), (b))
#define v_sub(a, b) vqsubq_s16((a), (b))
#define v_sub_unsigned(a, b) vreinterpretq_s16_u16(vqsubq_u16(vreinterpretq_u16_s16(a), vreinterpretq_u16_s16(b)))
#define v_max(a, b) vmaxq_s16((a), (b))
#define v_min(a, b) vminq_s16((a), (b))
#define v_dup(a) vdupq_n_s16(a)
#define v_zero v_dup(0)
#define v_and(a, b) vandq_s16((a), (b))
#define v_xor(a, b) veorq_s16((a), (b))
/* shift in one zero element at the low end (byte shift left by 2) */
#define v_shift_left(a) vextq_s16((v_zero), (a), 7)
#define v_mask_gt(a, b) vaddvq_u16(vandq_u16((vcgtq_s16((a), (b))), neon_mask))

#elif __x86_64__

typedef __m128i VECTOR_SHORT;

#define v_init(a,b,c,d,e,f,g,h) _mm_set_epi16(h,g,f,e,d,c,b,a)
#define v_load(a) _mm_load_si128((VECTOR_SHORT *)(a))
#define v_store(a, b) _mm_store_si128((VECTOR_SHORT *)(a), (b))
#define v_merge_lo_16(a, b) _mm_unpacklo_epi16((a),(b))
#define v_merge_hi_16(a, b) _mm_unpackhi_epi16((a),(b))
#define v_merge_lo_32(a, b) _mm_unpacklo_epi32((a),(b))
#define v_merge_hi_32(a, b) _mm_unpackhi_epi32((a),(b))
#define v_merge_lo_64(a, b) _mm_unpacklo_epi64((a),(b))
#define v_merge_hi_64(a, b) _mm_unpackhi_epi64((a),(b))
#define v_add(a, b) _mm_adds_epi16((a), (b))
#define v_sub(a, b) _mm_subs_epi16((a), (b))
#define v_sub_unsigned(a, b) _mm_subs_epu16((a), (b))
#define v_max(a, b) _mm_max_epi16((a), (b))
#define v_min(a, b) _mm_min_epi16((a), (b))
#define v_dup(a) _mm_set1_epi16(a)
#define v_zero v_dup(0)
#define v_and(a, b) _mm_and_si128((a), (b))
#define v_xor(a, b) _mm_xor_si128((a), (b))
#define v_shift_left(a) _mm_slli_si128((a), 2)
/* movemask yields one bit per byte, i.e. two bits per 16-bit lane */
#define v_mask_gt(a, b) _mm_movemask_epi8(_mm_cmpgt_epi16((a), (b)))

#else

#error Unknown Architecture

#endif
190 
/* Per-aligner state for the 16-bit SIMD global (NW) aligner.
   CELL is a score cell type declared elsewhere -- presumably a signed
   16-bit integer given the file header; confirm in vsearch.h. */
struct s16info_s
{
  VECTOR_SHORT matrix[32];   /* score matrix in vector form (filled elsewhere) */
  VECTOR_SHORT * hearray;    /* interleaved H/E values along the query */
  VECTOR_SHORT * dprofile;   /* query profile for the current target chunk */
  VECTOR_SHORT ** qtable;    /* per-query-position pointers into dprofile */
  unsigned short * dir;      /* backtracking direction bits (see backtrack16) */
  char * qseq;               /* query sequence */
  uint64_t diralloc;         /* allocated size of dir buffer */

  char * cigar;              /* cigar buffer; written backwards ... */
  char * cigarend;           /* ... from cigarend by pushop/finishop */
  int64_t cigaralloc;        /* allocated size of cigar buffer */
  int opcount;               /* run length of the current cigar operation */
  char op;                   /* current cigar operation character (0 = none) */

  int qlen;                  /* query length */
  int maxdlen;               /* longest target length in the current chunk */
  /* Gap penalties, all positive (see file header): open and extension,
     separately for query/target and left/interior/right positions. */
  CELL penalty_gap_open_query_left;
  CELL penalty_gap_open_target_left;
  CELL penalty_gap_open_query_interior;
  CELL penalty_gap_open_target_interior;
  CELL penalty_gap_open_query_right;
  CELL penalty_gap_open_target_right;
  CELL penalty_gap_extension_query_left;
  CELL penalty_gap_extension_target_left;
  CELL penalty_gap_extension_query_interior;
  CELL penalty_gap_extension_target_interior;
  CELL penalty_gap_extension_query_right;
  CELL penalty_gap_extension_target_right;
};
222 
_mm_print(VECTOR_SHORT x)223 void _mm_print(VECTOR_SHORT x)
224 {
225   auto * y = (unsigned short*)&x;
226   for (int i=0; i<8; i++) {
227     printf("%s%6d", (i>0?" ":""), y[7-i]);
228 
229         }
230 }
231 
_mm_print2(VECTOR_SHORT x)232 void _mm_print2(VECTOR_SHORT x)
233 {
234   auto * y = (signed short*)&x;
235   for (int i=0; i<8; i++) {
236     printf("%s%2d", (i>0?" ":""), y[7-i]);
237 
238         }
239 }
240 
dprofile_dump16(CELL * dprofile)241 void dprofile_dump16(CELL * dprofile)
242 {
243   char * s = sym_nt_4bit;
244   printf("\ndprofile:\n");
245   for(int i=0; i<16; i++)
246   {
247     printf("%c: ",s[i]);
248     for(int k=0; k<CDEPTH; k++)
249     {
250       printf("[");
251       for(int j=0; j<CHANNELS; j++) {
252         printf(" %3d", dprofile[CHANNELS*CDEPTH*i + CHANNELS*k + j]);
253 
254         }
255       printf("]");
256     }
257     printf("\n");
258   }
259 }
260 
dumpscorematrix(CELL * m)261 void dumpscorematrix(CELL * m)
262 {
263   for(int i=0; i<16; i++)
264     {
265       printf("%2d %c", i, sym_nt_4bit[i]);
266       for(int j=0; j<16; j++) {
267         printf(" %2d", m[16*i+j]);
268 
269         }
270       printf("\n");
271     }
272 }
273 
/*
  Fill the query profile (dprofile_word) for the next CDEPTH=4 target
  symbols in each of the CHANNELS=8 lanes.  For every possible query
  symbol (16) the profile receives, per target depth, one vector of the
  8 channel scores.  Implemented as two 8x8 16-bit transposes of
  score-matrix rows using merge/unpack operations.
*/
void dprofile_fill16(CELL * dprofile_word,
                     CELL * score_matrix_word,
                     BYTE * dseq)
{
#if 0
  /* debug: dump score matrix and incoming target symbols */
  dumpscorematrix(score_matrix_word);

  for (int j=0; j<CDEPTH; j++)
    {
      for(int z=0; z<CHANNELS; z++)
        fprintf(stderr, " [%c]", sym_nt_4bit[dseq[j*CHANNELS+z]]);
      fprintf(stderr, "\n");
    }
#endif

  for (int j=0; j<CDEPTH; j++)
  {
    /* d[z]: offset of the score-matrix row for the symbol in channel z;
       each row holds 16 cells, hence the shift by 4 */
    int d[CHANNELS];
    for(int z=0; z<CHANNELS; z++) {
      d[z] = dseq[j*CHANNELS+z] << 4;

        }

    /* two passes: query symbols 0..7, then 8..15 */
    for(int i=0; i<16; i += 8)
    {

#ifdef __PPC__
      vector signed short     reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
      vector signed int       reg8, reg9, reg10,reg11,reg12,reg13,reg14,reg15;
      vector signed long long reg16,reg17,reg18,reg19,reg20,reg21,reg22,reg23;
      vector signed long long reg24,reg25,reg26,reg27,reg28,reg29,reg30,reg31;
#else
      VECTOR_SHORT reg0,  reg1,  reg2,  reg3,  reg4,  reg5,  reg6,  reg7;
      VECTOR_SHORT reg8,  reg9,  reg10, reg11, reg12, reg13, reg14, reg15;
      VECTOR_SHORT reg16, reg17, reg18, reg19, reg20, reg21, reg22, reg23;
      VECTOR_SHORT reg24, reg25, reg26, reg27, reg28, reg29, reg30, reg31;
#endif

      /* load 8 query-symbol scores for each channel's target symbol */
      reg0 = v_load(score_matrix_word + d[0] + i);
      reg1 = v_load(score_matrix_word + d[1] + i);
      reg2 = v_load(score_matrix_word + d[2] + i);
      reg3 = v_load(score_matrix_word + d[3] + i);
      reg4 = v_load(score_matrix_word + d[4] + i);
      reg5 = v_load(score_matrix_word + d[5] + i);
      reg6 = v_load(score_matrix_word + d[6] + i);
      reg7 = v_load(score_matrix_word + d[7] + i);

      /* 8x8 transpose: interleave 16-bit, then 32-bit, then 64-bit */
#ifdef __PPC__
      reg8  = (vector signed int) vec_mergeh(reg0, reg1);
      reg9  = (vector signed int) vec_mergel(reg0, reg1);
      reg10 = (vector signed int) vec_mergeh(reg2, reg3);
      reg11 = (vector signed int) vec_mergel(reg2, reg3);
      reg12 = (vector signed int) vec_mergeh(reg4, reg5);
      reg13 = (vector signed int) vec_mergel(reg4, reg5);
      reg14 = (vector signed int) vec_mergeh(reg6, reg7);
      reg15 = (vector signed int) vec_mergel(reg6, reg7);

      reg16 = (vector signed long long) vec_mergeh(reg8,  reg10);
      reg17 = (vector signed long long) vec_mergel(reg8,  reg10);
      reg18 = (vector signed long long) vec_mergeh(reg12, reg14);
      reg19 = (vector signed long long) vec_mergel(reg12, reg14);
      reg20 = (vector signed long long) vec_mergeh(reg9,  reg11);
      reg21 = (vector signed long long) vec_mergel(reg9,  reg11);
      reg22 = (vector signed long long) vec_mergeh(reg13, reg15);
      reg23 = (vector signed long long) vec_mergel(reg13, reg15);

      reg24 = (vector signed long long) vec_perm
        (reg16, reg18, perm_merge_long_low);
      reg25 = (vector signed long long) vec_perm
        (reg16, reg18, perm_merge_long_high);
      reg26 = (vector signed long long) vec_perm
        (reg17, reg19, perm_merge_long_low);
      reg27 = (vector signed long long) vec_perm
        (reg17, reg19, perm_merge_long_high);
      reg28 = (vector signed long long) vec_perm
        (reg20, reg22, perm_merge_long_low);
      reg29 = (vector signed long long) vec_perm
        (reg20, reg22, perm_merge_long_high);
      reg30 = (vector signed long long) vec_perm
        (reg21, reg23, perm_merge_long_low);
      reg31 = (vector signed long long) vec_perm
        (reg21, reg23, perm_merge_long_high);
#else
      reg8  = v_merge_lo_16(reg0,  reg1);
      reg9  = v_merge_hi_16(reg0,  reg1);
      reg10 = v_merge_lo_16(reg2,  reg3);
      reg11 = v_merge_hi_16(reg2,  reg3);
      reg12 = v_merge_lo_16(reg4,  reg5);
      reg13 = v_merge_hi_16(reg4,  reg5);
      reg14 = v_merge_lo_16(reg6,  reg7);
      reg15 = v_merge_hi_16(reg6,  reg7);

      reg16 = v_merge_lo_32(reg8,  reg10);
      reg17 = v_merge_hi_32(reg8,  reg10);
      reg18 = v_merge_lo_32(reg12, reg14);
      reg19 = v_merge_hi_32(reg12, reg14);
      reg20 = v_merge_lo_32(reg9,  reg11);
      reg21 = v_merge_hi_32(reg9,  reg11);
      reg22 = v_merge_lo_32(reg13, reg15);
      reg23 = v_merge_hi_32(reg13, reg15);

      reg24 = v_merge_lo_64(reg16, reg18);
      reg25 = v_merge_hi_64(reg16, reg18);
      reg26 = v_merge_lo_64(reg17, reg19);
      reg27 = v_merge_hi_64(reg17, reg19);
      reg28 = v_merge_lo_64(reg20, reg22);
      reg29 = v_merge_hi_64(reg20, reg22);
      reg30 = v_merge_lo_64(reg21, reg23);
      reg31 = v_merge_hi_64(reg21, reg23);
#endif

      /* store one vector of 8 channel scores per query symbol (i+0..i+7)
         at target depth j */
      v_store(dprofile_word + CDEPTH*CHANNELS*(i+0) + CHANNELS*j, reg24);
      v_store(dprofile_word + CDEPTH*CHANNELS*(i+1) + CHANNELS*j, reg25);
      v_store(dprofile_word + CDEPTH*CHANNELS*(i+2) + CHANNELS*j, reg26);
      v_store(dprofile_word + CDEPTH*CHANNELS*(i+3) + CHANNELS*j, reg27);
      v_store(dprofile_word + CDEPTH*CHANNELS*(i+4) + CHANNELS*j, reg28);
      v_store(dprofile_word + CDEPTH*CHANNELS*(i+5) + CHANNELS*j, reg29);
      v_store(dprofile_word + CDEPTH*CHANNELS*(i+6) + CHANNELS*j, reg30);
      v_store(dprofile_word + CDEPTH*CHANNELS*(i+7) + CHANNELS*j, reg31);
    }
  }
#if 0
  dprofile_dump16(dprofile_word);
#endif
}
399 
/*
  The direction bits are set as follows:
  in DIR[0..1] if F>H initially (must go up) (4th pri)
  in DIR[2..3] if E>max(H,F) (must go left) (3rd pri)
  in DIR[4..5] if new F>H (must extend up) (2nd pri)
  in DIR[6..7] if new E>H (must extend left) (1st pri)
  no bits set: go diagonally
*/

/*
  On PPC the fifth parameter is a vector for the result in the lower 64 bits.
  On x86_64 the fifth parameter is the address to write the result to.
*/

/*
  ALIGNCORE performs one cell update of the NW recurrence for all 8
  channels: H += profile score V; H = max(H, F, E) with direction bits
  recorded; then new F and E candidates are derived with the gap open
  (QR_*) and extension (R_*) penalties, and H_MIN/H_MAX accumulate the
  per-lane range for overflow detection.
  NOTE(review): the macro reads/writes E, HE and HF from the enclosing
  scope in addition to its named parameters.
*/

#ifdef __PPC__

/* Handle differences between GNU and IBM compilers */

#ifdef __IBMCPP__
#define VECTORBYTEPERMUTE vec_bperm
#else
#define VECTORBYTEPERMUTE vec_vbpermq
#endif

/* The VSX vec_bperm instruction puts the 16 selected bits of the first
   source into bits 48-63 of the destination. */

const vector unsigned char perm  = { 120, 112, 104,  96,  88,  80,  72,  64,
                                      56,  48,  40,  32,  24,  16,   8,   0 };

#define ALIGNCORE(H, N, F, V, RES, QR_q, R_q, QR_t, R_t, H_MIN, H_MAX)  \
  {                                                                     \
    vector unsigned short W, X, Y, Z;                                   \
    vector unsigned int WX, YZ;                                         \
    vector short VV;                                                    \
    VV = v_load(&V);                                                    \
    H = v_add(H, VV);                                                   \
    W = (vector unsigned short) VECTORBYTEPERMUTE                       \
      ((vector unsigned char) vec_cmpgt(F, H), perm);                   \
    H = v_max(H, F);                                                    \
    X = (vector unsigned short) VECTORBYTEPERMUTE                       \
      ((vector unsigned char) vec_cmpgt(E, H), perm);                   \
    H = v_max(H, E);                                                    \
    H_MIN = v_min(H_MIN, H);                                            \
    H_MAX = v_max(H_MAX, H);                                            \
    N = H;                                                              \
    HF = v_sub(H, QR_t);                                                \
    F = v_sub(F, R_t);                                                  \
    Y = (vector unsigned short) VECTORBYTEPERMUTE                       \
      ((vector unsigned char) vec_cmpgt(F, HF), perm);                  \
    F = v_max(F, HF);                                                   \
    HE = v_sub(H, QR_q);                                                \
    E = v_sub(E, R_q);                                                  \
    Z = (vector unsigned short) VECTORBYTEPERMUTE                       \
      ((vector unsigned char) vec_cmpgt(E, HE), perm);                  \
    E = v_max(E, HE);                                                   \
    WX = (vector unsigned int) vec_mergel(W, X);                        \
    YZ = (vector unsigned int) vec_mergel(Y, Z);                        \
    RES = (vector unsigned long long) vec_mergeh(WX, YZ);               \
  }

#else

/* x86_64 & aarch64 */

#define ALIGNCORE(H, N, F, V, PATH, QR_q, R_q, QR_t, R_t, H_MIN, H_MAX) \
  H = v_add(H, V);                                                      \
  *(PATH+0) = v_mask_gt(F, H);                                          \
  H = v_max(H, F);                                                      \
  *(PATH+1) = v_mask_gt(E, H);                                          \
  H = v_max(H, E);                                                      \
  H_MIN = v_min(H_MIN, H);                                              \
  H_MAX = v_max(H_MAX, H);                                              \
  N = H;                                                                \
  HF = v_sub(H, QR_t);                                                  \
  F = v_sub(F, R_t);                                                    \
  *(PATH+2) = v_mask_gt(F, HF);                                         \
  F = v_max(F, HF);                                                     \
  HE = v_sub(H, QR_q);                                                  \
  E = v_sub(E, R_q);                                                    \
  *(PATH+3) = v_mask_gt(E, HE);                                         \
  E = v_max(E, HE);

#endif
484 
/*
  Compute one block of CDEPTH=4 target columns of the NW matrix over
  the whole query, for the FIRST time a channel's target participates:
  this variant additionally re-initializes H and E in channels where a
  new target sequence has just started (channels selected by mask Mm),
  applying the left-end target gap penalties (M_QR_t_left/M_R_t_left).

  Sm        : out, the four H vectors of the last query row
  hep       : in/out, interleaved H and E vectors, 2 per query position
  qp        : per-query-position pointers into the score profile
  QR_*, R_* : gap open+extend and extend penalties (q=query, t=target;
              i=interior, r=right; _0.._3 = the four target columns)
  h0..h3    : H values entering the top of the four columns
  f0..f3    : F values entering the top of the four columns
  _h_min/_h_max : out, lane-wise min/max of all H values computed,
              used by the caller for overflow detection
  ql        : query length (assumed >= 1 -- the final round below
              unconditionally uses i = ql-1)
  dir       : out, 16 direction shorts per query position
*/
void aligncolumns_first(VECTOR_SHORT * Sm,
                        VECTOR_SHORT * hep,
                        VECTOR_SHORT ** qp,
                        VECTOR_SHORT QR_q_i,
                        VECTOR_SHORT R_q_i,
                        VECTOR_SHORT QR_q_r,
                        VECTOR_SHORT R_q_r,
                        VECTOR_SHORT QR_t_0,
                        VECTOR_SHORT R_t_0,
                        VECTOR_SHORT QR_t_1,
                        VECTOR_SHORT R_t_1,
                        VECTOR_SHORT QR_t_2,
                        VECTOR_SHORT R_t_2,
                        VECTOR_SHORT QR_t_3,
                        VECTOR_SHORT R_t_3,
                        VECTOR_SHORT h0,
                        VECTOR_SHORT h1,
                        VECTOR_SHORT h2,
                        VECTOR_SHORT h3,
                        VECTOR_SHORT f0,
                        VECTOR_SHORT f1,
                        VECTOR_SHORT f2,
                        VECTOR_SHORT f3,
                        VECTOR_SHORT * _h_min,
                        VECTOR_SHORT * _h_max,
                        VECTOR_SHORT Mm,
                        VECTOR_SHORT M_QR_t_left,
                        VECTOR_SHORT M_R_t_left,
                        VECTOR_SHORT M_QR_q_interior,
                        VECTOR_SHORT M_QR_q_right,
                        int64_t ql,
                        unsigned short * dir)
{

  VECTOR_SHORT h4, h5, h6, h7, h8, E, HE, HF;
  VECTOR_SHORT * vp;

  VECTOR_SHORT h_min = v_zero;
  VECTOR_SHORT h_max = v_zero;

#ifdef __PPC__
  vector unsigned long long RES1, RES2, RES;
#endif

  int64_t i;

  /* convert incoming F values to post-gap-open form for the first row */
  f0 = v_sub(f0, QR_t_0);
  f1 = v_sub(f1, QR_t_1);
  f2 = v_sub(f2, QR_t_2);
  f3 = v_sub(f3, QR_t_3);


  for(i=0; i < ql - 1; i++)
    {
      vp = qp[i+0];

      h4 = hep[2*i+0];

      E  = hep[2*i+1];

      /*
         Initialize selected h and e values for next/this round.
         First zero those cells where a new sequence starts
         by using an unsigned saturated subtraction of a huge value to
         set it to zero.
         Then use signed subtraction to obtain the correct value.
      */

      h4 = v_sub_unsigned(h4, Mm);
      h4 = v_sub(h4, M_QR_t_left);

      E  = v_sub_unsigned(E, Mm);
      E  = v_sub(E, M_QR_t_left);
      E  = v_sub(E, M_QR_q_interior);

      /* extend the left-end target gap penalty for the next query row */
      M_QR_t_left = v_add(M_QR_t_left, M_R_t_left);

#ifdef __PPC__
      ALIGNCORE(h0, h5, f0, vp[0], RES1,
                QR_q_i, R_q_i, QR_t_0, R_t_0, h_min, h_max);
      ALIGNCORE(h1, h6, f1, vp[1], RES2,
                QR_q_i, R_q_i, QR_t_1, R_t_1, h_min, h_max);
      RES = vec_perm(RES1, RES2, perm_merge_long_low);
      v_store((dir + 16*i + 0), RES);
      ALIGNCORE(h2, h7, f2, vp[2], RES1,
                QR_q_i, R_q_i, QR_t_2, R_t_2, h_min, h_max);
      ALIGNCORE(h3, h8, f3, vp[3], RES2,
                QR_q_i, R_q_i, QR_t_3, R_t_3, h_min, h_max);
      RES = vec_perm(RES1, RES2, perm_merge_long_low);
      v_store((dir + 16*i + 8), RES);
#else
      ALIGNCORE(h0, h5, f0, vp[0], dir+16*i+0,
                QR_q_i, R_q_i, QR_t_0, R_t_0, h_min, h_max);
      ALIGNCORE(h1, h6, f1, vp[1], dir+16*i+4,
                QR_q_i, R_q_i, QR_t_1, R_t_1, h_min, h_max);
      ALIGNCORE(h2, h7, f2, vp[2], dir+16*i+8,
                QR_q_i, R_q_i, QR_t_2, R_t_2, h_min, h_max);
      ALIGNCORE(h3, h8, f3, vp[3], dir+16*i+12,
                QR_q_i, R_q_i, QR_t_3, R_t_3, h_min, h_max);
#endif

      /* save H and E for this query row, shift H diagonally */
      hep[2*i+0] = h8;
      hep[2*i+1] = E;

      h0 = h4;
      h1 = h5;
      h2 = h6;
      h3 = h7;
    }

  /* the final round - using query gap penalties for right end */

  vp = qp[i+0];

  E  = hep[2*i+1];

  E  = v_sub_unsigned(E, Mm);
  E  = v_sub(E, M_QR_t_left);
  E  = v_sub(E, M_QR_q_right);


#ifdef __PPC__
  ALIGNCORE(h0, h5, f0, vp[0], RES1,
            QR_q_r, R_q_r, QR_t_0, R_t_0, h_min, h_max);
  ALIGNCORE(h1, h6, f1, vp[1], RES2,
            QR_q_r, R_q_r, QR_t_1, R_t_1, h_min, h_max);
  RES = vec_perm(RES1, RES2, perm_merge_long_low);
  v_store((dir + 16*i + 0), RES);
  ALIGNCORE(h2, h7, f2, vp[2], RES1,
            QR_q_r, R_q_r, QR_t_2, R_t_2, h_min, h_max);
  ALIGNCORE(h3, h8, f3, vp[3], RES2,
            QR_q_r, R_q_r, QR_t_3, R_t_3, h_min, h_max);
  RES = vec_perm(RES1, RES2, perm_merge_long_low);
  v_store((dir + 16*i + 8), RES);
#else
  ALIGNCORE(h0, h5, f0, vp[0], dir+16*i+ 0,
            QR_q_r, R_q_r, QR_t_0, R_t_0, h_min, h_max);
  ALIGNCORE(h1, h6, f1, vp[1], dir+16*i+ 4,
            QR_q_r, R_q_r, QR_t_1, R_t_1, h_min, h_max);
  ALIGNCORE(h2, h7, f2, vp[2], dir+16*i+ 8,
            QR_q_r, R_q_r, QR_t_2, R_t_2, h_min, h_max);
  ALIGNCORE(h3, h8, f3, vp[3], dir+16*i+12,
            QR_q_r, R_q_r, QR_t_3, R_t_3, h_min, h_max);
#endif


  hep[2*i+0] = h8;
  hep[2*i+1] = E;

  /* bottom-row H values for the four target columns */
  Sm[0] = h5;
  Sm[1] = h6;
  Sm[2] = h7;
  Sm[3] = h8;

  *_h_min = h_min;
  *_h_max = h_max;
}
642 
/*
  Same as aligncolumns_first, but for subsequent column blocks where no
  channel starts a new target sequence: the Mm-masked H/E
  re-initialization is omitted.  See aligncolumns_first for parameter
  descriptions; ql is assumed >= 1 here as well.
*/
void aligncolumns_rest(VECTOR_SHORT * Sm,
                       VECTOR_SHORT * hep,
                       VECTOR_SHORT ** qp,
                       VECTOR_SHORT QR_q_i,
                       VECTOR_SHORT R_q_i,
                       VECTOR_SHORT QR_q_r,
                       VECTOR_SHORT R_q_r,
                       VECTOR_SHORT QR_t_0,
                       VECTOR_SHORT R_t_0,
                       VECTOR_SHORT QR_t_1,
                       VECTOR_SHORT R_t_1,
                       VECTOR_SHORT QR_t_2,
                       VECTOR_SHORT R_t_2,
                       VECTOR_SHORT QR_t_3,
                       VECTOR_SHORT R_t_3,
                       VECTOR_SHORT h0,
                       VECTOR_SHORT h1,
                       VECTOR_SHORT h2,
                       VECTOR_SHORT h3,
                       VECTOR_SHORT f0,
                       VECTOR_SHORT f1,
                       VECTOR_SHORT f2,
                       VECTOR_SHORT f3,
                       VECTOR_SHORT * _h_min,
                       VECTOR_SHORT * _h_max,
                       int64_t ql,
                       unsigned short * dir)
{
  VECTOR_SHORT h4, h5, h6, h7, h8, E, HE, HF;
  VECTOR_SHORT * vp;

  VECTOR_SHORT h_min = v_zero;
  VECTOR_SHORT h_max = v_zero;

#ifdef __PPC__
  vector unsigned long long RES1, RES2, RES;
#endif

  int64_t i;

  /* convert incoming F values to post-gap-open form for the first row */
  f0 = v_sub(f0, QR_t_0);
  f1 = v_sub(f1, QR_t_1);
  f2 = v_sub(f2, QR_t_2);
  f3 = v_sub(f3, QR_t_3);

  for(i=0; i < ql - 1; i++)
    {
      vp = qp[i+0];

      h4 = hep[2*i+0];

      E  = hep[2*i+1];

#ifdef __PPC__
      ALIGNCORE(h0, h5, f0, vp[0], RES1,
                QR_q_i, R_q_i, QR_t_0, R_t_0, h_min, h_max);
      ALIGNCORE(h1, h6, f1, vp[1], RES2,
                QR_q_i, R_q_i, QR_t_1, R_t_1, h_min, h_max);
      RES = vec_perm(RES1, RES2, perm_merge_long_low);
      v_store((dir + 16*i + 0), RES);
      ALIGNCORE(h2, h7, f2, vp[2], RES1,
                QR_q_i, R_q_i, QR_t_2, R_t_2, h_min, h_max);
      ALIGNCORE(h3, h8, f3, vp[3], RES2,
                QR_q_i, R_q_i, QR_t_3, R_t_3, h_min, h_max);
      RES = vec_perm(RES1, RES2, perm_merge_long_low);
      v_store((dir + 16*i + 8), RES);
#else
      ALIGNCORE(h0, h5, f0, vp[0], dir+16*i+ 0,
                QR_q_i, R_q_i, QR_t_0, R_t_0, h_min, h_max);
      ALIGNCORE(h1, h6, f1, vp[1], dir+16*i+ 4,
                QR_q_i, R_q_i, QR_t_1, R_t_1, h_min, h_max);
      ALIGNCORE(h2, h7, f2, vp[2], dir+16*i+ 8,
                QR_q_i, R_q_i, QR_t_2, R_t_2, h_min, h_max);
      ALIGNCORE(h3, h8, f3, vp[3], dir+16*i+12,
                QR_q_i, R_q_i, QR_t_3, R_t_3, h_min, h_max);
#endif

      /* save H and E for this query row, shift H diagonally */
      hep[2*i+0] = h8;
      hep[2*i+1] = E;

      h0 = h4;
      h1 = h5;
      h2 = h6;
      h3 = h7;
    }

  /* the final round - using query gap penalties for right end */

  vp = qp[i+0];

  E  = hep[2*i+1];

#ifdef __PPC__
  ALIGNCORE(h0, h5, f0, vp[0], RES1,
            QR_q_r, R_q_r, QR_t_0, R_t_0, h_min, h_max);
  ALIGNCORE(h1, h6, f1, vp[1], RES2,
            QR_q_r, R_q_r, QR_t_1, R_t_1, h_min, h_max);
  RES = vec_perm(RES1, RES2, perm_merge_long_low);
  v_store((dir + 16*i + 0), RES);
  ALIGNCORE(h2, h7, f2, vp[2], RES1,
            QR_q_r, R_q_r, QR_t_2, R_t_2, h_min, h_max);
  ALIGNCORE(h3, h8, f3, vp[3], RES2,
            QR_q_r, R_q_r, QR_t_3, R_t_3, h_min, h_max);
  RES = vec_perm(RES1, RES2, perm_merge_long_low);
  v_store((dir + 16*i + 8), RES);
#else
  ALIGNCORE(h0, h5, f0, vp[0], dir+16*i+ 0,
            QR_q_r, R_q_r, QR_t_0, R_t_0, h_min, h_max);
  ALIGNCORE(h1, h6, f1, vp[1], dir+16*i+ 4,
            QR_q_r, R_q_r, QR_t_1, R_t_1, h_min, h_max);
  ALIGNCORE(h2, h7, f2, vp[2], dir+16*i+ 8,
            QR_q_r, R_q_r, QR_t_2, R_t_2, h_min, h_max);
  ALIGNCORE(h3, h8, f3, vp[3], dir+16*i+12,
            QR_q_r, R_q_r, QR_t_3, R_t_3, h_min, h_max);
#endif

  hep[2*i+0] = h8;
  hep[2*i+1] = E;

  /* bottom-row H values for the four target columns */
  Sm[0] = h5;
  Sm[1] = h6;
  Sm[2] = h7;
  Sm[3] = h8;

  *_h_min = h_min;
  *_h_max = h_max;
}
770 
pushop(s16info_s * s,char newop)771 inline void pushop(s16info_s * s, char newop)
772 {
773   if (newop == s->op) {
774     s->opcount++;
775   } else
776     {
777       *--s->cigarend = s->op;
778       if (s->opcount > 1)
779         {
780           char buf[11];
781           int len = sprintf(buf, "%d", s->opcount);
782           s->cigarend -= len;
783           memcpy(s->cigarend, buf, len);
784         }
785       s->op = newop;
786       s->opcount = 1;
787     }
788 }
789 
finishop(s16info_s * s)790 inline void finishop(s16info_s * s)
791 {
792   if (s->op && s->opcount)
793     {
794       *--s->cigarend = s->op;
795       if (s->opcount > 1)
796         {
797           char buf[11];
798           int len = sprintf(buf, "%d", s->opcount);
799           s->cigarend -= len;
800           memcpy(s->cigarend, buf, len);
801         }
802       s->op = 0;
803       s->opcount = 0;
804     }
805 }
806 
/* Trace back one channel of the direction buffer filled during the
   16-bit SIMD alignment and reconstruct the alignment as a CIGAR
   string in s->cigar (via pushop/finishop), while counting aligned
   columns, matches, mismatches and gap openings.

   dseq, dlen   database (target) sequence and its length
   offset       starting offset of this alignment within the circular
                direction buffer (in unsigned shorts)
   channel      SIMD channel (lane) this sequence was aligned in
   paligned     out: number of alignment columns
   pmatches     out: number of matching columns
   pmismatches  out: number of mismatching columns
   pgaps        out: number of gap openings (not gap columns) */
void backtrack16(s16info_s * s,
                 char * dseq,
                 uint64_t dlen,
                 uint64_t offset,
                 uint64_t channel,
                 unsigned short * paligned,
                 unsigned short * pmatches,
                 unsigned short * pmismatches,
                 unsigned short * pgaps)
{
  unsigned short * dirbuffer = s->dir;
  /* the direction buffer is used as a circular buffer of this size */
  uint64_t dirbuffersize = s->qlen * s->maxdlen * 4;
  uint64_t qlen = s->qlen;
  char * qseq = s->qseq;

  /* Each 64-bit word read from the buffer packs four 16-bit groups;
     every channel owns 2 bits per group. The four groups (at bit
     offsets 0, 16, 32, 48) carry the up, left, extend-up and
     extend-left flags for this cell. */
  uint64_t maskup      = 3ULL << (2*channel+ 0);
  uint64_t maskleft    = 3ULL << (2*channel+16);
  uint64_t maskextup   = 3ULL << (2*channel+32);
  uint64_t maskextleft = 3ULL << (2*channel+48);

#if 0

  printf("Dumping backtracking array\n");

  for(uint64_t i=0; i<qlen; i++)
  {
    for(uint64_t j=0; j<dlen; j++)
    {
      uint64_t d = *((uint64_t *) (dirbuffer +
                                   (offset + 16*s->qlen*(j/4) +
                                    16*i + 4*(j&3)) % dirbuffersize));
      if (d & maskup)
      {
        if (d & maskleft)
          printf("+");
        else
          printf("^");
      }
      else if (d & maskleft)
      {
        printf("<");
      }
      else
      {
        printf("\\");
      }
    }
    printf("\n");
  }

  printf("Dumping gap extension array\n");

  for(uint64_t i=0; i<qlen; i++)
  {
    for(uint64_t j=0; j<dlen; j++)
    {
      uint64_t d = *((uint64_t *) (dirbuffer +
                                   (offset + 16*s->qlen*(j/4) +
                                    16*i + 4*(j&3)) % dirbuffersize));
      if (d & maskextup)
      {
        if (d & maskextleft)
          printf("+");
        else
          printf("^");
      }
      else if (d & maskextleft)
      {
        printf("<");
      }
      else
      {
        printf("\\");
      }
    }
    printf("\n");
  }

#endif

  unsigned short aligned = 0;
  unsigned short matches = 0;
  unsigned short mismatches = 0;
  unsigned short gaps = 0;

  /* start in the bottom-right cell of the dynamic programming matrix */
  int64_t i = qlen - 1;
  int64_t j = dlen - 1;

  /* the CIGAR is written backwards from the end of the buffer */
  s->cigarend = s->cigar + s->qlen + s->maxdlen + 1;
  s->op = 0;
  s->opcount = 1;

  while ((i>=0) && (j>=0))
  {
    aligned++;

    /* fetch the packed direction bits for cell (i,j); the index
       mirrors the layout used when the buffer was filled, modulo the
       circular buffer size */
    uint64_t d = *((uint64_t *) (dirbuffer +
                                 (offset + 16*s->qlen*(j/4) +
                                  16*i + 4*(j&3)) % dirbuffersize));

    /* prefer extending an open gap over the other moves */
    if ((s->op == 'I') && (d & maskextleft))
    {
      j--;
      pushop(s, 'I');
    }
    else if ((s->op == 'D') && (d & maskextup))
    {
      i--;
      pushop(s, 'D');
    }
    else if (d & maskleft)
    {
      /* gap in the query: count an opening only when not already in
         an insertion run */
      if (s->op != 'I') {
        gaps++;

        }
      j--;
      pushop(s, 'I');
    }
    else if (d & maskup)
    {
      /* gap in the target: count an opening only when not already in
         a deletion run */
      if (s->op != 'D') {
        gaps++;

        }
      i--;
      pushop(s, 'D');
    }
    else
    {
      /* diagonal move: compare the 4-bit codes; overlapping bits
         (including ambiguous symbols) count as a match */
      if (chrmap_4bit[(int)(qseq[i])] & chrmap_4bit[(int)(dseq[j])]) {
        matches++;
      } else {
        mismatches++;

        }
      i--;
      j--;
      pushop(s, 'M');
    }
  }

  /* consume any remaining query prefix as deletions */
  while(i>=0)
    {
      aligned++;
      if (s->op != 'D') {
        gaps++;

        }
      i--;
      pushop(s, 'D');
    }

  /* consume any remaining target prefix as insertions */
  while(j>=0)
    {
      aligned++;
      if (s->op != 'I') {
        gaps++;

        }
      j--;
      pushop(s, 'I');
    }

  finishop(s);

  /* move cigar to beginning of allocated memory area */
  int cigarlen = s->cigar + s->qlen + s->maxdlen - s->cigarend;
  memmove(s->cigar, s->cigarend, cigarlen + 1);

  * paligned = aligned;
  * pmatches = matches;
  * pmismatches = mismatches;
  * pgaps = gaps;
}
982 
search16_init(CELL score_match,CELL score_mismatch,CELL penalty_gap_open_query_left,CELL penalty_gap_open_target_left,CELL penalty_gap_open_query_interior,CELL penalty_gap_open_target_interior,CELL penalty_gap_open_query_right,CELL penalty_gap_open_target_right,CELL penalty_gap_extension_query_left,CELL penalty_gap_extension_target_left,CELL penalty_gap_extension_query_interior,CELL penalty_gap_extension_target_interior,CELL penalty_gap_extension_query_right,CELL penalty_gap_extension_target_right)983 struct s16info_s * search16_init(CELL score_match,
984                                  CELL score_mismatch,
985                                  CELL penalty_gap_open_query_left,
986                                  CELL penalty_gap_open_target_left,
987                                  CELL penalty_gap_open_query_interior,
988                                  CELL penalty_gap_open_target_interior,
989                                  CELL penalty_gap_open_query_right,
990                                  CELL penalty_gap_open_target_right,
991                                  CELL penalty_gap_extension_query_left,
992                                  CELL penalty_gap_extension_target_left,
993                                  CELL penalty_gap_extension_query_interior,
994                                  CELL penalty_gap_extension_target_interior,
995                                  CELL penalty_gap_extension_query_right,
996                                  CELL penalty_gap_extension_target_right)
997 {
998   (void) score_match;
999   (void) score_mismatch;
1000 
1001   /* prepare alloc of qtable, dprofile, hearray, dir */
1002   auto * s = (struct s16info_s *)
1003     xmalloc(sizeof(struct s16info_s));
1004 
1005   s->dprofile = (VECTOR_SHORT *) xmalloc(2*4*8*16);
1006   s->qlen = 0;
1007   s->qseq = nullptr;
1008   s->maxdlen = 0;
1009   s->dir = nullptr;
1010   s->diralloc = 0;
1011   s->hearray = nullptr;
1012   s->qtable = nullptr;
1013   s->cigar = nullptr;
1014   s->cigarend = nullptr;
1015   s->cigaralloc = 0;
1016 
1017   for(int i=0; i<16; i++) {
1018     for(int j=0; j<16; j++)
1019       {
1020         CELL value;
1021         if (ambiguous_4bit[i] || ambiguous_4bit[j]) {
1022           value = 0;
1023         } else if (i == j) {
1024           value = opt_match;
1025         } else {
1026           value = opt_mismatch;
1027 
1028         }
1029         ((CELL*)(&s->matrix))[16*i+j] = value;
1030         scorematrix[i][j] = value;
1031       }
1032 
1033         }
1034 
1035 
1036   s->penalty_gap_open_query_left = penalty_gap_open_query_left;
1037   s->penalty_gap_open_query_interior = penalty_gap_open_query_interior;
1038   s->penalty_gap_open_query_right = penalty_gap_open_query_right;
1039 
1040   s->penalty_gap_open_target_left = penalty_gap_open_target_left;
1041   s->penalty_gap_open_target_interior = penalty_gap_open_target_interior;
1042   s->penalty_gap_open_target_right = penalty_gap_open_target_right;
1043 
1044   s->penalty_gap_extension_query_left = penalty_gap_extension_query_left;
1045   s->penalty_gap_extension_query_interior = penalty_gap_extension_query_interior;
1046   s->penalty_gap_extension_query_right = penalty_gap_extension_query_right;
1047 
1048   s->penalty_gap_extension_target_left = penalty_gap_extension_target_left;
1049   s->penalty_gap_extension_target_interior = penalty_gap_extension_target_interior;
1050   s->penalty_gap_extension_target_right = penalty_gap_extension_target_right;
1051 
1052   return s;
1053 }
1054 
search16_exit(s16info_s * s)1055 void search16_exit(s16info_s * s)
1056 {
1057   /* free mem for dprofile, hearray, dir, qtable */
1058   if (s->dir) {
1059     xfree(s->dir);
1060 
1061         }
1062   if (s->hearray) {
1063     xfree(s->hearray);
1064 
1065         }
1066   if (s->dprofile) {
1067     xfree(s->dprofile);
1068 
1069         }
1070   if (s->qtable) {
1071     xfree(s->qtable);
1072 
1073         }
1074   if (s->cigar) {
1075     xfree(s->cigar);
1076 
1077         }
1078   xfree(s);
1079 }
1080 
search16_qprep(s16info_s * s,char * qseq,int qlen)1081 void search16_qprep(s16info_s * s, char * qseq, int qlen)
1082 {
1083   s->qlen = qlen;
1084   s->qseq = qseq;
1085 
1086   if (s->hearray) {
1087     xfree(s->hearray);
1088 
1089         }
1090   s->hearray = (VECTOR_SHORT *) xmalloc(2 * s->qlen * sizeof(VECTOR_SHORT));
1091   memset(s->hearray, 0, 2 * s->qlen * sizeof(VECTOR_SHORT));
1092 
1093   if (s->qtable) {
1094     xfree(s->qtable);
1095 
1096         }
1097   s->qtable = (VECTOR_SHORT **) xmalloc(s->qlen * sizeof(VECTOR_SHORT*));
1098 
1099   for(int i = 0; i < qlen; i++) {
1100     s->qtable[i] = s->dprofile + 4 * chrmap_4bit[(int)(qseq[i])];
1101 
1102         }
1103 }
1104 
/* Align the prepared query (see search16_qprep) against a list of
   database sequences using the 16-bit SIMD aligner, processing
   CHANNELS sequences in parallel, CDEPTH target columns per pass.

   s            search state (query, penalties, reusable buffers)
   sequences    number of candidate sequences
   seqnos       database sequence numbers of the candidates
   pscores      out: score per candidate (SHRT_MAX on overflow or when
                the candidate was skipped as too long)
   paligned     out: alignment length per candidate
   pmatches     out: match count per candidate
   pmismatches  out: mismatch count per candidate
   pgaps        out: gap-opening count per candidate
   pcigar       out: newly allocated CIGAR string per candidate;
                ownership is transferred to the caller */
void search16(s16info_s * s,
              unsigned int sequences,
              unsigned int * seqnos,
              CELL * pscores,
              unsigned short * paligned,
              unsigned short * pmatches,
              unsigned short * pmismatches,
              unsigned short * pgaps,
              char ** pcigar)
{
  CELL ** q_start = (CELL**) s->qtable;
  CELL * dprofile = (CELL*) s->dprofile;
  CELL * hearray = (CELL*) s->hearray;
  uint64_t qlen = s->qlen;

  /* Special case: empty query. Every alignment is then pure target
     insertion, so the results can be computed directly without
     running the SIMD engine. */
  if (qlen == 0)
    {
      for (unsigned int cand_id = 0; cand_id < sequences; cand_id++)
        {
          unsigned int seqno = seqnos[cand_id];
          int64_t length = db_getsequencelen(seqno);

          paligned[cand_id] = length;
          pmatches[cand_id] = 0;
          pmismatches[cand_id] = 0;
          pgaps[cand_id] = length;

          if (length == 0) {
            pscores[cand_id] = 0;
          } else {
            /* best of treating the whole target as a left-end or a
               right-end gap */
            pscores[cand_id] =
              MAX(- s->penalty_gap_open_target_left -
                  length * s->penalty_gap_extension_target_left,
                  - s->penalty_gap_open_target_right -
                  length * s->penalty_gap_extension_target_right);

        }

          char * cigar = nullptr;
          if (length > 0)
            {
              int ret = xsprintf(&cigar, "%ldI", length);
              if ((ret < 2) || !cigar) {
                fatal("Unable to allocate enough memory.");

        }
            }
          else
            {
              cigar = (char *) xmalloc(1);
              cigar[0] = 0;
            }
          pcigar[cand_id] = cigar;
        }
      return;
    }

  /* find longest target sequence and reallocate direction buffer */
  uint64_t maxdlen = 0;
  for(int64_t i = 0; i < sequences; i++)
    {
      uint64_t dlen = db_getsequencelen(seqnos[i]);
      /* skip the very long sequences */
      if ((int64_t)(s->qlen) * dlen <= MAXSEQLENPRODUCT)
        {
          if (dlen > maxdlen) {
            maxdlen = dlen;

        }
        }
    }
  /* round up to a multiple of CDEPTH columns */
  maxdlen = 4 * ((maxdlen + 3) / 4);
  s->maxdlen = maxdlen;
  uint64_t dirbuffersize = s->qlen * s->maxdlen * 4;

  if (dirbuffersize > s->diralloc)
    {
      s->diralloc = dirbuffersize;
      if (s->dir) {
        xfree(s->dir);

        }
      s->dir = (unsigned short*) xmalloc(dirbuffersize *
                                         sizeof(unsigned short));
    }

  unsigned short * dirbuffer = s->dir;

  /* the CIGAR of any alignment fits in qlen + maxdlen + 1 bytes */
  if (s->qlen + s->maxdlen + 1 > s->cigaralloc)
    {
      s->cigaralloc = s->qlen + s->maxdlen + 1;
      if (s->cigar) {
        xfree(s->cigar);

        }
      s->cigar = (char *) xmalloc(s->cigaralloc);
    }

  /* M: mask of channels that switched sequence; T0: channel-0 seed */
  VECTOR_SHORT M, T0;

  VECTOR_SHORT M_QR_target_left, M_R_target_left;
  VECTOR_SHORT M_QR_query_interior;
  VECTOR_SHORT M_QR_query_right;

  /* per-region gap penalty vectors: QR = open+extend, R = extend */
  VECTOR_SHORT R_query_left;
  VECTOR_SHORT QR_query_interior, R_query_interior;
  VECTOR_SHORT QR_query_right, R_query_right;
  VECTOR_SHORT QR_target_left, R_target_left;
  VECTOR_SHORT QR_target_interior, R_target_interior;
  VECTOR_SHORT QR_target_right, R_target_right;
  VECTOR_SHORT QR_target[4], R_target[4];

  VECTOR_SHORT *hep, **qp;

  /* per-channel cursors into the current database sequence */
  BYTE * d_begin[CHANNELS];
  BYTE * d_end[CHANNELS];
  uint64_t d_offset[CHANNELS];
  BYTE * d_address[CHANNELS];
  uint64_t d_length[CHANNELS];
  int64_t seq_id[CHANNELS];
  bool overflow[CHANNELS];

  VECTOR_SHORT dseqalloc[CDEPTH];
  VECTOR_SHORT S[4];

  BYTE * dseq = (BYTE*) & dseqalloc;
  BYTE zero = 0;

  uint64_t next_id = 0;
  uint64_t done = 0;

  T0 = v_init(-1, 0, 0, 0, 0, 0, 0, 0);

  R_query_left = v_dup(s->penalty_gap_extension_query_left);

  QR_query_interior = v_dup((s->penalty_gap_open_query_interior +
                             s->penalty_gap_extension_query_interior));
  R_query_interior  = v_dup(s->penalty_gap_extension_query_interior);

  QR_query_right  = v_dup((s->penalty_gap_open_query_right +
                           s->penalty_gap_extension_query_right));
  R_query_right  = v_dup(s->penalty_gap_extension_query_right);

  QR_target_left  = v_dup((s->penalty_gap_open_target_left +
                           s->penalty_gap_extension_target_left));
  R_target_left  = v_dup(s->penalty_gap_extension_target_left);

  QR_target_interior = v_dup((s->penalty_gap_open_target_interior +
                              s->penalty_gap_extension_target_interior));
  R_target_interior = v_dup(s->penalty_gap_extension_target_interior);

  QR_target_right  = v_dup((s->penalty_gap_open_target_right +
                            s->penalty_gap_extension_target_right));
  R_target_right  = v_dup(s->penalty_gap_extension_target_right);

  hep = (VECTOR_SHORT*) hearray;
  qp = (VECTOR_SHORT**) q_start;

  /* start with all channels empty */
  for (int c=0; c<CHANNELS; c++)
    {
      d_begin[c] = &zero;
      d_end[c] = d_begin[c];
      d_address[c] = nullptr;
      d_offset[c] = 0;
      d_length[c] = 0;
      seq_id[c] = -1;
      overflow[c] = false;
    }

  /* the largest possible gap penalty determines how close a score may
     get to the 16-bit limits before we flag an overflow */
  short gap_penalty_max = 0;

  gap_penalty_max = MAX(gap_penalty_max,
                        s->penalty_gap_open_query_left +
                        s->penalty_gap_extension_query_left);
  gap_penalty_max = MAX(gap_penalty_max,
                        s->penalty_gap_open_query_interior +
                        s->penalty_gap_extension_query_interior);
  gap_penalty_max = MAX(gap_penalty_max,
                        s->penalty_gap_open_query_right +
                        s->penalty_gap_extension_query_right);
  gap_penalty_max = MAX(gap_penalty_max,
                        s->penalty_gap_open_target_left +
                        s->penalty_gap_extension_target_left);
  gap_penalty_max = MAX(gap_penalty_max,
                        s->penalty_gap_open_target_interior +
                        s->penalty_gap_extension_target_interior);
  gap_penalty_max = MAX(gap_penalty_max,
                        s->penalty_gap_open_target_right +
                        s->penalty_gap_extension_target_right);

  short score_min = SHRT_MIN + gap_penalty_max;
  short score_max = SHRT_MAX;

  for(int i=0; i<4; i++)
    {
      S[i] = v_zero;
      dseqalloc[i] = v_zero;
    }

  /* H and F vectors for the four columns of the current pass */
  VECTOR_SHORT H0 = v_zero;
  VECTOR_SHORT H1 = v_zero;
  VECTOR_SHORT H2 = v_zero;
  VECTOR_SHORT H3 = v_zero;

  VECTOR_SHORT F0 = v_zero;
  VECTOR_SHORT F1 = v_zero;
  VECTOR_SHORT F2 = v_zero;
  VECTOR_SHORT F3 = v_zero;

  /* easy: no channel needs a sequence change in this pass */
  int easy = 0;

  unsigned short * dir = dirbuffer;

  while(true)
  {
    if (easy)
    {
      /* fill all channels with symbols from the database sequences */

      for(int c=0; c<CHANNELS; c++)
      {
        for(int j=0; j<CDEPTH; j++)
        {
          if (d_begin[c] < d_end[c]) {
            dseq[CHANNELS*j+c] = chrmap_4bit[*(d_begin[c]++)];
          } else {
            dseq[CHANNELS*j+c] = 0;

        }
        }
        if (d_begin[c] == d_end[c]) {
          easy = 0;

        }
      }

      dprofile_fill16(dprofile, (CELL*) s->matrix, dseq);

      /* create vectors of gap penalties for target depending on whether
         any of the database sequences ended in these four columns */

      if (easy)
        {
          for(unsigned int j=0; j<CDEPTH; j++)
            {
              QR_target[j] = QR_target_interior;
              R_target[j]  = R_target_interior;
            }
        }
      else
        {
          /* one or more sequences ended */

          VECTOR_SHORT QR_diff = v_sub(QR_target_right,
                                       QR_target_interior);
          VECTOR_SHORT R_diff  = v_sub(R_target_right,
                                       R_target_interior);
          for(unsigned int j=0; j<CDEPTH; j++)
            {
              /* build a channel mask of sequences that end at or
                 before column j and blend in right-end penalties */
              VECTOR_SHORT MM = v_zero;
              VECTOR_SHORT TT = T0;
              for(int c=0; c<CHANNELS; c++)
                {
                  if ((d_begin[c] == d_end[c]) &&
                      (j >= ((d_length[c]+3) % 4)))
                    {
                      MM = v_xor(MM, TT);
                    }
                  TT = v_shift_left(TT);
                }
              QR_target[j] = v_add(QR_target_interior,
                                   v_and(QR_diff, MM));
              R_target[j]  = v_add(R_target_interior,
                                   v_and(R_diff, MM));
            }
        }

      VECTOR_SHORT h_min, h_max;

      aligncolumns_rest(S, hep, qp,
                        QR_query_interior, R_query_interior,
                        QR_query_right, R_query_right,
                        QR_target[0], R_target[0],
                        QR_target[1], R_target[1],
                        QR_target[2], R_target[2],
                        QR_target[3], R_target[3],
                        H0, H1, H2, H3,
                        F0, F1, F2, F3,
                        & h_min, & h_max,
                        qlen, dir);

      /* flag channels whose scores got close enough to the 16-bit
         limits that further arithmetic could wrap */
      VECTOR_SHORT h_min_vector;
      VECTOR_SHORT h_max_vector;
      v_store(& h_min_vector, h_min);
      v_store(& h_max_vector, h_max);
      for(int c=0; c<CHANNELS; c++)
        {
          if (! overflow[c])
            {
              signed short h_min_c = ((signed short *)(& h_min_vector))[c];
              signed short h_max_c = ((signed short *)(& h_max_vector))[c];
              if ((h_min_c <= score_min) ||
                  (h_max_c >= score_max)) {
                overflow[c] = true;

        }
            }
        }
    }
    else
    {
      /* One or more sequences ended in the previous block.
         We have to switch over to a new sequence           */

      easy = 1;

      M = v_zero;

      VECTOR_SHORT T = T0;
      for (int c=0; c<CHANNELS; c++)
      {
        if (d_begin[c] < d_end[c])
        {
          /* this channel has more sequence */

          for(int j=0; j<CDEPTH; j++)
          {
            if (d_begin[c] < d_end[c]) {
              dseq[CHANNELS*j+c] = chrmap_4bit[*(d_begin[c]++)];
            } else {
              dseq[CHANNELS*j+c] = 0;

        }
          }
          if (d_begin[c] == d_end[c]) {
            easy = 0;

        }
        }
        else
        {
          /* sequence in channel c ended. change of sequence */

          M = v_xor(M, T);

          int64_t cand_id = seq_id[c];

          if (cand_id >= 0)
          {
            /* save score */

            char * dbseq = (char*) d_address[c];
            int64_t dbseqlen = d_length[c];
            int64_t z = (dbseqlen+3) % 4;
            int64_t score = ((CELL*)S)[z*CHANNELS+c];

            if (overflow[c])
              {
                /* scores overflowed 16 bits: report SHRT_MAX so the
                   caller can redo this candidate with wider cells */
                pscores[cand_id] = SHRT_MAX;
                paligned[cand_id] = 0;
                pmatches[cand_id] = 0;
                pmismatches[cand_id] = 0;
                pgaps[cand_id] = 0;
                pcigar[cand_id] = xstrdup("");
              }
            else
              {
                pscores[cand_id] = score;
                backtrack16(s, dbseq, dbseqlen, d_offset[c], c,
                            paligned + cand_id,
                            pmatches + cand_id,
                            pmismatches + cand_id,
                            pgaps + cand_id);
                pcigar[cand_id] = (char *) xmalloc(strlen(s->cigar)+1);
                strcpy(pcigar[cand_id], s->cigar);
              }

            done++;
          }

          /* get next sequence of reasonable length */

          int64_t length = 0;

          while ((length == 0) && (next_id < sequences))
          {
            cand_id = next_id++;
            length = db_getsequencelen(seqnos[cand_id]);
            if ((length==0) || (s->qlen * length > MAXSEQLENPRODUCT))
              {
                /* empty or too-long candidate: mark as overflowed and
                   keep looking for a workable sequence */
                pscores[cand_id] = SHRT_MAX;
                paligned[cand_id] = 0;
                pmatches[cand_id] = 0;
                pmismatches[cand_id] = 0;
                pgaps[cand_id] = 0;
                pcigar[cand_id] = xstrdup("");
                length = 0;
                done++;
              }
          }

          if (length > 0)
            {
              /* load the new sequence into this channel and seed its
                 H/F values with query left-end gap penalties */
              seq_id[c] = cand_id;
              char * address = db_getsequence(seqnos[cand_id]);
              d_address[c] = (BYTE*) address;
              d_length[c] = length;
              d_begin[c] = (unsigned char*) address;
              d_end[c] = (unsigned char*) address + length;
              d_offset[c] = dir - dirbuffer;
              overflow[c] = false;

              ((CELL*)&H0)[c] = 0;
              ((CELL*)&H1)[c] = - s->penalty_gap_open_query_left
                - 1*s->penalty_gap_extension_query_left;
              ((CELL*)&H2)[c] = - s->penalty_gap_open_query_left
                - 2*s->penalty_gap_extension_query_left;
              ((CELL*)&H3)[c] = - s->penalty_gap_open_query_left
                - 3*s->penalty_gap_extension_query_left;

              ((CELL*)&F0)[c] = - s->penalty_gap_open_query_left
                - 1*s->penalty_gap_extension_query_left;
              ((CELL*)&F1)[c] = - s->penalty_gap_open_query_left
                - 2*s->penalty_gap_extension_query_left;
              ((CELL*)&F2)[c] = - s->penalty_gap_open_query_left
                - 3*s->penalty_gap_extension_query_left;
              ((CELL*)&F3)[c] = - s->penalty_gap_open_query_left
                - 4*s->penalty_gap_extension_query_left;

              /* fill channel */

              for(int j=0; j<CDEPTH; j++)
                {
                  if (d_begin[c] < d_end[c]) {
                    dseq[CHANNELS*j+c] = chrmap_4bit[*(d_begin[c]++)];
                  } else {
                    dseq[CHANNELS*j+c] = 0;

        }
                }
              if (d_begin[c] == d_end[c]) {
                easy = 0;

        }
            }
          else
            {
              /* no more sequences, empty channel */

              seq_id[c] = -1;
              d_address[c] = nullptr;
              d_begin[c] = &zero;
              d_end[c] = d_begin[c];
              d_length[c] = 0;
              d_offset[c] = 0;
              for (int j=0; j<CDEPTH; j++) {
                dseq[CHANNELS*j+c] = 0;

        }
            }
        }
        T = v_shift_left(T);
      }

      if (done == sequences) {
        break;

        }

      /* make masked versions of QR and R for gaps in target */

      M_QR_target_left = v_and(M, QR_target_left);
      M_R_target_left = v_and(M, R_target_left);

      /* make masked versions of QR for gaps in query at target left end */

      M_QR_query_interior = v_and(M, QR_query_interior);
      M_QR_query_right = v_and(M, QR_query_right);

      dprofile_fill16(dprofile, (CELL*) s->matrix, dseq);

      /* create vectors of gap penalties for target depending on whether
         any of the database sequences ended in these four columns */

      if (easy)
        {
          for(unsigned int j=0; j<CDEPTH; j++)
            {
              QR_target[j] = QR_target_interior;
              R_target[j]  = R_target_interior;
            }
        }
      else
        {
          /* one or more sequences ended */

          VECTOR_SHORT QR_diff = v_sub(QR_target_right,
                                       QR_target_interior);
          VECTOR_SHORT R_diff  = v_sub(R_target_right,
                                       R_target_interior);
          for(unsigned int j=0; j<CDEPTH; j++)
            {
              VECTOR_SHORT MM = v_zero;
              VECTOR_SHORT TT = T0;
              for(int c=0; c<CHANNELS; c++)
                {
                  if ((d_begin[c] == d_end[c]) &&
                      (j >= ((d_length[c]+3) % 4)))
                    {
                      MM = v_xor(MM, TT);
                    }
                  TT = v_shift_left(TT);
                }
              QR_target[j] = v_add(QR_target_interior,
                                   v_and(QR_diff, MM));
              R_target[j]  = v_add(R_target_interior,
                                   v_and(R_diff, MM));
            }
        }

      VECTOR_SHORT h_min, h_max;

      aligncolumns_first(S, hep, qp,
                         QR_query_interior, R_query_interior,
                         QR_query_right, R_query_right,
                         QR_target[0], R_target[0],
                         QR_target[1], R_target[1],
                         QR_target[2], R_target[2],
                         QR_target[3], R_target[3],
                         H0, H1, H2, H3,
                         F0, F1, F2, F3,
                         & h_min, & h_max,
                         M,
                         M_QR_target_left, M_R_target_left,
                         M_QR_query_interior,
                         M_QR_query_right,
                         qlen, dir);

      /* same overflow detection as in the easy path */
      VECTOR_SHORT h_min_vector;
      VECTOR_SHORT h_max_vector;
      v_store(& h_min_vector, h_min);
      v_store(& h_max_vector, h_max);
      for(int c=0; c<CHANNELS; c++)
        {
          if (! overflow[c])
            {
              signed short h_min_c = ((signed short *)(& h_min_vector))[c];
              signed short h_max_c = ((signed short *)(& h_max_vector))[c];
              if ((h_min_c <= score_min) ||
                  (h_max_c >= score_max)) {
                overflow[c] = true;

        }
            }
        }
    }

    /* advance H and F into the next four target columns by applying
       query left-end gap extensions */
    H0 = v_sub(H3, R_query_left);
    H1 = v_sub(H0, R_query_left);
    H2 = v_sub(H1, R_query_left);
    H3 = v_sub(H2, R_query_left);

    F0 = v_sub(F3, R_query_left);
    F1 = v_sub(F0, R_query_left);
    F2 = v_sub(F1, R_query_left);
    F3 = v_sub(F2, R_query_left);

    /* advance and wrap the circular direction buffer cursor */
    dir += 4 * 4 * s->qlen;

    if (dir >= dirbuffer + dirbuffersize) {
      dir -= dirbuffersize;

        }
  }
}
1680