/*

  VSEARCH: a versatile open source tool for metagenomics

  Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
  All rights reserved.

  Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
  Department of Informatics, University of Oslo,
  PO Box 1080 Blindern, NO-0316 Oslo, Norway

  This software is dual-licensed and available under a choice
  of one of two licenses, either under the terms of the GNU
  General Public License version 3 or the BSD 2-Clause License.


  GNU General Public License version 3

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.


  The BSD 2-Clause License

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  POSSIBILITY OF SUCH DAMAGE.

*/

#include "vsearch.h"
/* This file contains code that depends on special CPU features. */
/* It may be compiled several times with different CPU options. */
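
/* Illustration only, not part of VSEARCH: on x86_64 the two variants
   below could be selected at run time through a function pointer,
   based on a CPU feature flag filled in by feature detection
   elsewhere. This is a minimal sketch; the flag ssse3_present and
   the selector function are assumptions, not the actual dispatch
   mechanism. */
#if 0
extern int ssse3_present;  /* assumed flag set during CPU detection */

static void (*increment_counters_from_bitmap_ptr)(count_t *,
                                                  unsigned char *,
                                                  unsigned int);

static void select_bitmap_counter_variant()
{
  /* prefer the PSHUFB-based variant when the CPU supports SSSE3 */
  if (ssse3_present)
    increment_counters_from_bitmap_ptr = increment_counters_from_bitmap_ssse3;
  else
    increment_counters_from_bitmap_ptr = increment_counters_from_bitmap_sse2;
}
#endif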

#ifdef __aarch64__

void increment_counters_from_bitmap(count_t * counters,
                                    unsigned char * bitmap,
                                    unsigned int totalbits)
{
  const uint8x16_t c1 =
    { 0x01, 0x01, 0x02, 0x02, 0x04, 0x04, 0x08, 0x08,
      0x10, 0x10, 0x20, 0x20, 0x40, 0x40, 0x80, 0x80 };

  unsigned short * p = (unsigned short *)(bitmap);
  int16x8_t * q = (int16x8_t *)(counters);
  int r = (totalbits + 15) / 16;

  for(int j=0; j<r; j++)
    {
      uint16x8_t r0;
      uint8x16_t r1, r2, r3, r4;
      int16x8_t r5, r6;

      // load and duplicate short
      r0 = vdupq_n_u16(*p);
      p++;

      // cast to bytes
      r1 = vreinterpretq_u8_u16(r0);

      // bit test with mask giving 0x00 or 0xff
      r2 = vtstq_u8(r1, c1);

      // transpose to duplicate even bytes
      r3 = vtrn1q_u8(r2, r2);

      // transpose to duplicate odd bytes
      r4 = vtrn2q_u8(r2, r2);

      // cast to signed 0x0000 or 0xffff
      r5 = vreinterpretq_s16_u8(r3);

      // cast to signed 0x0000 or 0xffff
      r6 = vreinterpretq_s16_u8(r4);

      // subtract signed 0 or -1 (i.e. add 0 or 1) with saturation to counter
      *q = vqsubq_s16(*q, r5);
      q++;

      // subtract signed 0 or -1 (i.e. add 0 or 1) with saturation to counter
      *q = vqsubq_s16(*q, r6);
      q++;
    }
}
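
/* For reference: a minimal scalar sketch (not compiled) of what the
   vectorized loop above computes. For each set bit i of the bitmap,
   least significant bit first within each byte, counters[i] is
   incremented by one, saturating at INT16_MAX, assuming count_t is a
   signed 16-bit integer type as the saturating 16-bit subtract above
   suggests. The SIMD versions handle 16 counters per iteration. */
#if 0
static void increment_counters_from_bitmap_scalar(count_t * counters,
                                                  unsigned char * bitmap,
                                                  unsigned int totalbits)
{
  for(unsigned int i = 0; i < totalbits; i++)
    if (bitmap[i / 8] & (1 << (i % 8)))
      if (counters[i] < INT16_MAX)
        counters[i]++;
}
#endif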

#elif defined __PPC__

void increment_counters_from_bitmap(count_t * counters,
                                    unsigned char * bitmap,
                                    unsigned int totalbits)
{
  const vector unsigned char c1 =
    { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
  const vector unsigned char c2 =
    { 0xfe, 0xfd, 0xfb, 0xf7, 0xef, 0xdf, 0xbf, 0x7f,
      0xfe, 0xfd, 0xfb, 0xf7, 0xef, 0xdf, 0xbf, 0x7f };
  const vector unsigned char c3 =
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };

  unsigned short * p = (unsigned short *)(bitmap);
  vector signed short * q = (vector signed short *) (counters);
  int r = (totalbits + 15) / 16;

  for(int j=0; j<r; j++)
    {
      vector unsigned char r0, r1, r2;
      vector __bool char r3;
      vector signed short r4, r5;

      // load 16 bytes from the bitmap; only the first two are used
      r0 = * (vector unsigned char *) p;
      p++;

      // replicate one bitmap byte across each half of the vector
      r1 = vec_perm(r0, r0, c1);

      // set all bits except the one tested in each byte
      r2 = vec_or(r1, c2);

      // compare with all ones, giving 0xff where the tested bit was set
      r3 = vec_cmpeq(r2, c3);

      // sign extend bytes to halfwords giving 0x0000 or 0xffff
      r4 = (vector signed short) vec_unpackl(r3);
      r5 = (vector signed short) vec_unpackh(r3);

      // subtract signed 0 or -1 (i.e. add 0 or 1) with saturation to counter
      *q = vec_subs(*q, r4);
      q++;
      *q = vec_subs(*q, r5);
      q++;
    }
}

#elif defined __x86_64__

#ifdef SSSE3
void increment_counters_from_bitmap_ssse3(count_t * counters,
                                          unsigned char * bitmap,
                                          unsigned int totalbits)
#else
void increment_counters_from_bitmap_sse2(count_t * counters,
                                         unsigned char * bitmap,
                                         unsigned int totalbits)
#endif
{
  /*
    Increment selected elements in an array of 16-bit counters.
    The counters to increment are indicated by 1's in the bitmap.

    We read 16 bytes from the bitmap, but use only two bytes (16 bits).
    Convert these 16 bits into 16 bytes of either 0x00 or 0xFF.
    Extend these to 16 words (32 bytes) of either 0x0000 or 0xFFFF.
    Use these values to increment 16 words in an array by subtraction.

    See the article below for some hints:
    http://stackoverflow.com/questions/21622212/
    how-to-perform-the-inverse-of-mm256-movemask-epi8-vpmovmskb

    Because the efficient PSHUFB instruction is an SSSE3 instruction
    missing from many AMD CPUs, we provide a slightly slower SSE2
    alternative.
  */

#ifdef SSSE3
  const __m128i c1 =
    _mm_set_epi32(0x01010101, 0x01010101, 0x00000000, 0x00000000);
#endif

  const __m128i c2 =
    _mm_set_epi32(0x7fbfdfef, 0xf7fbfdfe, 0x7fbfdfef, 0xf7fbfdfe);

  const __m128i c3 =
    _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);

  auto * p = (unsigned short *)(bitmap);
  auto * q = (__m128i *)(counters);
  int r = (totalbits + 15) / 16;

  for(int j=0; j<r; j++)
    {
      __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;

      // load 16 bytes from the bitmap; only the first two are used
      xmm0 = _mm_loadu_si128((__m128i*)p++);

#ifdef SSSE3
      // replicate the low bitmap byte into bytes 0-7 and the high
      // byte into bytes 8-15 with a single shuffle
      xmm1 = _mm_shuffle_epi8(xmm0, c1);
#else
      // achieve the same replication with three unpack steps
      __m128i xmm6, xmm7;
      xmm6 = _mm_unpacklo_epi8(xmm0, xmm0);
      xmm7 = _mm_unpacklo_epi16(xmm6, xmm6);
      xmm1 = _mm_unpacklo_epi32(xmm7, xmm7);
#endif

      // set all bits except the one tested in each byte
      xmm2 = _mm_or_si128(xmm1, c2);

      // compare with all ones, giving 0xff where the tested bit was set
      xmm3 = _mm_cmpeq_epi8(xmm2, c3);

      // extend bytes to words giving 0x0000 or 0xffff
      xmm4 = _mm_unpacklo_epi8(xmm3, xmm3);
      xmm5 = _mm_unpackhi_epi8(xmm3, xmm3);

      // subtract signed 0 or -1 (i.e. add 0 or 1) with saturation to counter
      *q = _mm_subs_epi16(*q, xmm4);
      q++;
      *q = _mm_subs_epi16(*q, xmm5);
      q++;
    }
}
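
/* A hypothetical usage sketch (not compiled). Two implicit
   requirements are visible in the code above: the counters array is
   written through an aligned __m128i pointer, so it should be
   16-byte aligned, and each 16-byte bitmap load advances p by only
   two bytes, so the bitmap buffer should extend at least 14 readable
   bytes beyond the bits in use. */
#if 0
void example()
{
  alignas(16) count_t counters[32] = { 0 };
  unsigned char bitmap[4 + 14] = { 0 };

  bitmap[0] = 0x05;   /* bits 0 and 2 -> counters[0] and counters[2] */
  bitmap[2] = 0x01;   /* bit 16 -> counters[16] */

  increment_counters_from_bitmap_sse2(counters, bitmap, 32);

  /* counters[0] == 1, counters[2] == 1, counters[16] == 1 */
}
#endif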

#else

#error Unknown architecture

#endif