1 /*
2
3 VSEARCH: a versatile open source tool for metagenomics
4
5 Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6 All rights reserved.
7
8 Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9 Department of Informatics, University of Oslo,
10 PO Box 1080 Blindern, NO-0316 Oslo, Norway
11
12 This software is dual-licensed and available under a choice
13 of one of two licenses, either under the terms of the GNU
14 General Public License version 3 or the BSD 2-Clause License.
15
16
17 GNU General Public License version 3
18
19 This program is free software: you can redistribute it and/or modify
20 it under the terms of the GNU General Public License as published by
21 the Free Software Foundation, either version 3 of the License, or
22 (at your option) any later version.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License for more details.
28
29 You should have received a copy of the GNU General Public License
30 along with this program. If not, see <http://www.gnu.org/licenses/>.
31
32
33 The BSD 2-Clause License
34
35 Redistribution and use in source and binary forms, with or without
36 modification, are permitted provided that the following conditions
37 are met:
38
39 1. Redistributions of source code must retain the above copyright
40 notice, this list of conditions and the following disclaimer.
41
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
45
46 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50 COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57 POSSIBILITY OF SUCH DAMAGE.
58
59 */
60
61 #include "vsearch.h"
62
63 /* This file contains code dependent on special cpu features. */
64 /* The file may be compiled several times with different cpu options. */
65
66 #ifdef __aarch64__
67
void increment_counters_from_bitmap(count_t * counters,
                                    unsigned char * bitmap,
                                    unsigned int totalbits)
{
  /*
    Increment selected elements in an array of 16-bit counters.
    Each set bit in the bitmap selects one counter to increment.
    Bits are consumed 16 at a time (one unsigned short per iteration),
    updating 16 counters (two 128-bit vectors) per round.
  */

  /* per-lane test masks: lane pair 2k/2k+1 tests bit k of each byte */
  const uint8x16_t bitselect =
    { 0x01, 0x01, 0x02, 0x02, 0x04, 0x04, 0x08, 0x08,
      0x10, 0x10, 0x20, 0x20, 0x40, 0x40, 0x80, 0x80 };

  unsigned short * bits = (unsigned short *)(bitmap);
  int16x8_t * counts = (int16x8_t *)(counters);
  int iterations = (totalbits + 15) / 16;

  while (iterations-- > 0)
    {
      /* broadcast the next 16 bitmap bits into every 16-bit lane */
      uint16x8_t word = vdupq_n_u16(*bits);
      bits++;

      /* per byte lane: 0xff if the tested bit is set, else 0x00 */
      uint8x16_t flags = vtstq_u8(vreinterpretq_u8_u16(word), bitselect);

      /* widen byte flags to 16-bit 0x0000 / 0xffff values by
         duplicating even bytes (low half) and odd bytes (high half) */
      int16x8_t lo = vreinterpretq_s16_u8(vtrn1q_u8(flags, flags));
      int16x8_t hi = vreinterpretq_s16_u8(vtrn2q_u8(flags, flags));

      /* saturating subtraction of 0 or -1 == saturating add of 0 or 1 */
      *counts = vqsubq_s16(*counts, lo);
      counts++;
      *counts = vqsubq_s16(*counts, hi);
      counts++;
    }
}
117
118 #elif defined __PPC__
119
void increment_counters_from_bitmap(count_t * counters,
                                    unsigned char * bitmap,
                                    unsigned int totalbits)
{
  /*
    Increment selected elements in an array of 16-bit counters.
    Each set bit in the bitmap selects one counter to increment.
    Bits are consumed 16 at a time (one unsigned short per iteration),
    updating 16 counters (two 128-bit vectors) per round.
  */

  /* permute pattern: replicate byte 1 into the first 8 lanes
     and byte 0 into the last 8 lanes */
  const vector unsigned char perm_dup =
    { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };

  /* per-lane complement masks: OR sets every bit except the one tested */
  const vector unsigned char bit_mask =
    { 0xfe, 0xfd, 0xfb, 0xf7, 0xef, 0xdf, 0xbf, 0x7f,
      0xfe, 0xfd, 0xfb, 0xf7, 0xef, 0xdf, 0xbf, 0x7f };

  /* all bits set, for the equality comparison below */
  const vector unsigned char all_ones =
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };

  unsigned short * bits = (unsigned short *)(bitmap);
  vector signed short * counts = (vector signed short *) (counters);
  int iterations = (totalbits + 15) / 16;

  while (iterations-- > 0)
    {
      vector unsigned char raw = * (vector unsigned char *) bits;
      bits++;

      /* copy the two bitmap bytes across all 16 byte lanes */
      vector unsigned char dup = vec_perm(raw, raw, perm_dup);

      /* a byte lane becomes 0xff iff its tested bit was set */
      vector __bool char isset = vec_cmpeq(vec_or(dup, bit_mask), all_ones);

      /* sign-extend byte flags to 16-bit 0x0000 / 0xffff values;
         unpackl must be applied to the first counter vector */
      vector signed short lo = (vector signed short) vec_unpackl(isset);
      vector signed short hi = (vector signed short) vec_unpackh(isset);

      /* saturating subtraction of 0 or -1 == saturating add of 0 or 1 */
      *counts = vec_subs(*counts, lo);
      counts++;
      *counts = vec_subs(*counts, hi);
      counts++;
    }
}
156
157 #elif __x86_64__
158
159 #ifdef SSSE3
increment_counters_from_bitmap_ssse3(count_t * counters,unsigned char * bitmap,unsigned int totalbits)160 void increment_counters_from_bitmap_ssse3(count_t * counters,
161 unsigned char * bitmap,
162 unsigned int totalbits)
163 #else
164 void increment_counters_from_bitmap_sse2(count_t * counters,
165 unsigned char * bitmap,
166 unsigned int totalbits)
167 #endif
168 {
169 /*
170 Increment selected elements in an array of 16 bit counters.
171 The counters to increment are indicated by 1's in the bitmap.
172
173 We read 16 bytes from the bitmap, but use only two bytes (16 bits).
174 Convert these 16 bits into 16 bytes with either 0x00 or 0xFF.
175 Extend these to 16 words (32 bytes) with either 0x0000 or 0xFFFF.
176 Use these values to increment 16 words in an array by subtraction.
177
178 See article below for some hints:
179 http://stackoverflow.com/questions/21622212/
180 how-to-perform-the-inverse-of-mm256-movemask-epi8-vpmovmskb
181
182 Because the efficient PSHUFB instruction is a SSSE3 instruction
183 lacking in many AMD cpus, we provide slightly slower alternative
184 SSE2 code.
185 */
186
187 #ifdef SSSE3
188 const __m128i c1 =
189 _mm_set_epi32(0x01010101, 0x01010101, 0x00000000, 0x00000000);
190 #endif
191
192 const __m128i c2 =
193 _mm_set_epi32(0x7fbfdfef, 0xf7fbfdfe, 0x7fbfdfef, 0xf7fbfdfe);
194
195 const __m128i c3 =
196 _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
197
198 auto * p = (unsigned short *)(bitmap);
199 auto * q = (__m128i *)(counters);
200 int r = (totalbits + 15) / 16;
201
202 for(int j=0; j<r; j++)
203 {
204 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
205 xmm0 = _mm_loadu_si128((__m128i*)p++);
206 #ifdef SSSE3
207 xmm1 = _mm_shuffle_epi8(xmm0, c1);
208 #else
209 __m128i xmm6, xmm7;
210 xmm6 = _mm_unpacklo_epi8(xmm0, xmm0);
211 xmm7 = _mm_unpacklo_epi16(xmm6, xmm6);
212 xmm1 = _mm_unpacklo_epi32(xmm7, xmm7);
213 #endif
214 xmm2 = _mm_or_si128(xmm1, c2);
215 xmm3 = _mm_cmpeq_epi8(xmm2, c3);
216 xmm4 = _mm_unpacklo_epi8(xmm3, xmm3);
217 xmm5 = _mm_unpackhi_epi8(xmm3, xmm3);
218 *q = _mm_subs_epi16(*q, xmm4);
219 q++;
220 *q = _mm_subs_epi16(*q, xmm5);
221 q++;
222 }
223 }
224
225 #else
226
227 #error Unknown architecture
228
229 #endif
230