1 /* $Id: ncbi_fast.cpp 550823 2017-11-08 15:43:47Z gouriano $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Eugene Vasilchenko, Andrei Gourianov
27 *
28 * File Description:
29 * SSE utility functions
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbidbg.hpp>
35 #include <corelib/ncbi_fast.hpp>
36
37 BEGIN_NCBI_SCOPE
38
39 //USING_NCBI_SCOPE;
40
41 #if defined(NCBI_HAVE_FAST_OPS)
42
x_sse_ClearBuffer(char * dst,size_t count)43 void NFast::x_sse_ClearBuffer(char* dst, size_t count)
44 {
45 _ASSERT(count%16 == 0);
46 _ASSERT(uintptr_t(dst)%16 == 0);
47 __m128i zero = _mm_setzero_si128();
48 for ( auto dst_end = dst+count; dst < dst_end; dst += 16 ) {
49 _mm_store_si128((__m128i*)dst, zero);
50 }
51 }
52
x_sse_ClearBuffer(int * dst,size_t count)53 void NFast::x_sse_ClearBuffer(int* dst, size_t count)
54 {
55 #if 1
56 _ASSERT(count%16 == 0);
57 _ASSERT(uintptr_t(dst)%16 == 0);
58 __m128i zero = _mm_setzero_si128();
59 for ( auto dst_end = dst+count; dst < dst_end; dst += 16 ) {
60 _mm_store_si128((__m128i*)dst+0, zero);
61 _mm_store_si128((__m128i*)dst+1, zero);
62 _mm_store_si128((__m128i*)dst+2, zero);
63 _mm_store_si128((__m128i*)dst+3, zero);
64 }
65 #else
66 _ASSERT(count%4 == 0);
67 _ASSERT(uintptr_t(dst)%16 == 0);
68 __m128i zero = _mm_setzero_si128();
69 for ( auto dst_end = dst+count; dst < dst_end; dst += 4 ) {
70 _mm_store_si128((__m128i*)dst, zero);
71 }
72 #endif
73 }
74
x_ClearBuffer(char * dst,size_t count)75 void NFast::x_ClearBuffer(char* dst, size_t count)
76 {
77 memset(dst, 0, count*sizeof(*dst));
78 }
79
x_ClearBuffer(int * dst,size_t count)80 void NFast::x_ClearBuffer(int* dst, size_t count)
81 {
82 memset(dst, 0, count*sizeof(*dst));
83 }
84
85
86 #if 0
87 void NFast::x_sse_CopyBuffer(const char* src, size_t count, char* dst)
88 {
89 _ASSERT(count%16 == 0);
90 _ASSERT(uintptr_t(src)%16 == 0);
91 _ASSERT(uintptr_t(dst)%16 == 0);
92 for ( auto src_end = src+count; src < src_end; dst += 16, src += 16 ) {
93 _mm_store_si128((__m128i*)dst, *(__m128i*)src);
94 }
95 }
96 #endif
97
x_sse_CopyBuffer(const int * src,size_t count,int * dst)98 void NFast::x_sse_CopyBuffer(const int* src, size_t count, int* dst)
99 {
100 #if 1
101 _ASSERT(count%16 == 0);
102 _ASSERT(uintptr_t(src)%16 == 0);
103 _ASSERT(uintptr_t(dst)%16 == 0);
104 for ( auto src_end = src+count; src < src_end; dst += 16, src += 16 ) {
105 __m128i ww0 = _mm_load_si128((const __m128i*)src+0);
106 __m128i ww1 = _mm_load_si128((const __m128i*)src+1);
107 __m128i ww2 = _mm_load_si128((const __m128i*)src+2);
108 __m128i ww3 = _mm_load_si128((const __m128i*)src+3);
109 _mm_store_si128((__m128i*)dst+0, ww0);
110 _mm_store_si128((__m128i*)dst+1, ww1);
111 _mm_store_si128((__m128i*)dst+2, ww2);
112 _mm_store_si128((__m128i*)dst+3, ww3);
113 }
114 #elif 0
115 _ASSERT(count%4 == 0);
116 _ASSERT(uintptr_t(src)%16 == 0);
117 _ASSERT(uintptr_t(dst)%16 == 0);
118 for ( auto src_end = src+count; src < src_end; dst += 4, src += 4 ) {
119 _mm_store_si128((__m128i*)dst, *(__m128i*)src);
120 }
121 #else
122 _ASSERT(count%4 == 0);
123 _ASSERT(uintptr_t(src)%16 == 0);
124 _ASSERT(uintptr_t(dst)%16 == 0);
125 for ( auto src_end = src+count; src < src_end; dst += 4, src += 4 ) {
126 __m128i ww0 = _mm_load_si128((const __m128i*)src);
127 _mm_store_si128((__m128i*)dst, ww0);
128 }
129 #endif
130 }
131
132
x_sse_ConvertBuffer(const char * src,size_t count,int * dst)133 void NFast::x_sse_ConvertBuffer(const char* src, size_t count, int* dst)
134 {
135 #if 1
136 _ASSERT(count%16 == 0);
137 _ASSERT(uintptr_t(src)%16 == 0);
138 _ASSERT(uintptr_t(dst)%16 == 0);
139 __m128i mask = _mm_set_epi8(-128, -128, -128, 3,
140 -128, -128, -128, 2,
141 -128, -128, -128, 1,
142 -128, -128, -128, 0);
143 for ( auto src_end = src+count; src < src_end; dst += 16, src += 16 ) {
144 uint32_t bb0 = ((const uint32_t*)src)[0];
145 uint32_t bb1 = ((const uint32_t*)src)[1];
146 uint32_t bb2 = ((const uint32_t*)src)[2];
147 uint32_t bb3 = ((const uint32_t*)src)[3];
148 __m128i ww0 = _mm_shuffle_epi8(_mm_cvtsi32_si128(bb0), mask);
149 __m128i ww1 = _mm_shuffle_epi8(_mm_cvtsi32_si128(bb1), mask);
150 __m128i ww2 = _mm_shuffle_epi8(_mm_cvtsi32_si128(bb2), mask);
151 __m128i ww3 = _mm_shuffle_epi8(_mm_cvtsi32_si128(bb3), mask);
152 _mm_store_si128((__m128i*)dst+0, ww0);
153 _mm_store_si128((__m128i*)dst+1, ww1);
154 _mm_store_si128((__m128i*)dst+2, ww2);
155 _mm_store_si128((__m128i*)dst+3, ww3);
156 }
157 #else
158 _ASSERT(count%4 == 0);
159 _ASSERT(uintptr_t(src)%16 == 0);
160 _ASSERT(uintptr_t(dst)%16 == 0);
161 __m128i mask = _mm_set_epi8(-128, -128, -128, 3,
162 -128, -128, -128, 2,
163 -128, -128, -128, 1,
164 -128, -128, -128, 0);
165 for ( auto src_end = src+count; src < src_end; dst += 4, src += 4 ) {
166 uint32_t bb0 = ((const uint32_t*)src)[0];
167 __m128i ww0 = _mm_shuffle_epi8(_mm_cvtsi32_si128(bb0), mask);
168 _mm_store_si128((__m128i*)dst+0, ww0);
169 }
170 #endif
171 }
172
173
x_sse_ConvertBuffer(const int * src,size_t count,char * dst)174 void NFast::x_sse_ConvertBuffer(const int* src, size_t count, char* dst)
175 {
176 _ASSERT(count%16 == 0);
177 _ASSERT(uintptr_t(src)%16 == 0);
178 _ASSERT(uintptr_t(dst)%16 == 0);
179 __m128i mask = _mm_set_epi8(-128, -128, -128, -128,
180 -128, -128, -128, -128,
181 -128, -128, -128, -128,
182 12, 8, 4, 0);
183 for ( auto src_end = src+count; src < src_end; dst += 16, src += 16 ) {
184 __m128i ww0 = _mm_load_si128((const __m128i*)src+0);
185 __m128i ww1 = _mm_load_si128((const __m128i*)src+1);
186 __m128i ww2 = _mm_load_si128((const __m128i*)src+2);
187 __m128i ww3 = _mm_load_si128((const __m128i*)src+3);
188 ww0 = _mm_shuffle_epi8(ww0, mask);
189 ww1 = _mm_shuffle_epi8(ww1, mask);
190 ww2 = _mm_shuffle_epi8(ww2, mask);
191 ww3 = _mm_shuffle_epi8(ww3, mask);
192 ww0 = _mm_or_si128(ww0, _mm_slli_si128(ww1, 4));
193 ww2 = _mm_or_si128(ww2, _mm_slli_si128(ww3, 4));
194 ww0 = _mm_or_si128(ww0, _mm_slli_si128(ww2, 8));
195 _mm_store_si128((__m128i*)dst, ww0);
196 }
197 }
198
199
200
x_sse_SplitBufferInto4(const int * src,size_t count,char * dst0,char * dst1,char * dst2,char * dst3)201 void NFast::x_sse_SplitBufferInto4(const int* src, size_t count,
202 char* dst0, char* dst1, char* dst2, char* dst3)
203 {
204 _ASSERT(count%16 == 0);
205 _ASSERT(uintptr_t(src)%16 == 0);
206 _ASSERT(uintptr_t(dst0)%16 == 0);
207 _ASSERT(uintptr_t(dst1)%16 == 0);
208 _ASSERT(uintptr_t(dst2)%16 == 0);
209 _ASSERT(uintptr_t(dst3)%16 == 0);
210 for ( auto src_end = src+count*4; src < src_end;
211 src += 64, dst0 += 16, dst1 += 16, dst2 += 16, dst3 += 16 ) {
212 __m128i ww0, ww1, ww2, ww3;
213 {
214 __m128i tt0 = _mm_load_si128((const __m128i*)src+0);
215 __m128i tt1 = _mm_load_si128((const __m128i*)src+1);
216 __m128i tt2 = _mm_load_si128((const __m128i*)src+2);
217 __m128i tt3 = _mm_load_si128((const __m128i*)src+3);
218 tt0 = _mm_or_si128(tt0, _mm_slli_si128(tt1, 1));
219 tt2 = _mm_or_si128(tt2, _mm_slli_si128(tt3, 1));
220 ww0 = _mm_or_si128(tt0, _mm_slli_si128(tt2, 2));
221 }
222 {
223 __m128i tt0 = _mm_load_si128((const __m128i*)src+4);
224 __m128i tt1 = _mm_load_si128((const __m128i*)src+5);
225 __m128i tt2 = _mm_load_si128((const __m128i*)src+6);
226 __m128i tt3 = _mm_load_si128((const __m128i*)src+7);
227 tt0 = _mm_or_si128(tt0, _mm_slli_si128(tt1, 1));
228 tt2 = _mm_or_si128(tt2, _mm_slli_si128(tt3, 1));
229 ww1 = _mm_or_si128(tt0, _mm_slli_si128(tt2, 2));
230 }
231 {
232 __m128i tt0 = _mm_load_si128((const __m128i*)src+8);
233 __m128i tt1 = _mm_load_si128((const __m128i*)src+9);
234 __m128i tt2 = _mm_load_si128((const __m128i*)src+10);
235 __m128i tt3 = _mm_load_si128((const __m128i*)src+11);
236 tt0 = _mm_or_si128(tt0, _mm_slli_si128(tt1, 1));
237 tt2 = _mm_or_si128(tt2, _mm_slli_si128(tt3, 1));
238 ww2 = _mm_or_si128(tt0, _mm_slli_si128(tt2, 2));
239 }
240 {
241 __m128i tt0 = _mm_load_si128((const __m128i*)src+12);
242 __m128i tt1 = _mm_load_si128((const __m128i*)src+13);
243 __m128i tt2 = _mm_load_si128((const __m128i*)src+14);
244 __m128i tt3 = _mm_load_si128((const __m128i*)src+15);
245 tt0 = _mm_or_si128(tt0, _mm_slli_si128(tt1, 1));
246 tt2 = _mm_or_si128(tt2, _mm_slli_si128(tt3, 1));
247 ww3 = _mm_or_si128(tt0, _mm_slli_si128(tt2, 2));
248 }
249
250 {
251 // transpose 4x4
252 __m128i tt0 = _mm_unpacklo_epi32(ww0, ww1);
253 __m128i tt1 = _mm_unpacklo_epi32(ww2, ww3);
254 __m128i tt2 = _mm_unpackhi_epi32(ww0, ww1);
255 __m128i tt3 = _mm_unpackhi_epi32(ww2, ww3);
256
257 ww0 = _mm_unpacklo_epi64(tt0, tt1);
258 ww1 = _mm_unpackhi_epi64(tt0, tt1);
259 ww2 = _mm_unpacklo_epi64(tt2, tt3);
260 ww3 = _mm_unpackhi_epi64(tt2, tt3);
261 }
262
263 _mm_store_si128((__m128i*)dst0, ww0);
264 _mm_store_si128((__m128i*)dst1, ww1);
265 _mm_store_si128((__m128i*)dst2, ww2);
266 _mm_store_si128((__m128i*)dst3, ww3);
267 }
268 }
269
270
x_sse_SplitBufferInto4(const int * src,size_t count,int * dst0,int * dst1,int * dst2,int * dst3)271 void NFast::x_sse_SplitBufferInto4(const int* src, size_t count,
272 int* dst0, int* dst1, int* dst2, int* dst3)
273 {
274 _ASSERT(count%16 == 0);
275 _ASSERT(uintptr_t(src)%16 == 0);
276 _ASSERT(uintptr_t(dst0)%16 == 0);
277 _ASSERT(uintptr_t(dst1)%16 == 0);
278 _ASSERT(uintptr_t(dst2)%16 == 0);
279 _ASSERT(uintptr_t(dst3)%16 == 0);
280 for ( auto src_end = src+count*4; src < src_end;
281 src += 16, dst0 += 4, dst1 += 4, dst2 += 4, dst3 += 4 ) {
282 __m128i ww0 = _mm_load_si128((const __m128i*)src+0);
283 __m128i ww1 = _mm_load_si128((const __m128i*)src+1);
284 __m128i ww2 = _mm_load_si128((const __m128i*)src+2);
285 __m128i ww3 = _mm_load_si128((const __m128i*)src+3);
286
287 {
288 // transpose 4x4
289 __m128i tt0 = _mm_unpacklo_epi32(ww0, ww1);
290 __m128i tt1 = _mm_unpacklo_epi32(ww2, ww3);
291 __m128i tt2 = _mm_unpackhi_epi32(ww0, ww1);
292 __m128i tt3 = _mm_unpackhi_epi32(ww2, ww3);
293
294 ww0 = _mm_unpacklo_epi64(tt0, tt1);
295 ww1 = _mm_unpackhi_epi64(tt0, tt1);
296 ww2 = _mm_unpacklo_epi64(tt2, tt3);
297 ww3 = _mm_unpackhi_epi64(tt2, tt3);
298 }
299
300 _mm_store_si128((__m128i*)dst0, ww0);
301 _mm_store_si128((__m128i*)dst1, ww1);
302 _mm_store_si128((__m128i*)dst2, ww2);
303 _mm_store_si128((__m128i*)dst3, ww3);
304 }
305 }
306
307 /*
308 void copy_n(const int* src, size_t count, char* dst)
309 {
310 for ( ; count && (uintptr_t(dst)%4); ++dst, --count, ++src ) {
311 *dst = *src;
312 }
313 __m128i mask = _mm_set_epi8(128, 128, 128, 128,
314 128, 128, 128, 128,
315 128, 128, 128, 128,
316 12, 8, 4, 0);
317 for ( ; count >= 4; dst += 4, count -= 4, src += 4 ) {
318 __m128i ww = _mm_load_si128((const __m128i*)src);
319 uint32_t bb = _mm_cvtsi128_si32(_mm_shuffle_epi8(ww, mask));
320 *(uint32_t*)dst = bb;
321 }
322 for ( ; count; ++dst, --count, ++src ) {
323 *dst = *src;
324 }
325 }
326
327
328 void copy_n(const int* src, size_t count, int* dst)
329 {
330 for ( ; count && (uintptr_t(dst)%16); ++dst, --count, ++src ) {
331 *dst = *src;
332 }
333 __m128i mask = _mm_set_epi8(128, 128, 128, 128,
334 128, 128, 128, 128,
335 128, 128, 128, 128,
336 12, 8, 4, 0);
337 for ( ; count >= 4; dst += 4, count -= 4, src += 4 ) {
338 __m128i ww = _mm_load_si128((const __m128i*)src);
339 uint32_t bb = _mm_cvtsi128_si32(_mm_shuffle_epi8(ww, mask));
340 *(uint32_t*)dst = bb;
341 }
342 for ( ; count; ++dst, --count, ++src ) {
343 *dst = *src;
344 }
345 }
346 */
347
x_sse_FindMaxElement(const unsigned int * src,size_t count)348 unsigned int NFast::x_sse_FindMaxElement(const unsigned int* src, size_t count)
349 {
350 _ASSERT(count%16 == 0);
351 _ASSERT(uintptr_t(src)%16 == 0);
352 __m128i max4 = _mm_setzero_si128();
353 for ( auto src_end = src+count; src < src_end; src += 16 ) {
354 __m128i ww0 = _mm_load_si128((const __m128i*)src+0);
355 __m128i ww1 = _mm_load_si128((const __m128i*)src+1);
356 __m128i ww2 = _mm_load_si128((const __m128i*)src+2);
357 __m128i ww3 = _mm_load_si128((const __m128i*)src+3);
358 ww0 = _mm_max_epu32(ww0, ww1);
359 ww2 = _mm_max_epu32(ww2, ww3);
360 ww0 = _mm_max_epu32(ww0, ww2);
361 max4 = _mm_max_epu32(max4, ww0);
362 }
363 max4 = _mm_max_epu32(max4, _mm_srli_si128(max4, 8));
364 max4 = _mm_max_epu32(max4, _mm_srli_si128(max4, 4));
365 return _mm_cvtsi128_si32(max4);
366 }
367
368
x_sse_FindMaxElement(const unsigned int * src,size_t count,unsigned int & dst)369 void NFast::x_sse_FindMaxElement(const unsigned int* src, size_t count, unsigned int& dst)
370 {
371 _ASSERT(count%16 == 0);
372 _ASSERT(uintptr_t(src)%16 == 0);
373 __m128i max4 = _mm_set1_epi32(dst);
374 for ( auto src_end = src+count; src < src_end; src += 16 ) {
375 __m128i ww0 = _mm_load_si128((const __m128i*)src+0);
376 __m128i ww1 = _mm_load_si128((const __m128i*)src+1);
377 __m128i ww2 = _mm_load_si128((const __m128i*)src+2);
378 __m128i ww3 = _mm_load_si128((const __m128i*)src+3);
379 ww0 = _mm_max_epu32(ww0, ww1);
380 ww2 = _mm_max_epu32(ww2, ww3);
381 ww0 = _mm_max_epu32(ww0, ww2);
382 max4 = _mm_max_epu32(max4, ww0);
383 }
384 max4 = _mm_max_epu32(max4, _mm_srli_si128(max4, 8));
385 max4 = _mm_max_epu32(max4, _mm_srli_si128(max4, 4));
386 dst = _mm_cvtsi128_si32(max4);
387 }
388
389
x_sse_Find4MaxElements(const unsigned int * src,size_t count,unsigned int dst[4])390 void NFast::x_sse_Find4MaxElements(const unsigned int* src, size_t count, unsigned int dst[4])
391 {
392 _ASSERT(count%16 == 0);
393 _ASSERT(uintptr_t(src)%16 == 0);
394 __m128i max4 = _mm_loadu_si128((__m128i*)dst);
395 for ( auto src_end = src+count*4; src < src_end; src += 16 ) {
396 __m128i ww0 = _mm_load_si128((const __m128i*)src+0);
397 __m128i ww1 = _mm_load_si128((const __m128i*)src+1);
398 __m128i ww2 = _mm_load_si128((const __m128i*)src+2);
399 __m128i ww3 = _mm_load_si128((const __m128i*)src+3);
400 ww0 = _mm_max_epu32(ww0, ww1);
401 ww2 = _mm_max_epu32(ww2, ww3);
402 ww0 = _mm_max_epu32(ww0, ww2);
403 max4 = _mm_max_epu32(max4, ww0);
404 }
405 _mm_storeu_si128((__m128i*)dst, max4);
406 }
407
408 #endif // NCBI_HAVE_FAST_OPS
409
x_no_sse_SplitBufferInto4(const int * src,size_t count,int * dest0,int * dest1,int * dest2,int * dest3)410 void NFast::x_no_sse_SplitBufferInto4(const int* src, size_t count, int* dest0, int* dest1, int* dest2, int* dest3)
411 {
412 for (size_t e = 0; e < count; ++e) {
413 int v0 = src[e*4+0];
414 int v1 = src[e*4+1];
415 int v2 = src[e*4+2];
416 int v3 = src[e*4+3];
417 dest0[e] = v0;
418 dest1[e] = v1;
419 dest2[e] = v2;
420 dest3[e] = v3;
421 }
422 }
423
x_no_sse_SplitBufferInto4(const int * src,size_t count,char * dest0,char * dest1,char * dest2,char * dest3)424 void NFast::x_no_sse_SplitBufferInto4(const int* src, size_t count, char* dest0, char* dest1, char* dest2, char* dest3)
425 {
426 for (size_t e = 0; e < count; ++e) {
427 char v0 = char(src[e*4+0]);
428 char v1 = char(src[e*4+1]);
429 char v2 = char(src[e*4+2]);
430 char v3 = char(src[e*4+3]);
431 dest0[e] = v0;
432 dest1[e] = v1;
433 dest2[e] = v2;
434 dest3[e] = v3;
435 }
436 }
437
x_no_sse_FindMaxElement(const unsigned int * src,size_t count,unsigned int v)438 unsigned int NFast::x_no_sse_FindMaxElement(const unsigned int* src, size_t count, unsigned int v)
439 {
440 unsigned int result = v;
441 for (size_t e = 0; e < count; ++e) {
442 if (result < *(src + e)) {
443 result = *(src + e);
444 }
445 }
446 return result;
447 }
448
x_no_sse_Find4MaxElements(const unsigned int * src,size_t count,unsigned int dest[4])449 void NFast::x_no_sse_Find4MaxElements(const unsigned int* src, size_t count, unsigned int dest[4])
450 {
451 unsigned int result[4];
452 memcpy(result, dest, sizeof(unsigned int) * 4);
453 for (size_t e = 0; e < 4*count; e += 4) {
454 if (result[0] < *(src + e)) {
455 result[0] = *(src + e);
456 }
457 if (result[1] < *(src + e + 1)) {
458 result[1] = *(src + e + 1);
459 }
460 if (result[2] < *(src + e + 2)) {
461 result[2] = *(src + e + 2);
462 }
463 if (result[3] < *(src + e + 3)) {
464 result[3] = *(src + e + 3);
465 }
466 }
467 memcpy(dest, result, sizeof(unsigned int) * 4);
468 }
469
470
471
472 END_NCBI_SCOPE
473
474