1 /*  $Id: ncbi_fast.cpp 550823 2017-11-08 15:43:47Z gouriano $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Eugene Vasilchenko, Andrei Gourianov
27  *
28  * File Description:
29  *   SSE utility functions
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbidbg.hpp>
35 #include <corelib/ncbi_fast.hpp>
36 
37 BEGIN_NCBI_SCOPE
38 
39 //USING_NCBI_SCOPE;
40 
41 #if defined(NCBI_HAVE_FAST_OPS)
42 
x_sse_ClearBuffer(char * dst,size_t count)43 void NFast::x_sse_ClearBuffer(char* dst, size_t count)
44 {
45     _ASSERT(count%16 == 0);
46     _ASSERT(uintptr_t(dst)%16 == 0);
47     __m128i zero = _mm_setzero_si128();
48     for ( auto dst_end = dst+count; dst < dst_end; dst += 16 ) {
49         _mm_store_si128((__m128i*)dst, zero);
50     }
51 }
52 
x_sse_ClearBuffer(int * dst,size_t count)53 void NFast::x_sse_ClearBuffer(int* dst, size_t count)
54 {
55 #if 1
56     _ASSERT(count%16 == 0);
57     _ASSERT(uintptr_t(dst)%16 == 0);
58     __m128i zero = _mm_setzero_si128();
59     for ( auto dst_end = dst+count; dst < dst_end; dst += 16 ) {
60         _mm_store_si128((__m128i*)dst+0, zero);
61         _mm_store_si128((__m128i*)dst+1, zero);
62         _mm_store_si128((__m128i*)dst+2, zero);
63         _mm_store_si128((__m128i*)dst+3, zero);
64     }
65 #else
66     _ASSERT(count%4 == 0);
67     _ASSERT(uintptr_t(dst)%16 == 0);
68     __m128i zero = _mm_setzero_si128();
69     for ( auto dst_end = dst+count; dst < dst_end; dst += 4 ) {
70         _mm_store_si128((__m128i*)dst, zero);
71     }
72 #endif
73 }
74 
x_ClearBuffer(char * dst,size_t count)75 void NFast::x_ClearBuffer(char* dst, size_t count)
76 {
77     memset(dst, 0, count*sizeof(*dst));
78 }
79 
x_ClearBuffer(int * dst,size_t count)80 void NFast::x_ClearBuffer(int* dst, size_t count)
81 {
82     memset(dst, 0, count*sizeof(*dst));
83 }
84 
85 
86 #if 0
87 void NFast::x_sse_CopyBuffer(const char* src, size_t count, char* dst)
88 {
89     _ASSERT(count%16 == 0);
90     _ASSERT(uintptr_t(src)%16 == 0);
91     _ASSERT(uintptr_t(dst)%16 == 0);
92     for ( auto src_end = src+count; src < src_end; dst += 16, src += 16 ) {
93         _mm_store_si128((__m128i*)dst, *(__m128i*)src);
94     }
95 }
96 #endif
97 
x_sse_CopyBuffer(const int * src,size_t count,int * dst)98 void NFast::x_sse_CopyBuffer(const int* src, size_t count, int* dst)
99 {
100 #if 1
101     _ASSERT(count%16 == 0);
102     _ASSERT(uintptr_t(src)%16 == 0);
103     _ASSERT(uintptr_t(dst)%16 == 0);
104     for ( auto src_end = src+count; src < src_end; dst += 16, src += 16 ) {
105         __m128i ww0 = _mm_load_si128((const __m128i*)src+0);
106         __m128i ww1 = _mm_load_si128((const __m128i*)src+1);
107         __m128i ww2 = _mm_load_si128((const __m128i*)src+2);
108         __m128i ww3 = _mm_load_si128((const __m128i*)src+3);
109         _mm_store_si128((__m128i*)dst+0, ww0);
110         _mm_store_si128((__m128i*)dst+1, ww1);
111         _mm_store_si128((__m128i*)dst+2, ww2);
112         _mm_store_si128((__m128i*)dst+3, ww3);
113     }
114 #elif 0
115     _ASSERT(count%4 == 0);
116     _ASSERT(uintptr_t(src)%16 == 0);
117     _ASSERT(uintptr_t(dst)%16 == 0);
118     for ( auto src_end = src+count; src < src_end; dst += 4, src += 4 ) {
119         _mm_store_si128((__m128i*)dst, *(__m128i*)src);
120     }
121 #else
122     _ASSERT(count%4 == 0);
123     _ASSERT(uintptr_t(src)%16 == 0);
124     _ASSERT(uintptr_t(dst)%16 == 0);
125     for ( auto src_end = src+count; src < src_end; dst += 4, src += 4 ) {
126         __m128i ww0 = _mm_load_si128((const __m128i*)src);
127         _mm_store_si128((__m128i*)dst, ww0);
128     }
129 #endif
130 }
131 
132 
x_sse_ConvertBuffer(const char * src,size_t count,int * dst)133 void NFast::x_sse_ConvertBuffer(const char* src, size_t count, int* dst)
134 {
135 #if 1
136     _ASSERT(count%16 == 0);
137     _ASSERT(uintptr_t(src)%16 == 0);
138     _ASSERT(uintptr_t(dst)%16 == 0);
139     __m128i mask = _mm_set_epi8(-128, -128, -128, 3,
140                                 -128, -128, -128, 2,
141                                 -128, -128, -128, 1,
142                                 -128, -128, -128, 0);
143     for ( auto src_end = src+count; src < src_end; dst += 16, src += 16 ) {
144         uint32_t bb0 = ((const uint32_t*)src)[0];
145         uint32_t bb1 = ((const uint32_t*)src)[1];
146         uint32_t bb2 = ((const uint32_t*)src)[2];
147         uint32_t bb3 = ((const uint32_t*)src)[3];
148         __m128i ww0 = _mm_shuffle_epi8(_mm_cvtsi32_si128(bb0), mask);
149         __m128i ww1 = _mm_shuffle_epi8(_mm_cvtsi32_si128(bb1), mask);
150         __m128i ww2 = _mm_shuffle_epi8(_mm_cvtsi32_si128(bb2), mask);
151         __m128i ww3 = _mm_shuffle_epi8(_mm_cvtsi32_si128(bb3), mask);
152         _mm_store_si128((__m128i*)dst+0, ww0);
153         _mm_store_si128((__m128i*)dst+1, ww1);
154         _mm_store_si128((__m128i*)dst+2, ww2);
155         _mm_store_si128((__m128i*)dst+3, ww3);
156     }
157 #else
158     _ASSERT(count%4 == 0);
159     _ASSERT(uintptr_t(src)%16 == 0);
160     _ASSERT(uintptr_t(dst)%16 == 0);
161     __m128i mask = _mm_set_epi8(-128, -128, -128, 3,
162                                 -128, -128, -128, 2,
163                                 -128, -128, -128, 1,
164                                 -128, -128, -128, 0);
165     for ( auto src_end = src+count; src < src_end; dst += 4, src += 4 ) {
166         uint32_t bb0 = ((const uint32_t*)src)[0];
167         __m128i ww0 = _mm_shuffle_epi8(_mm_cvtsi32_si128(bb0), mask);
168         _mm_store_si128((__m128i*)dst+0, ww0);
169     }
170 #endif
171 }
172 
173 
x_sse_ConvertBuffer(const int * src,size_t count,char * dst)174 void NFast::x_sse_ConvertBuffer(const int* src, size_t count, char* dst)
175 {
176     _ASSERT(count%16 == 0);
177     _ASSERT(uintptr_t(src)%16 == 0);
178     _ASSERT(uintptr_t(dst)%16 == 0);
179     __m128i mask = _mm_set_epi8(-128, -128, -128, -128,
180                                 -128, -128, -128, -128,
181                                 -128, -128, -128, -128,
182                                 12, 8, 4, 0);
183     for ( auto src_end = src+count; src < src_end; dst += 16, src += 16 ) {
184         __m128i ww0 = _mm_load_si128((const __m128i*)src+0);
185         __m128i ww1 = _mm_load_si128((const __m128i*)src+1);
186         __m128i ww2 = _mm_load_si128((const __m128i*)src+2);
187         __m128i ww3 = _mm_load_si128((const __m128i*)src+3);
188         ww0 = _mm_shuffle_epi8(ww0, mask);
189         ww1 = _mm_shuffle_epi8(ww1, mask);
190         ww2 = _mm_shuffle_epi8(ww2, mask);
191         ww3 = _mm_shuffle_epi8(ww3, mask);
192         ww0 = _mm_or_si128(ww0, _mm_slli_si128(ww1, 4));
193         ww2 = _mm_or_si128(ww2, _mm_slli_si128(ww3, 4));
194         ww0 = _mm_or_si128(ww0, _mm_slli_si128(ww2, 8));
195         _mm_store_si128((__m128i*)dst, ww0);
196     }
197 }
198 
199 
200 
x_sse_SplitBufferInto4(const int * src,size_t count,char * dst0,char * dst1,char * dst2,char * dst3)201 void NFast::x_sse_SplitBufferInto4(const int* src, size_t count,
202                              char* dst0, char* dst1, char* dst2, char* dst3)
203 {
204     _ASSERT(count%16 == 0);
205     _ASSERT(uintptr_t(src)%16 == 0);
206     _ASSERT(uintptr_t(dst0)%16 == 0);
207     _ASSERT(uintptr_t(dst1)%16 == 0);
208     _ASSERT(uintptr_t(dst2)%16 == 0);
209     _ASSERT(uintptr_t(dst3)%16 == 0);
210     for ( auto src_end = src+count*4; src < src_end;
211           src += 64, dst0 += 16, dst1 += 16, dst2 += 16, dst3 += 16 ) {
212         __m128i ww0, ww1, ww2, ww3;
213         {
214             __m128i tt0 = _mm_load_si128((const __m128i*)src+0);
215             __m128i tt1 = _mm_load_si128((const __m128i*)src+1);
216             __m128i tt2 = _mm_load_si128((const __m128i*)src+2);
217             __m128i tt3 = _mm_load_si128((const __m128i*)src+3);
218             tt0 = _mm_or_si128(tt0, _mm_slli_si128(tt1, 1));
219             tt2 = _mm_or_si128(tt2, _mm_slli_si128(tt3, 1));
220             ww0 = _mm_or_si128(tt0, _mm_slli_si128(tt2, 2));
221         }
222         {
223             __m128i tt0 = _mm_load_si128((const __m128i*)src+4);
224             __m128i tt1 = _mm_load_si128((const __m128i*)src+5);
225             __m128i tt2 = _mm_load_si128((const __m128i*)src+6);
226             __m128i tt3 = _mm_load_si128((const __m128i*)src+7);
227             tt0 = _mm_or_si128(tt0, _mm_slli_si128(tt1, 1));
228             tt2 = _mm_or_si128(tt2, _mm_slli_si128(tt3, 1));
229             ww1 = _mm_or_si128(tt0, _mm_slli_si128(tt2, 2));
230         }
231         {
232             __m128i tt0 = _mm_load_si128((const __m128i*)src+8);
233             __m128i tt1 = _mm_load_si128((const __m128i*)src+9);
234             __m128i tt2 = _mm_load_si128((const __m128i*)src+10);
235             __m128i tt3 = _mm_load_si128((const __m128i*)src+11);
236             tt0 = _mm_or_si128(tt0, _mm_slli_si128(tt1, 1));
237             tt2 = _mm_or_si128(tt2, _mm_slli_si128(tt3, 1));
238             ww2 = _mm_or_si128(tt0, _mm_slli_si128(tt2, 2));
239         }
240         {
241             __m128i tt0 = _mm_load_si128((const __m128i*)src+12);
242             __m128i tt1 = _mm_load_si128((const __m128i*)src+13);
243             __m128i tt2 = _mm_load_si128((const __m128i*)src+14);
244             __m128i tt3 = _mm_load_si128((const __m128i*)src+15);
245             tt0 = _mm_or_si128(tt0, _mm_slli_si128(tt1, 1));
246             tt2 = _mm_or_si128(tt2, _mm_slli_si128(tt3, 1));
247             ww3 = _mm_or_si128(tt0, _mm_slli_si128(tt2, 2));
248         }
249 
250         {
251             // transpose 4x4
252             __m128i tt0 = _mm_unpacklo_epi32(ww0, ww1);
253             __m128i tt1 = _mm_unpacklo_epi32(ww2, ww3);
254             __m128i tt2 = _mm_unpackhi_epi32(ww0, ww1);
255             __m128i tt3 = _mm_unpackhi_epi32(ww2, ww3);
256 
257             ww0 = _mm_unpacklo_epi64(tt0, tt1);
258             ww1 = _mm_unpackhi_epi64(tt0, tt1);
259             ww2 = _mm_unpacklo_epi64(tt2, tt3);
260             ww3 = _mm_unpackhi_epi64(tt2, tt3);
261         }
262 
263         _mm_store_si128((__m128i*)dst0, ww0);
264         _mm_store_si128((__m128i*)dst1, ww1);
265         _mm_store_si128((__m128i*)dst2, ww2);
266         _mm_store_si128((__m128i*)dst3, ww3);
267     }
268 }
269 
270 
x_sse_SplitBufferInto4(const int * src,size_t count,int * dst0,int * dst1,int * dst2,int * dst3)271 void NFast::x_sse_SplitBufferInto4(const int* src, size_t count,
272                              int* dst0, int* dst1, int* dst2, int* dst3)
273 {
274     _ASSERT(count%16 == 0);
275     _ASSERT(uintptr_t(src)%16 == 0);
276     _ASSERT(uintptr_t(dst0)%16 == 0);
277     _ASSERT(uintptr_t(dst1)%16 == 0);
278     _ASSERT(uintptr_t(dst2)%16 == 0);
279     _ASSERT(uintptr_t(dst3)%16 == 0);
280     for ( auto src_end = src+count*4; src < src_end;
281           src += 16, dst0 += 4, dst1 += 4, dst2 += 4, dst3 += 4 ) {
282         __m128i ww0 = _mm_load_si128((const __m128i*)src+0);
283         __m128i ww1 = _mm_load_si128((const __m128i*)src+1);
284         __m128i ww2 = _mm_load_si128((const __m128i*)src+2);
285         __m128i ww3 = _mm_load_si128((const __m128i*)src+3);
286 
287         {
288             // transpose 4x4
289             __m128i tt0 = _mm_unpacklo_epi32(ww0, ww1);
290             __m128i tt1 = _mm_unpacklo_epi32(ww2, ww3);
291             __m128i tt2 = _mm_unpackhi_epi32(ww0, ww1);
292             __m128i tt3 = _mm_unpackhi_epi32(ww2, ww3);
293 
294             ww0 = _mm_unpacklo_epi64(tt0, tt1);
295             ww1 = _mm_unpackhi_epi64(tt0, tt1);
296             ww2 = _mm_unpacklo_epi64(tt2, tt3);
297             ww3 = _mm_unpackhi_epi64(tt2, tt3);
298         }
299 
300         _mm_store_si128((__m128i*)dst0, ww0);
301         _mm_store_si128((__m128i*)dst1, ww1);
302         _mm_store_si128((__m128i*)dst2, ww2);
303         _mm_store_si128((__m128i*)dst3, ww3);
304     }
305 }
306 
307 /*
308 void copy_n(const int* src, size_t count, char* dst)
309 {
310     for ( ; count && (uintptr_t(dst)%4); ++dst, --count, ++src ) {
311         *dst = *src;
312     }
313     __m128i mask = _mm_set_epi8(128, 128, 128, 128,
314                                 128, 128, 128, 128,
315                                 128, 128, 128, 128,
316                                 12, 8, 4, 0);
317     for ( ; count >= 4; dst += 4, count -= 4, src += 4 ) {
318         __m128i ww = _mm_load_si128((const __m128i*)src);
319         uint32_t bb = _mm_cvtsi128_si32(_mm_shuffle_epi8(ww, mask));
320         *(uint32_t*)dst = bb;
321     }
322     for ( ; count; ++dst, --count, ++src ) {
323         *dst = *src;
324     }
325 }
326 
327 
328 void copy_n(const int* src, size_t count, int* dst)
329 {
330     for ( ; count && (uintptr_t(dst)%16); ++dst, --count, ++src ) {
331         *dst = *src;
332     }
333     __m128i mask = _mm_set_epi8(128, 128, 128, 128,
334                                 128, 128, 128, 128,
335                                 128, 128, 128, 128,
336                                 12, 8, 4, 0);
337     for ( ; count >= 4; dst += 4, count -= 4, src += 4 ) {
338         __m128i ww = _mm_load_si128((const __m128i*)src);
339         uint32_t bb = _mm_cvtsi128_si32(_mm_shuffle_epi8(ww, mask));
340         *(uint32_t*)dst = bb;
341     }
342     for ( ; count; ++dst, --count, ++src ) {
343         *dst = *src;
344     }
345 }
346 */
347 
x_sse_FindMaxElement(const unsigned int * src,size_t count)348 unsigned int NFast::x_sse_FindMaxElement(const unsigned int* src, size_t count)
349 {
350     _ASSERT(count%16 == 0);
351     _ASSERT(uintptr_t(src)%16 == 0);
352     __m128i max4 = _mm_setzero_si128();
353     for ( auto src_end = src+count; src < src_end; src += 16 ) {
354         __m128i ww0 = _mm_load_si128((const __m128i*)src+0);
355         __m128i ww1 = _mm_load_si128((const __m128i*)src+1);
356         __m128i ww2 = _mm_load_si128((const __m128i*)src+2);
357         __m128i ww3 = _mm_load_si128((const __m128i*)src+3);
358         ww0 = _mm_max_epu32(ww0, ww1);
359         ww2 = _mm_max_epu32(ww2, ww3);
360         ww0 = _mm_max_epu32(ww0, ww2);
361         max4 = _mm_max_epu32(max4, ww0);
362     }
363     max4 = _mm_max_epu32(max4, _mm_srli_si128(max4, 8));
364     max4 = _mm_max_epu32(max4, _mm_srli_si128(max4, 4));
365     return _mm_cvtsi128_si32(max4);
366 }
367 
368 
x_sse_FindMaxElement(const unsigned int * src,size_t count,unsigned int & dst)369 void NFast::x_sse_FindMaxElement(const unsigned int* src, size_t count, unsigned int& dst)
370 {
371     _ASSERT(count%16 == 0);
372     _ASSERT(uintptr_t(src)%16 == 0);
373     __m128i max4 = _mm_set1_epi32(dst);
374     for ( auto src_end = src+count; src < src_end; src += 16 ) {
375         __m128i ww0 = _mm_load_si128((const __m128i*)src+0);
376         __m128i ww1 = _mm_load_si128((const __m128i*)src+1);
377         __m128i ww2 = _mm_load_si128((const __m128i*)src+2);
378         __m128i ww3 = _mm_load_si128((const __m128i*)src+3);
379         ww0 = _mm_max_epu32(ww0, ww1);
380         ww2 = _mm_max_epu32(ww2, ww3);
381         ww0 = _mm_max_epu32(ww0, ww2);
382         max4 = _mm_max_epu32(max4, ww0);
383     }
384     max4 = _mm_max_epu32(max4, _mm_srli_si128(max4, 8));
385     max4 = _mm_max_epu32(max4, _mm_srli_si128(max4, 4));
386     dst = _mm_cvtsi128_si32(max4);
387 }
388 
389 
x_sse_Find4MaxElements(const unsigned int * src,size_t count,unsigned int dst[4])390 void NFast::x_sse_Find4MaxElements(const unsigned int* src, size_t count, unsigned int dst[4])
391 {
392     _ASSERT(count%16 == 0);
393     _ASSERT(uintptr_t(src)%16 == 0);
394     __m128i max4 = _mm_loadu_si128((__m128i*)dst);
395     for ( auto src_end = src+count*4; src < src_end; src += 16 ) {
396         __m128i ww0 = _mm_load_si128((const __m128i*)src+0);
397         __m128i ww1 = _mm_load_si128((const __m128i*)src+1);
398         __m128i ww2 = _mm_load_si128((const __m128i*)src+2);
399         __m128i ww3 = _mm_load_si128((const __m128i*)src+3);
400         ww0 = _mm_max_epu32(ww0, ww1);
401         ww2 = _mm_max_epu32(ww2, ww3);
402         ww0 = _mm_max_epu32(ww0, ww2);
403         max4 = _mm_max_epu32(max4, ww0);
404     }
405     _mm_storeu_si128((__m128i*)dst, max4);
406 }
407 
408 #endif // NCBI_HAVE_FAST_OPS
409 
x_no_sse_SplitBufferInto4(const int * src,size_t count,int * dest0,int * dest1,int * dest2,int * dest3)410 void NFast::x_no_sse_SplitBufferInto4(const int* src, size_t count, int*  dest0, int*  dest1, int*  dest2, int*  dest3)
411 {
412     for (size_t e = 0; e < count; ++e) {
413         int v0 = src[e*4+0];
414         int v1 = src[e*4+1];
415         int v2 = src[e*4+2];
416         int v3 = src[e*4+3];
417         dest0[e] = v0;
418         dest1[e] = v1;
419         dest2[e] = v2;
420         dest3[e] = v3;
421     }
422 }
423 
x_no_sse_SplitBufferInto4(const int * src,size_t count,char * dest0,char * dest1,char * dest2,char * dest3)424 void NFast::x_no_sse_SplitBufferInto4(const int* src, size_t count, char* dest0, char* dest1, char* dest2, char* dest3)
425 {
426     for (size_t e = 0; e < count; ++e) {
427         char v0 = char(src[e*4+0]);
428         char v1 = char(src[e*4+1]);
429         char v2 = char(src[e*4+2]);
430         char v3 = char(src[e*4+3]);
431         dest0[e] = v0;
432         dest1[e] = v1;
433         dest2[e] = v2;
434         dest3[e] = v3;
435     }
436 }
437 
x_no_sse_FindMaxElement(const unsigned int * src,size_t count,unsigned int v)438 unsigned int NFast::x_no_sse_FindMaxElement(const unsigned int* src, size_t count, unsigned int v)
439 {
440     unsigned int result = v;
441     for (size_t e = 0; e < count; ++e) {
442         if (result < *(src + e)) {
443             result = *(src + e);
444         }
445     }
446     return result;
447 }
448 
x_no_sse_Find4MaxElements(const unsigned int * src,size_t count,unsigned int dest[4])449 void NFast::x_no_sse_Find4MaxElements(const unsigned int* src, size_t count, unsigned int dest[4])
450 {
451     unsigned int result[4];
452     memcpy(result, dest, sizeof(unsigned int) * 4);
453     for (size_t e = 0; e < 4*count; e += 4) {
454         if (result[0] < *(src + e)) {
455             result[0] = *(src + e);
456         }
457         if (result[1] < *(src + e + 1)) {
458             result[1] = *(src + e + 1);
459         }
460         if (result[2] < *(src + e + 2)) {
461             result[2] = *(src + e + 2);
462         }
463         if (result[3] < *(src + e + 3)) {
464             result[3] = *(src + e + 3);
465         }
466     }
467     memcpy(dest, result, sizeof(unsigned int) * 4);
468 }
469 
470 
471 
472 END_NCBI_SCOPE
473 
474