1 /******************************************************************************
2  *
3  * Project:  GDAL Core
4  * Purpose:  SSSE3 specializations
5  * Author:   Even Rouault <even dot rouault at spatialys dot com>
6  *
7  ******************************************************************************
8  * Copyright (c) 2016, Even Rouault <even dot rouault at spatialys dot com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a
11  * copy of this software and associated documentation files (the "Software"),
12  * to deal in the Software without restriction, including without limitation
13  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
14  * and/or sell copies of the Software, and to permit persons to whom the
15  * Software is furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included
18  * in all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26  * DEALINGS IN THE SOFTWARE.
27  ****************************************************************************/
28 
29 #include "cpl_port.h"
30 
31 CPL_CVSID("$Id: rasterio_ssse3.cpp 9afe586299c716a3b3473ae00ed79aaa487144c4 2020-12-09 09:31:20 +0100 Even Rouault $")
32 
33 #if defined(HAVE_SSSE3_AT_COMPILE_TIME) && ( defined(__x86_64) || defined(_M_X64) )
34 
35 #include <tmmintrin.h>
36 #include "gdal_priv_templates.hpp"
37 
38 void GDALUnrolledCopy_GByte_3_1_SSSE3( GByte* CPL_RESTRICT pDest,
39                                              const GByte* CPL_RESTRICT pSrc,
40                                              GInt64 nIters );
41 
GDALUnrolledCopy_GByte_3_1_SSSE3(GByte * CPL_RESTRICT pDest,const GByte * CPL_RESTRICT pSrc,GInt64 nIters)42 void GDALUnrolledCopy_GByte_3_1_SSSE3( GByte* CPL_RESTRICT pDest,
43                                              const GByte* CPL_RESTRICT pSrc,
44                                              GInt64 nIters )
45 {
46     decltype(nIters) i;
47     const __m128i xmm_shuffle0 = _mm_set_epi8(-1  ,-1  ,-1  ,-1,
48                                               -1  ,-1  ,-1  ,-1,
49                                               -1  ,-1  ,15  ,12,
50                                               9   ,6   ,3   ,0);
51     const __m128i xmm_shuffle1 = _mm_set_epi8(-1  ,-1  ,-1  ,-1,
52                                               -1  ,14  ,11  ,8,
53                                               5   ,2   ,-1  ,-1,
54                                               -1  ,-1  ,-1  ,-1);
55     const __m128i xmm_shuffle2 = _mm_set_epi8(13  ,10  ,7   ,4,
56                                               1   ,-1  ,-1  ,-1,
57                                               -1  ,-1  ,-1  ,-1,
58                                               -1  ,-1  ,-1  ,-1);
59     // If we were sure that there would always be 2 trailing bytes, we could
60     // check against nIters - 15
61     for ( i = 0; i < nIters - 16; i += 16 )
62     {
63         __m128i xmm0 = _mm_loadu_si128( reinterpret_cast<__m128i const*>(pSrc + 0) );
64         __m128i xmm1 = _mm_loadu_si128( reinterpret_cast<__m128i const*>(pSrc + 16) );
65         __m128i xmm2 = _mm_loadu_si128( reinterpret_cast<__m128i const*>(pSrc + 32) );
66 
67         // From LSB to MSB:
68         // 0,x,x,1,x,x,2,x,x,3,x,x,4,x,x,5 --> 0,1,2,3,4,5,0,0,0,0,0,0,0,0,0
69         xmm0 = _mm_shuffle_epi8(xmm0, xmm_shuffle0);
70         // x,x,6,x,x,7,x,x,8,x,x,9,x,x,10,x --> 0,0,0,0,0,0,6,7,8,9,10,0,0,0,0,0
71         xmm1 = _mm_shuffle_epi8(xmm1, xmm_shuffle1);
72         // x,11,x,x,12,x,x,13,x,x,14,x,x,15,x,x --> 0,0,0,0,0,0,0,0,0,0,0,11,12,13,14,15
73         xmm2 = _mm_shuffle_epi8(xmm2, xmm_shuffle2);
74         xmm0 = _mm_or_si128(xmm0, xmm1);
75         xmm0 = _mm_or_si128(xmm0, xmm2);
76 
77         _mm_storeu_si128( reinterpret_cast<__m128i*> (pDest + i), xmm0);
78 
79         pSrc += 3 * 16;
80     }
81     for( ; i < nIters; i++ )
82     {
83         pDest[i] = *pSrc;
84         pSrc += 3;
85     }
86 }
87 
88 #endif // HAVE_SSSE3_AT_COMPILE_TIME
89