1 /******************************************************************************
2 *
3 * Project: GDAL Core
4 * Purpose: SSSE3 specializations
5 * Author: Even Rouault <even dot rouault at spatialys dot com>
6 *
7 ******************************************************************************
8 * Copyright (c) 2016, Even Rouault <even dot rouault at spatialys dot com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a
11 * copy of this software and associated documentation files (the "Software"),
12 * to deal in the Software without restriction, including without limitation
13 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
14 * and/or sell copies of the Software, and to permit persons to whom the
15 * Software is furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included
18 * in all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 * DEALINGS IN THE SOFTWARE.
27 ****************************************************************************/
28
29 #include "cpl_port.h"
30
31 CPL_CVSID("$Id: rasterio_ssse3.cpp 9afe586299c716a3b3473ae00ed79aaa487144c4 2020-12-09 09:31:20 +0100 Even Rouault $")
32
33 #if defined(HAVE_SSSE3_AT_COMPILE_TIME) && ( defined(__x86_64) || defined(_M_X64) )
34
35 #include <tmmintrin.h>
36 #include "gdal_priv_templates.hpp"
37
38 void GDALUnrolledCopy_GByte_3_1_SSSE3( GByte* CPL_RESTRICT pDest,
39 const GByte* CPL_RESTRICT pSrc,
40 GInt64 nIters );
41
GDALUnrolledCopy_GByte_3_1_SSSE3(GByte * CPL_RESTRICT pDest,const GByte * CPL_RESTRICT pSrc,GInt64 nIters)42 void GDALUnrolledCopy_GByte_3_1_SSSE3( GByte* CPL_RESTRICT pDest,
43 const GByte* CPL_RESTRICT pSrc,
44 GInt64 nIters )
45 {
46 decltype(nIters) i;
47 const __m128i xmm_shuffle0 = _mm_set_epi8(-1 ,-1 ,-1 ,-1,
48 -1 ,-1 ,-1 ,-1,
49 -1 ,-1 ,15 ,12,
50 9 ,6 ,3 ,0);
51 const __m128i xmm_shuffle1 = _mm_set_epi8(-1 ,-1 ,-1 ,-1,
52 -1 ,14 ,11 ,8,
53 5 ,2 ,-1 ,-1,
54 -1 ,-1 ,-1 ,-1);
55 const __m128i xmm_shuffle2 = _mm_set_epi8(13 ,10 ,7 ,4,
56 1 ,-1 ,-1 ,-1,
57 -1 ,-1 ,-1 ,-1,
58 -1 ,-1 ,-1 ,-1);
59 // If we were sure that there would always be 2 trailing bytes, we could
60 // check against nIters - 15
61 for ( i = 0; i < nIters - 16; i += 16 )
62 {
63 __m128i xmm0 = _mm_loadu_si128( reinterpret_cast<__m128i const*>(pSrc + 0) );
64 __m128i xmm1 = _mm_loadu_si128( reinterpret_cast<__m128i const*>(pSrc + 16) );
65 __m128i xmm2 = _mm_loadu_si128( reinterpret_cast<__m128i const*>(pSrc + 32) );
66
67 // From LSB to MSB:
68 // 0,x,x,1,x,x,2,x,x,3,x,x,4,x,x,5 --> 0,1,2,3,4,5,0,0,0,0,0,0,0,0,0
69 xmm0 = _mm_shuffle_epi8(xmm0, xmm_shuffle0);
70 // x,x,6,x,x,7,x,x,8,x,x,9,x,x,10,x --> 0,0,0,0,0,0,6,7,8,9,10,0,0,0,0,0
71 xmm1 = _mm_shuffle_epi8(xmm1, xmm_shuffle1);
72 // x,11,x,x,12,x,x,13,x,x,14,x,x,15,x,x --> 0,0,0,0,0,0,0,0,0,0,0,11,12,13,14,15
73 xmm2 = _mm_shuffle_epi8(xmm2, xmm_shuffle2);
74 xmm0 = _mm_or_si128(xmm0, xmm1);
75 xmm0 = _mm_or_si128(xmm0, xmm2);
76
77 _mm_storeu_si128( reinterpret_cast<__m128i*> (pDest + i), xmm0);
78
79 pSrc += 3 * 16;
80 }
81 for( ; i < nIters; i++ )
82 {
83 pDest[i] = *pSrc;
84 pSrc += 3;
85 }
86 }
87
88 #endif // HAVE_SSSE3_AT_COMPILE_TIME
89