1 /******************************************************************************
2  * Project:  GDAL
3  * Purpose:  AVX2 emulation with SSE2 + a few SSE4.1 emulation
4  * Author:   Even Rouault <even dot rouault at spatialys dot com>
5  *
6  ******************************************************************************
7  * Copyright (c) 2016, Even Rouault <even dot rouault at spatialys dot com>
8  *
9  * Permission is hereby granted, free of charge, to any person obtaining a
10  * copy of this software and associated documentation files (the "Software"),
11  * to deal in the Software without restriction, including without limitation
12  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
13  * and/or sell copies of the Software, and to permit persons to whom the
14  * Software is furnished to do so, subject to the following conditions:
15  *
16  * The above copyright notice and this permission notice shall be included
17  * in all copies or substantial portions of the Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25  * DEALINGS IN THE SOFTWARE.
26  ****************************************************************************/
27 
28 #ifndef GDAL_AVX2_EMULATION_H_INCLUDED
29 #define GDAL_AVX2_EMULATION_H_INCLUDED
30 
31 #include <emmintrin.h>
32 
33 #ifdef __SSE4_1__
34 #include <smmintrin.h>
35 
// SSE4.1 is available at compile time: the GDALmm_* names are plain aliases
// for the native intrinsics declared in <smmintrin.h>.
#define GDALmm_min_epu16   _mm_min_epu16
#define GDALmm_max_epu16   _mm_max_epu16
#define GDALmm_mullo_epi32 _mm_mullo_epi32
39 
40 #else
// Emulation of SSE4.1 _mm_min_epu16, _mm_max_epu16 and _mm_mullo_epi32 with SSE2 only
42 
// Per-lane unsigned 16-bit "x <= y" comparison.
// Each 16-bit lane of the result is 0xFFFF where x <= y and 0 otherwise.
static inline __m128i GDALAVX2Emul_mm_cmple_epu16(__m128i x, __m128i y)
{
    // The saturating unsigned difference x - y is 0 exactly when x <= y.
    const __m128i clamped_diff = _mm_subs_epu16(x, y);
    const __m128i zero = _mm_setzero_si128();
    return _mm_cmpeq_epi16(clamped_diff, zero);
}
47 
// Bitwise select: for every bit, returns the bit of then_reg where the
// corresponding bit of mask is 1, and the bit of else_reg where it is 0.
static inline __m128i GDALAVX2Emul_mm_ternary(__m128i mask,
                                              __m128i then_reg,
                                              __m128i else_reg)
{
    const __m128i picked_then = _mm_and_si128(mask, then_reg);
    // _mm_andnot_si128(a, b) computes (~a) & b.
    const __m128i picked_else = _mm_andnot_si128(mask, else_reg);
    return _mm_or_si128(picked_then, picked_else);
}
55 
GDALmm_min_epu16(__m128i x,__m128i y)56 static inline __m128i GDALmm_min_epu16 (__m128i x, __m128i y)
57 {
58     const __m128i mask = GDALAVX2Emul_mm_cmple_epu16(x, y);
59     return GDALAVX2Emul_mm_ternary(mask, x, y);
60 }
61 
GDALmm_max_epu16(__m128i x,__m128i y)62 static inline __m128i GDALmm_max_epu16 (__m128i x, __m128i y)
63 {
64     const __m128i mask = GDALAVX2Emul_mm_cmple_epu16(x, y);
65     return GDALAVX2Emul_mm_ternary(mask, y, x);
66 }
67 
// SSE2-only replacement for _mm_mullo_epi32: per-lane 32-bit multiplication
// keeping the low 32 bits of each product.
static inline __m128i GDALmm_mullo_epi32(__m128i x, __m128i y)
{
    // _mm_mul_epu32 multiplies 32-bit lanes 0 and 2 into two 64-bit products.
    const __m128i prod_even = _mm_mul_epu32(x, y);
    // Shifting both operands right by 4 bytes brings lanes 1 and 3 into
    // positions 0 and 2, so the same intrinsic yields the odd-lane products.
    const __m128i prod_odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
                                           _mm_srli_si128(y, 4));
    // Selector value 8 (same as the original 2 << 2): gather the low 32 bits
    // of each 64-bit product (source lanes 0 and 2) into result lanes 0 and 1.
    const __m128i low_even = _mm_shuffle_epi32(prod_even, _MM_SHUFFLE(0, 0, 2, 0));
    const __m128i low_odd = _mm_shuffle_epi32(prod_odd, _MM_SHUFFLE(0, 0, 2, 0));
    // Interleave back into lane order 0, 1, 2, 3.
    return _mm_unpacklo_epi32(low_even, low_odd);
}
76 #endif // __SSE4_1__
77 
78 
79 #ifdef __AVX2__
80 
81 #include <immintrin.h>
82 
// AVX2 is available at compile time: GDALm256i is the native 256-bit integer
// vector type, and every GDALmm256_* name below is a plain alias for the
// _mm256_* intrinsic of the same suffix.
typedef __m256i GDALm256i;

#define GDALmm256_set1_epi8             _mm256_set1_epi8
#define GDALmm256_set1_epi16            _mm256_set1_epi16
#define GDALmm256_setzero_si256         _mm256_setzero_si256
#define GDALmm256_load_si256            _mm256_load_si256
#define GDALmm256_store_si256           _mm256_store_si256
#define GDALmm256_storeu_si256          _mm256_storeu_si256
#define GDALmm256_cmpeq_epi8            _mm256_cmpeq_epi8
#define GDALmm256_sad_epu8              _mm256_sad_epu8
#define GDALmm256_add_epi32             _mm256_add_epi32
#define GDALmm256_andnot_si256          _mm256_andnot_si256
#define GDALmm256_and_si256             _mm256_and_si256
#define GDALmm256_or_si256              _mm256_or_si256
#define GDALmm256_min_epu8              _mm256_min_epu8
#define GDALmm256_max_epu8              _mm256_max_epu8
#define GDALmm256_extracti128_si256     _mm256_extracti128_si256
#define GDALmm256_cvtepu8_epi16         _mm256_cvtepu8_epi16
#define GDALmm256_madd_epi16            _mm256_madd_epi16
#define GDALmm256_min_epu16             _mm256_min_epu16
#define GDALmm256_max_epu16             _mm256_max_epu16
#define GDALmm256_cvtepu16_epi32        _mm256_cvtepu16_epi32
#define GDALmm256_cvtepu16_epi64        _mm256_cvtepu16_epi64
#define GDALmm256_cvtepu32_epi64        _mm256_cvtepu32_epi64
#define GDALmm256_mullo_epi32           _mm256_mullo_epi32
#define GDALmm256_add_epi64             _mm256_add_epi64
#define GDALmm256_add_epi16             _mm256_add_epi16
#define GDALmm256_sub_epi16             _mm256_sub_epi16
#define GDALmm256_min_epi16             _mm256_min_epi16
#define GDALmm256_max_epi16             _mm256_max_epi16
113 
114 #else
115 
// Emulated 256-bit integer register, stored as two 128-bit SSE2 halves.
typedef struct
{
    __m128i low;   // first 16 bytes (bits 0..127)
    __m128i high;  // last 16 bytes (bits 128..255)
} GDALm256i;
121 
GDALmm256_set1_epi8(char c)122 static inline GDALm256i GDALmm256_set1_epi8(char c)
123 {
124     GDALm256i reg;
125     reg.low = _mm_set1_epi8(c);
126     reg.high = _mm_set1_epi8(c);
127     return reg;
128 }
129 
GDALmm256_set1_epi16(short s)130 static inline GDALm256i GDALmm256_set1_epi16(short s)
131 {
132     GDALm256i reg;
133     reg.low = _mm_set1_epi16(s);
134     reg.high = _mm_set1_epi16(s);
135     return reg;
136 }
137 
GDALmm256_setzero_si256()138 static inline GDALm256i GDALmm256_setzero_si256()
139 {
140     GDALm256i reg;
141     reg.low = _mm_setzero_si128();
142     reg.high = _mm_setzero_si128();
143     return reg;
144 }
145 
GDALmm256_load_si256(GDALm256i const * p)146 static inline GDALm256i GDALmm256_load_si256(GDALm256i const * p)
147 {
148     GDALm256i reg;
149     reg.low = _mm_load_si128(reinterpret_cast<__m128i const*>(p));
150     reg.high = _mm_load_si128(reinterpret_cast<__m128i const*>(reinterpret_cast<const char*>(p)+16));
151     return reg;
152 }
153 
GDALmm256_store_si256(GDALm256i * p,GDALm256i reg)154 static inline void GDALmm256_store_si256(GDALm256i * p, GDALm256i reg)
155 {
156     _mm_store_si128(reinterpret_cast<__m128i*>(p), reg.low);
157     _mm_store_si128(reinterpret_cast<__m128i*>(reinterpret_cast<char*>(p)+16), reg.high);
158 }
159 
GDALmm256_storeu_si256(GDALm256i * p,GDALm256i reg)160 static inline void GDALmm256_storeu_si256(GDALm256i * p, GDALm256i reg)
161 {
162     _mm_storeu_si128(reinterpret_cast<__m128i*>(p), reg.low);
163     _mm_storeu_si128(reinterpret_cast<__m128i*>(reinterpret_cast<char*>(p)+16), reg.high);
164 }
165 
166 #define DEFINE_BINARY_MM256(mm256name, mm128name) \
167 static inline GDALm256i mm256name(GDALm256i r1, GDALm256i r2) \
168 { \
169     GDALm256i reg; \
170     reg.low = mm128name(r1.low, r2.low); \
171     reg.high = mm128name(r1.high, r2.high); \
172     return reg; \
173 }
174 
DEFINE_BINARY_MM256(GDALmm256_cmpeq_epi8,_mm_cmpeq_epi8)175 DEFINE_BINARY_MM256(GDALmm256_cmpeq_epi8, _mm_cmpeq_epi8)
176 DEFINE_BINARY_MM256(GDALmm256_sad_epu8, _mm_sad_epu8)
177 DEFINE_BINARY_MM256(GDALmm256_add_epi32, _mm_add_epi32)
178 DEFINE_BINARY_MM256(GDALmm256_andnot_si256, _mm_andnot_si128)
179 DEFINE_BINARY_MM256(GDALmm256_and_si256, _mm_and_si128)
180 DEFINE_BINARY_MM256(GDALmm256_or_si256, _mm_or_si128)
181 DEFINE_BINARY_MM256(GDALmm256_min_epu8, _mm_min_epu8)
182 DEFINE_BINARY_MM256(GDALmm256_max_epu8, _mm_max_epu8)
183 DEFINE_BINARY_MM256(GDALmm256_madd_epi16, _mm_madd_epi16)
184 DEFINE_BINARY_MM256(GDALmm256_min_epu16, GDALmm_min_epu16)
185 DEFINE_BINARY_MM256(GDALmm256_max_epu16, GDALmm_max_epu16)
186 DEFINE_BINARY_MM256(GDALmm256_mullo_epi32, GDALmm_mullo_epi32)
187 DEFINE_BINARY_MM256(GDALmm256_add_epi64, _mm_add_epi64)
188 DEFINE_BINARY_MM256(GDALmm256_add_epi16, _mm_add_epi16)
189 DEFINE_BINARY_MM256(GDALmm256_sub_epi16, _mm_sub_epi16)
190 DEFINE_BINARY_MM256(GDALmm256_min_epi16, _mm_min_epi16)
191 DEFINE_BINARY_MM256(GDALmm256_max_epi16, _mm_max_epi16)
192 
193 static inline __m128i GDALmm256_extracti128_si256(GDALm256i reg, int index)
194 {
195     return (index == 0) ? reg.low : reg.high;
196 }
197 
GDALmm256_cvtepu8_epi16(__m128i reg128)198 static inline GDALm256i GDALmm256_cvtepu8_epi16(__m128i reg128)
199 {
200     GDALm256i reg;
201     reg.low = _mm_unpacklo_epi8(reg128, _mm_setzero_si128());
202     reg.high = _mm_unpacklo_epi8(_mm_shuffle_epi32(reg128, 2 | (3 << 2)),
203                                  _mm_setzero_si128());
204     return reg;
205 }
206 
GDALmm256_cvtepu16_epi32(__m128i reg128)207 static inline GDALm256i GDALmm256_cvtepu16_epi32(__m128i reg128)
208 {
209     GDALm256i reg;
210     reg.low = _mm_unpacklo_epi16(reg128, _mm_setzero_si128());
211     reg.high = _mm_unpacklo_epi16(_mm_shuffle_epi32(reg128, 2 | (3 << 2)),
212                                   _mm_setzero_si128());
213     return reg;
214 }
215 
GDALmm256_cvtepu16_epi64(__m128i reg128)216 static inline GDALm256i GDALmm256_cvtepu16_epi64(__m128i reg128)
217 {
218     GDALm256i reg;
219     reg.low = _mm_unpacklo_epi32(_mm_unpacklo_epi16(reg128,
220                                                     _mm_setzero_si128()),
221                                  _mm_setzero_si128());
222     reg.high = _mm_unpacklo_epi32(_mm_unpacklo_epi16(
223                                      _mm_srli_si128(reg128, 4),
224                                                     _mm_setzero_si128()),
225                                      _mm_setzero_si128());
226     return reg;
227 }
228 
GDALmm256_cvtepu32_epi64(__m128i reg128)229 static inline GDALm256i GDALmm256_cvtepu32_epi64(__m128i reg128)
230 {
231     GDALm256i reg;
232     reg.low = _mm_unpacklo_epi32(reg128, _mm_setzero_si128());
233     reg.high = _mm_unpacklo_epi32(_mm_shuffle_epi32(reg128, 2 | (3 << 2)),
234                                   _mm_setzero_si128());
235     return reg;
236 }
237 
238 #endif
239 
240 #endif /* GDAL_AVX2_EMULATION_H_INCLUDED */
241