1 /******************************************************************************
2 * Project: GDAL
3 * Purpose: AVX2 emulation with SSE2 + a few SSE4.1 emulation
4 * Author: Even Rouault <even dot rouault at spatialys dot com>
5 *
6 ******************************************************************************
7 * Copyright (c) 2016, Even Rouault <even dot rouault at spatialys dot com>
8 *
9 * Permission is hereby granted, free of charge, to any person obtaining a
10 * copy of this software and associated documentation files (the "Software"),
11 * to deal in the Software without restriction, including without limitation
12 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 * and/or sell copies of the Software, and to permit persons to whom the
14 * Software is furnished to do so, subject to the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be included
17 * in all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 * DEALINGS IN THE SOFTWARE.
26 ****************************************************************************/
27
28 #ifndef GDAL_AVX2_EMULATION_H_INCLUDED
29 #define GDAL_AVX2_EMULATION_H_INCLUDED
30
31 #include <emmintrin.h>
32
33 #ifdef __SSE4_1__
34 #include <smmintrin.h>
35
// SSE4.1 is enabled: the GDAL wrapper names forward straight to the native
// 128-bit intrinsics from <smmintrin.h>, so no emulation is needed.
#define GDALmm_min_epu16    _mm_min_epu16
#define GDALmm_max_epu16    _mm_max_epu16
#define GDALmm_mullo_epi32  _mm_mullo_epi32
39
40 #else
// Emulation of SSE4.1 _mm_min_epu16, _mm_max_epu16 and _mm_mullo_epi32 with SSE2 only
42
// Lane-wise unsigned 16-bit "x <= y" test built from SSE2 instructions only.
// Returns 0xFFFF in each 16-bit lane where x <= y, and 0x0000 elsewhere.
static inline __m128i GDALAVX2Emul_mm_cmple_epu16(__m128i x, __m128i y)
{
    // Unsigned saturating subtraction clamps at zero, so the difference is
    // zero exactly in the lanes where x does not exceed y.
    const __m128i saturated_diff = _mm_subs_epu16(x, y);
    return _mm_cmpeq_epi16(saturated_diff, _mm_setzero_si128());
}
47
// Bitwise select: each result bit comes from then_reg where the matching
// bit of mask is 1, and from else_reg where it is 0.
static inline __m128i GDALAVX2Emul_mm_ternary(__m128i mask,
                                              __m128i then_reg,
                                              __m128i else_reg)
{
    const __m128i from_then = _mm_and_si128(mask, then_reg);
    // _mm_andnot_si128 computes (~mask) & else_reg.
    const __m128i from_else = _mm_andnot_si128(mask, else_reg);
    return _mm_or_si128(from_then, from_else);
}
55
GDALmm_min_epu16(__m128i x,__m128i y)56 static inline __m128i GDALmm_min_epu16 (__m128i x, __m128i y)
57 {
58 const __m128i mask = GDALAVX2Emul_mm_cmple_epu16(x, y);
59 return GDALAVX2Emul_mm_ternary(mask, x, y);
60 }
61
GDALmm_max_epu16(__m128i x,__m128i y)62 static inline __m128i GDALmm_max_epu16 (__m128i x, __m128i y)
63 {
64 const __m128i mask = GDALAVX2Emul_mm_cmple_epu16(x, y);
65 return GDALAVX2Emul_mm_ternary(mask, y, x);
66 }
67
// SSE2-only replacement for SSE4.1 _mm_mullo_epi32: multiplies the four
// 32-bit lanes of x and y and keeps the low 32 bits of each product.
static inline __m128i GDALmm_mullo_epi32(__m128i x, __m128i y)
{
    // _mm_mul_epu32 multiplies only dwords 0 and 2, producing two 64-bit
    // products. Shifting both operands right by 4 bytes moves dwords 1 and 3
    // into those positions for a second multiply.
    const __m128i prod_even = _mm_mul_epu32(x, y);
    const __m128i prod_odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
                                           _mm_srli_si128(y, 4));

    // Shuffle control (2 << 2) keeps dword 0 in slot 0 and copies dword 2
    // into slot 1, i.e. it gathers the low halves of both 64-bit products
    // into the bottom two dwords.
    const __m128i low_even = _mm_shuffle_epi32(prod_even, 2 << 2);
    const __m128i low_odd = _mm_shuffle_epi32(prod_odd, 2 << 2);

    // Interleave to restore the lane order 0, 1, 2, 3.
    return _mm_unpacklo_epi32(low_even, low_odd);
}
76 #endif // __SSE4_1__
77
78
79 #ifdef __AVX2__
80
81 #include <immintrin.h>
82
// Native AVX2 build: the 256-bit register type and every GDALmm256_* name
// map one-to-one onto the real AVX2 type and intrinsics.
typedef __m256i GDALm256i;

#define GDALmm256_set1_epi8          _mm256_set1_epi8
#define GDALmm256_set1_epi16         _mm256_set1_epi16
#define GDALmm256_setzero_si256      _mm256_setzero_si256
#define GDALmm256_load_si256         _mm256_load_si256
#define GDALmm256_store_si256        _mm256_store_si256
#define GDALmm256_storeu_si256       _mm256_storeu_si256
#define GDALmm256_cmpeq_epi8         _mm256_cmpeq_epi8
#define GDALmm256_sad_epu8           _mm256_sad_epu8
#define GDALmm256_add_epi32          _mm256_add_epi32
#define GDALmm256_andnot_si256       _mm256_andnot_si256
#define GDALmm256_and_si256          _mm256_and_si256
#define GDALmm256_or_si256           _mm256_or_si256
#define GDALmm256_min_epu8           _mm256_min_epu8
#define GDALmm256_max_epu8           _mm256_max_epu8
#define GDALmm256_extracti128_si256  _mm256_extracti128_si256
#define GDALmm256_cvtepu8_epi16      _mm256_cvtepu8_epi16
#define GDALmm256_madd_epi16         _mm256_madd_epi16
#define GDALmm256_min_epu16          _mm256_min_epu16
#define GDALmm256_max_epu16          _mm256_max_epu16
#define GDALmm256_cvtepu16_epi32     _mm256_cvtepu16_epi32
#define GDALmm256_cvtepu16_epi64     _mm256_cvtepu16_epi64
#define GDALmm256_cvtepu32_epi64     _mm256_cvtepu32_epi64
#define GDALmm256_mullo_epi32        _mm256_mullo_epi32
#define GDALmm256_add_epi64          _mm256_add_epi64
#define GDALmm256_add_epi16          _mm256_add_epi16
#define GDALmm256_sub_epi16          _mm256_sub_epi16
#define GDALmm256_min_epi16          _mm256_min_epi16
#define GDALmm256_max_epi16          _mm256_max_epi16
113
114 #else
115
// Emulated 256-bit integer register for builds without AVX2: two 128-bit
// SSE registers. 'low' holds bytes 0..15 and 'high' holds bytes 16..31,
// matching the order in which the load/store helpers below access memory.
typedef struct
{
    __m128i low;
    __m128i high;
} GDALm256i;
121
GDALmm256_set1_epi8(char c)122 static inline GDALm256i GDALmm256_set1_epi8(char c)
123 {
124 GDALm256i reg;
125 reg.low = _mm_set1_epi8(c);
126 reg.high = _mm_set1_epi8(c);
127 return reg;
128 }
129
GDALmm256_set1_epi16(short s)130 static inline GDALm256i GDALmm256_set1_epi16(short s)
131 {
132 GDALm256i reg;
133 reg.low = _mm_set1_epi16(s);
134 reg.high = _mm_set1_epi16(s);
135 return reg;
136 }
137
GDALmm256_setzero_si256()138 static inline GDALm256i GDALmm256_setzero_si256()
139 {
140 GDALm256i reg;
141 reg.low = _mm_setzero_si128();
142 reg.high = _mm_setzero_si128();
143 return reg;
144 }
145
GDALmm256_load_si256(GDALm256i const * p)146 static inline GDALm256i GDALmm256_load_si256(GDALm256i const * p)
147 {
148 GDALm256i reg;
149 reg.low = _mm_load_si128(reinterpret_cast<__m128i const*>(p));
150 reg.high = _mm_load_si128(reinterpret_cast<__m128i const*>(reinterpret_cast<const char*>(p)+16));
151 return reg;
152 }
153
GDALmm256_store_si256(GDALm256i * p,GDALm256i reg)154 static inline void GDALmm256_store_si256(GDALm256i * p, GDALm256i reg)
155 {
156 _mm_store_si128(reinterpret_cast<__m128i*>(p), reg.low);
157 _mm_store_si128(reinterpret_cast<__m128i*>(reinterpret_cast<char*>(p)+16), reg.high);
158 }
159
GDALmm256_storeu_si256(GDALm256i * p,GDALm256i reg)160 static inline void GDALmm256_storeu_si256(GDALm256i * p, GDALm256i reg)
161 {
162 _mm_storeu_si128(reinterpret_cast<__m128i*>(p), reg.low);
163 _mm_storeu_si128(reinterpret_cast<__m128i*>(reinterpret_cast<char*>(p)+16), reg.high);
164 }
165
// Generates the emulated 256-bit form of a two-operand intrinsic: the
// function mm256name applies the 128-bit operation mm128name independently
// to the low halves and to the high halves of its two arguments.
#define DEFINE_BINARY_MM256(mm256name, mm128name)                     \
    static inline GDALm256i mm256name(GDALm256i r1, GDALm256i r2)     \
    {                                                                 \
        GDALm256i reg;                                                \
        reg.low = mm128name(r1.low, r2.low);                          \
        reg.high = mm128name(r1.high, r2.high);                       \
        return reg;                                                   \
    }
174
// Emulated two-operand 256-bit operations, each built by running the named
// 128-bit operation on both halves. The GDALmm_* entries resolve either to
// the native SSE4.1 intrinsic or to its SSE2 emulation defined above.
DEFINE_BINARY_MM256(GDALmm256_cmpeq_epi8, _mm_cmpeq_epi8)
DEFINE_BINARY_MM256(GDALmm256_sad_epu8, _mm_sad_epu8)
DEFINE_BINARY_MM256(GDALmm256_add_epi32, _mm_add_epi32)
DEFINE_BINARY_MM256(GDALmm256_andnot_si256, _mm_andnot_si128)
DEFINE_BINARY_MM256(GDALmm256_and_si256, _mm_and_si128)
DEFINE_BINARY_MM256(GDALmm256_or_si256, _mm_or_si128)
DEFINE_BINARY_MM256(GDALmm256_min_epu8, _mm_min_epu8)
DEFINE_BINARY_MM256(GDALmm256_max_epu8, _mm_max_epu8)
DEFINE_BINARY_MM256(GDALmm256_madd_epi16, _mm_madd_epi16)
DEFINE_BINARY_MM256(GDALmm256_min_epu16, GDALmm_min_epu16)
DEFINE_BINARY_MM256(GDALmm256_max_epu16, GDALmm_max_epu16)
DEFINE_BINARY_MM256(GDALmm256_mullo_epi32, GDALmm_mullo_epi32)
DEFINE_BINARY_MM256(GDALmm256_add_epi64, _mm_add_epi64)
DEFINE_BINARY_MM256(GDALmm256_add_epi16, _mm_add_epi16)
DEFINE_BINARY_MM256(GDALmm256_sub_epi16, _mm_sub_epi16)
DEFINE_BINARY_MM256(GDALmm256_min_epi16, _mm_min_epi16)
DEFINE_BINARY_MM256(GDALmm256_max_epi16, _mm_max_epi16)
192
193 static inline __m128i GDALmm256_extracti128_si256(GDALm256i reg, int index)
194 {
195 return (index == 0) ? reg.low : reg.high;
196 }
197
GDALmm256_cvtepu8_epi16(__m128i reg128)198 static inline GDALm256i GDALmm256_cvtepu8_epi16(__m128i reg128)
199 {
200 GDALm256i reg;
201 reg.low = _mm_unpacklo_epi8(reg128, _mm_setzero_si128());
202 reg.high = _mm_unpacklo_epi8(_mm_shuffle_epi32(reg128, 2 | (3 << 2)),
203 _mm_setzero_si128());
204 return reg;
205 }
206
GDALmm256_cvtepu16_epi32(__m128i reg128)207 static inline GDALm256i GDALmm256_cvtepu16_epi32(__m128i reg128)
208 {
209 GDALm256i reg;
210 reg.low = _mm_unpacklo_epi16(reg128, _mm_setzero_si128());
211 reg.high = _mm_unpacklo_epi16(_mm_shuffle_epi32(reg128, 2 | (3 << 2)),
212 _mm_setzero_si128());
213 return reg;
214 }
215
GDALmm256_cvtepu16_epi64(__m128i reg128)216 static inline GDALm256i GDALmm256_cvtepu16_epi64(__m128i reg128)
217 {
218 GDALm256i reg;
219 reg.low = _mm_unpacklo_epi32(_mm_unpacklo_epi16(reg128,
220 _mm_setzero_si128()),
221 _mm_setzero_si128());
222 reg.high = _mm_unpacklo_epi32(_mm_unpacklo_epi16(
223 _mm_srli_si128(reg128, 4),
224 _mm_setzero_si128()),
225 _mm_setzero_si128());
226 return reg;
227 }
228
GDALmm256_cvtepu32_epi64(__m128i reg128)229 static inline GDALm256i GDALmm256_cvtepu32_epi64(__m128i reg128)
230 {
231 GDALm256i reg;
232 reg.low = _mm_unpacklo_epi32(reg128, _mm_setzero_si128());
233 reg.high = _mm_unpacklo_epi32(_mm_shuffle_epi32(reg128, 2 | (3 << 2)),
234 _mm_setzero_si128());
235 return reg;
236 }
237
238 #endif
239
240 #endif /* GDAL_AVX2_EMULATION_H_INCLUDED */
241