// SIMD helper
// Selects double, float and integer (32-bit) SIMD instructions based on the available technology
// Written by Martin Steinegger

#ifndef SIMD_H
#define SIMD_H
#include <cstdlib>
#include <cstdio>
#include <cstdint>
#include <limits>
#include <algorithm>
#include <iostream>

#define AVX512_ALIGN_DOUBLE		64
#define AVX512_VECSIZE_DOUBLE	8
#define AVX512_ALIGN_FLOAT		64
#define AVX512_VECSIZE_FLOAT	16
#define AVX512_ALIGN_INT		64
#define AVX512_VECSIZE_INT		16

#define AVX_ALIGN_DOUBLE		32
#define AVX_VECSIZE_DOUBLE		4
#define AVX_ALIGN_FLOAT			32
#define AVX_VECSIZE_FLOAT		8
#define AVX2_ALIGN_INT			32
#define AVX2_VECSIZE_INT		8

#define SSE_ALIGN_DOUBLE		16
#define SSE_VECSIZE_DOUBLE		2
#define SSE_ALIGN_FLOAT			16
#define SSE_VECSIZE_FLOAT		4
#define SSE_ALIGN_INT			16
#define SSE_VECSIZE_INT			4

#define MAX_ALIGN_DOUBLE	AVX512_ALIGN_DOUBLE
#define MAX_VECSIZE_DOUBLE	AVX512_VECSIZE_DOUBLE
#define MAX_ALIGN_FLOAT		AVX512_ALIGN_FLOAT
#define MAX_VECSIZE_FLOAT	AVX512_VECSIZE_FLOAT
#define MAX_ALIGN_INT		AVX512_ALIGN_INT
#define MAX_VECSIZE_INT		AVX512_VECSIZE_INT
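// The MAX_* constants mirror the widest supported width (AVX-512), so buffers
// sized and aligned with them remain valid for every vector width below.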

#define SIMDE_ENABLE_NATIVE_ALIASES
#include <simde/simde-features.h>

// FIXME: Finish AVX512 implementation
//#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE)
//#define AVX512
//#endif

#if defined(AVX512) || defined(SIMDE_X86_AVX2_NATIVE)
#define AVX2
#endif
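// Every CPU with AVX-512 also supports AVX2, so the AVX2 code paths stay enabled.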

#ifdef AVX512
#include <simde/x86/avx512f.h>
#include <simde/x86/avx512bw.h>

// double support
#ifndef SIMD_DOUBLE
#define SIMD_DOUBLE
#define ALIGN_DOUBLE        AVX512_ALIGN_DOUBLE
#define VECSIZE_DOUBLE      AVX512_VECSIZE_DOUBLE
typedef __m512d simd_double;
#define simdf64_add(x,y)    _mm512_add_pd(x,y)
#define simdf64_sub(x,y)    _mm512_sub_pd(x,y)
#define simdf64_mul(x,y)    _mm512_mul_pd(x,y)
#define simdf64_div(x,y)    _mm512_div_pd(x,y)
#define simdf64_max(x,y)    _mm512_max_pd(x,y)
#define simdf64_load(x)     _mm512_load_pd(x)
#define simdf64_store(x,y)  _mm512_store_pd(x,y)
#define simdf64_set(x)      _mm512_set1_pd(x)
#define simdf64_setzero(x)  _mm512_setzero_pd()
#define simdf64_gt(x,y)     _mm512_cmpnle_pd_mask(x,y)
#define simdf64_lt(x,y)     _mm512_cmplt_pd_mask(x,y)
#define simdf64_or(x,y)     _mm512_or_si512(x,y)
#define simdf64_and(x,y)    _mm512_and_si512(x,y)
#define simdf64_andnot(x,y) _mm512_andnot_si512(x,y)
#define simdf64_xor(x,y)    _mm512_xor_si512(x,y)
#endif //SIMD_DOUBLE
// float support
#ifndef SIMD_FLOAT
#define SIMD_FLOAT
#define ALIGN_FLOAT         AVX512_ALIGN_FLOAT
#define VECSIZE_FLOAT       AVX512_VECSIZE_FLOAT
typedef __m512  simd_float;
#define simdf32_add(x,y)    _mm512_add_ps(x,y)
#define simdf32_sub(x,y)    _mm512_sub_ps(x,y)
#define simdf32_mul(x,y)    _mm512_mul_ps(x,y)
#define simdf32_div(x,y)    _mm512_div_ps(x,y)
#define simdf32_rcp(x)      _mm512_rcp_ps(x)
#define simdf32_max(x,y)    _mm512_max_ps(x,y)
#define simdf32_min(x,y)    _mm512_min_ps(x,y)
#define simdf32_load(x)     _mm512_load_ps(x)
#define simdf32_store(x,y)  _mm512_store_ps(x,y)
#define simdf32_set(x)      _mm512_set1_ps(x)
#define simdf32_setzero(x)  _mm512_setzero_ps()
#define simdf32_gt(x,y)     _mm512_cmpnle_ps_mask(x,y)
#define simdf32_eq(x,y)     _mm512_cmpeq_ps_mask(x,y)
#define simdf32_lt(x,y)     _mm512_cmplt_ps_mask(x,y)
#define simdf32_or(x,y)     _mm512_or_si512(x,y)
#define simdf32_and(x,y)    _mm512_and_si512(x,y)
#define simdf32_andnot(x,y) _mm512_andnot_si512(x,y)
#define simdf32_xor(x,y)    _mm512_xor_si512(x,y)
#define simdf32_f2i(x) 	    _mm512_cvtps_epi32(x)  // convert s.p. float to integer
#define simdf_f2icast(x)    _mm512_castps_si512(x)
#endif //SIMD_FLOAT
// integer support
#ifndef SIMD_INT
#define SIMD_INT
#define ALIGN_INT           AVX512_ALIGN_INT
#define VECSIZE_INT         AVX512_VECSIZE_INT
typedef __m512i simd_int;
#define simdi32_add(x,y)    _mm512_add_epi32(x,y)
#define simdi16_add(x,y)    _mm512_add_epi16(x,y)
#define simdi16_adds(x,y)   _mm512_adds_epi16(x,y)
#define simdui8_adds(x,y)   _mm512_adds_epu8(x,y)
#define simdi32_sub(x,y)    _mm512_sub_epi32(x,y)
#define simdui8_subs(x,y)   _mm512_subs_epu8(x,y)
#define simdi32_mul(x,y)    _mm512_mullo_epi32(x,y)
#define simdui8_max(x,y)    _mm512_max_epu8(x,y)
#define simdi16_max(x,y)    _mm512_max_epi16(x,y)
#define simdi32_max(x,y)    _mm512_max_epi32(x,y)
#define simdi_load(x)       _mm512_load_si512(x)
#define simdi_streamload(x) _mm512_stream_load_si512(x)
#define simdi_store(x,y)    _mm512_store_si512(x,y)
#define simdi_storeu(x,y)   _mm512_storeu_si512(x,y)
#define simdi32_set(x)      _mm512_set1_epi32(x)
#define simdi16_set(x)      _mm512_set1_epi16(x)
#define simdi8_set(x)       _mm512_set1_epi8(x)
#define simdi32_shuffle(x,y) _mm512_shuffle_epi32(x,y)
#define simdi16_shuffle(x,y) NOT_YET_IMP(x,y)
#define simdi8_shuffle(x,y)  _mm512_shuffle_epi8(x,y)
#define simdi_setzero()     _mm512_setzero_si512()
#define simdi32_gt(x,y)     _mm512_cmpgt_epi32(x,y)
#define simdi8_gt(x,y)      NOT_YET_IMP()
#define simdi16_gt(x,y)     NOT_YET_IMP()
#define simdi8_eq(x,y)      NOT_YET_IMP()
#define simdi32_lt(x,y)     NOT_YET_IMP()
#define simdi16_lt(x,y)     NOT_YET_IMP()
#define simdi8_lt(x,y)      NOT_YET_IMP()

#define simdi_or(x,y)       _mm512_or_si512(x,y)
#define simdi_and(x,y)      _mm512_and_si512(x,y)
#define simdi_andnot(x,y)   _mm512_andnot_si512(x,y)
#define simdi_xor(x,y)      _mm512_xor_si512(x,y)
#define simdi8_shiftl(x,y)  NOT_YET_IMP()
#define simdi8_shiftr(x,y)  NOT_YET_IMP()
#define simdi8_movemask(x)  NOT_YET_IMP()
#define simdi16_extract(x,y) NOT_YET_IMP()
#define simdi16_slli(x,y)	_mm512_slli_epi16(x,y) // shift integers in a left by y
#define simdi16_srli(x,y)	_mm512_srli_epi16(x,y) // shift integers in a right by y
#define simdi32_slli(x,y)	_mm512_slli_epi32(x,y) // shift integers in a left by y
#define simdi32_srli(x,y)	_mm512_srli_epi32(x,y) // shift integers in a right by y
#define simdi32_i2f(x) 	    _mm512_cvtepi32_ps(x)  // convert integer to s.p. float
#define simdi_i2fcast(x)    _mm512_castsi512_ps(x)
#endif //SIMD_INT
#endif //AVX512


#ifdef AVX2
#include <simde/x86/avx2.h>
// integer support (usable with AVX2)
#ifndef SIMD_INT
#define SIMD_INT
#define ALIGN_INT           AVX2_ALIGN_INT
#define VECSIZE_INT         AVX2_VECSIZE_INT
uint16_t simd_hmax16_sse(const __m128i buffer);
uint8_t simd_hmax8_sse(const __m128i buffer);
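// The 256-bit horizontal max helpers below split the vector into its two
// 128-bit halves, reduce each half with the SSE helpers declared above
// (defined further down), and combine the two results with std::max.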
inline uint16_t simd_hmax16_avx(const __m256i buffer) {
    const __m128i abcd = _mm256_castsi256_si128(buffer);
    const uint16_t first = simd_hmax16_sse(abcd);
    const __m128i efgh = _mm256_extracti128_si256(buffer, 1);
    const uint16_t second = simd_hmax16_sse(efgh);
    return std::max(first, second);
}

inline uint8_t simd_hmax8_avx(const __m256i buffer) {
    const __m128i abcd = _mm256_castsi256_si128(buffer);
    const uint8_t first = simd_hmax8_sse(abcd);
    const __m128i efgh = _mm256_extracti128_si256(buffer, 1);
    const uint8_t second = simd_hmax8_sse(efgh);
    return std::max(first, second);
}

template  <unsigned int N>
inline __m256i _mm256_shift_left(__m256i a) {
    __m256i mask = _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0,0,3,0) );
    return _mm256_alignr_epi8(a,mask,16-N);
}
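// _mm256_alignr_epi8 only operates within each 128-bit lane, so a plain shift
// cannot move bytes across the lane boundary. _mm256_shift_left therefore
// first builds `mask`, whose upper lane holds the lower lane of `a` and whose
// lower lane is zero, and then stitches it together with `a`; the result is a
// byte-wise left shift by N across the full 256-bit register with zeros
// shifted in.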

inline unsigned short extract_epi16(__m256i v, int pos) {
    switch(pos){
        case 0: return _mm256_extract_epi16(v, 0);
        case 1: return _mm256_extract_epi16(v, 1);
        case 2: return _mm256_extract_epi16(v, 2);
        case 3: return _mm256_extract_epi16(v, 3);
        case 4: return _mm256_extract_epi16(v, 4);
        case 5: return _mm256_extract_epi16(v, 5);
        case 6: return _mm256_extract_epi16(v, 6);
        case 7: return _mm256_extract_epi16(v, 7);
        case 8: return _mm256_extract_epi16(v, 8);
        case 9: return _mm256_extract_epi16(v, 9);
        case 10: return _mm256_extract_epi16(v, 10);
        case 11: return _mm256_extract_epi16(v, 11);
        case 12: return _mm256_extract_epi16(v, 12);
        case 13: return _mm256_extract_epi16(v, 13);
        case 14: return _mm256_extract_epi16(v, 14);
        case 15: return _mm256_extract_epi16(v, 15);
    }
    return 0;
}
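// _mm256_extract_epi16 requires its lane index to be a compile-time constant,
// so the switch above maps a runtime position onto the matching immediate.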

typedef __m256i simd_int;
#define simdi32_add(x,y)    _mm256_add_epi32(x,y)
#define simdi16_add(x,y)    _mm256_add_epi16(x,y)
#define simdi16_adds(x,y)   _mm256_adds_epi16(x,y)
#define simdui8_adds(x,y)   _mm256_adds_epu8(x,y)
#define simdi32_sub(x,y)    _mm256_sub_epi32(x,y)
#define simdui16_subs(x,y)  _mm256_subs_epu16(x,y)
#define simdui8_subs(x,y)   _mm256_subs_epu8(x,y)
#define simdi32_mul(x,y)    _mm256_mullo_epi32(x,y)
#define simdi32_max(x,y)    _mm256_max_epi32(x,y)
#define simdi16_max(x,y)    _mm256_max_epi16(x,y)
#define simdi32_insert(x,y,z) _mm256_insert_epi32(x,y,z)
#define simdi32_extract(x,y) _mm256_extract_epi32(x,y)
#define simdi16_hmax(x)     simd_hmax16_avx(x)
#define simdui8_max(x,y)    _mm256_max_epu8(x,y)
#define simdi8_hmax(x)      simd_hmax8_avx(x)
#define simdi_load(x)       _mm256_load_si256(x)
#define simdi_loadu(x)      _mm256_loadu_si256(x)
#define simdi_streamload(x) _mm256_stream_load_si256(x)
#define simdi_store(x,y)    _mm256_store_si256(x,y)
#define simdi_storeu(x,y)   _mm256_storeu_si256(x,y)
#define simdi32_set(x)      _mm256_set1_epi32(x)
#define simdi16_set(x)      _mm256_set1_epi16(x)
#define simdi8_set(x)       _mm256_set1_epi8(x)
#define simdi32_shuffle(x,y) _mm256_shuffle_epi32(x,y)
#define simdi16_shuffle(x,y) _mm256_shuffle_epi16(x,y)
#define simdi8_shuffle(x,y)  _mm256_shuffle_epi8(x,y)
#define simdi_setzero()     _mm256_setzero_si256()
#define simdi8_blend(x,y,z) _mm256_blendv_epi8(x,y,z)
#define simdi32_gt(x,y)     _mm256_cmpgt_epi32(x,y)
#define simdi8_gt(x,y)      _mm256_cmpgt_epi8(x,y)
#define simdi16_gt(x,y)     _mm256_cmpgt_epi16(x,y)
#define simdi8_eq(x,y)      _mm256_cmpeq_epi8(x,y)
#define simdi16_eq(x,y)     _mm256_cmpeq_epi16(x,y)
#define simdi32_eq(x,y)     _mm256_cmpeq_epi32(x,y)
#define simdi32_lt(x,y)     _mm256_cmpgt_epi32(y,x) // inverse
#define simdi16_lt(x,y)     _mm256_cmpgt_epi16(y,x) // inverse
#define simdi8_lt(x,y)      _mm256_cmpgt_epi8(y,x)
#define simdi_or(x,y)       _mm256_or_si256(x,y)
#define simdi_and(x,y)      _mm256_and_si256(x,y)
#define simdi_andnot(x,y)   _mm256_andnot_si256(x,y)
#define simdi_xor(x,y)      _mm256_xor_si256(x,y)
#define simdi8_shiftl(x,y)  _mm256_shift_left<y>(x)
//TODO fix like shift_left
#define simdi8_shiftr(x,y)  _mm256_srli_si256(x,y)
#define SIMD_MOVEMASK_MAX   0xffffffff
#define simdi8_movemask(x)  _mm256_movemask_epi8(x)
#define simdi16_extract(x,y) extract_epi16(x,y)
#define simdi32_pack(x,y)   _mm256_packs_epi32(x,y)
#define simdi16_pack(x,y)   _mm256_packs_epi16(x,y)
#define simdi16_slli(x,y)	_mm256_slli_epi16(x,y) // shift integers in a left by y
#define simdi16_srli(x,y)	_mm256_srli_epi16(x,y) // shift integers in a right by y
#define simdi32_slli(x,y)   _mm256_slli_epi32(x,y) // shift integers in a left by y
#define simdi32_srli(x,y)   _mm256_srli_epi32(x,y) // shift integers in a right by y
#define simdi32_i2f(x) 	    _mm256_cvtepi32_ps(x)  // convert integer to s.p. float
#define simdi_i2fcast(x)    _mm256_castsi256_ps(x)
#endif //SIMD_INT

#include <simde/x86/avx.h>
// double support (usable with AVX1)
#ifndef SIMD_DOUBLE
#define SIMD_DOUBLE
#define ALIGN_DOUBLE        AVX_ALIGN_DOUBLE
#define VECSIZE_DOUBLE      AVX_VECSIZE_DOUBLE
typedef __m256d simd_double;
#define simdf64_add(x,y)    _mm256_add_pd(x,y)
#define simdf64_sub(x,y)    _mm256_sub_pd(x,y)
#define simdf64_mul(x,y)    _mm256_mul_pd(x,y)
#define simdf64_div(x,y)    _mm256_div_pd(x,y)
#define simdf64_max(x,y)    _mm256_max_pd(x,y)
#define simdf64_load(x)     _mm256_load_pd(x)
#define simdf64_store(x,y)  _mm256_store_pd(x,y)
#define simdf64_set(x)      _mm256_set1_pd(x)
#define simdf64_setzero(x)  _mm256_setzero_pd()
#define simdf64_gt(x,y)     _mm256_cmp_pd(x,y,_CMP_GT_OS)
#define simdf64_lt(x,y)     _mm256_cmp_pd(x,y,_CMP_LT_OS)
#define simdf64_or(x,y)     _mm256_or_pd(x,y)
#define simdf64_and(x,y)    _mm256_and_pd(x,y)
#define simdf64_andnot(x,y) _mm256_andnot_pd(x,y)
#define simdf64_xor(x,y)    _mm256_xor_pd(x,y)
#endif //SIMD_DOUBLE
// float support (usable with AVX1)
#ifndef SIMD_FLOAT
#define SIMD_FLOAT
#define ALIGN_FLOAT         AVX_ALIGN_FLOAT
#define VECSIZE_FLOAT       AVX_VECSIZE_FLOAT
typedef __m256 simd_float;
#define simdf32_add(x,y)    _mm256_add_ps(x,y)
#define simdf32_sub(x,y)    _mm256_sub_ps(x,y)
#define simdf32_mul(x,y)    _mm256_mul_ps(x,y)
#define simdf32_div(x,y)    _mm256_div_ps(x,y)
#define simdf32_rcp(x)      _mm256_rcp_ps(x)
#define simdf32_max(x,y)    _mm256_max_ps(x,y)
#define simdf32_min(x,y)    _mm256_min_ps(x,y)
#define simdf32_load(x)     _mm256_load_ps(x)
#define simdf32_store(x,y)  _mm256_store_ps(x,y)
#define simdf32_set(x)      _mm256_set1_ps(x)
#define simdf32_setzero(x)  _mm256_setzero_ps()
#define simdf32_gt(x,y)     _mm256_cmp_ps(x,y,_CMP_GT_OS)
#define simdf32_eq(x,y)     _mm256_cmp_ps(x,y,_CMP_EQ_OS)
#define simdf32_lt(x,y)     _mm256_cmp_ps(x,y,_CMP_LT_OS)
#define simdf32_or(x,y)     _mm256_or_ps(x,y)
#define simdf32_and(x,y)    _mm256_and_ps(x,y)
#define simdf32_andnot(x,y) _mm256_andnot_ps(x,y)
#define simdf32_xor(x,y)    _mm256_xor_ps(x,y)
#define simdf32_f2i(x) 	    _mm256_cvtps_epi32(x)  // convert s.p. float to integer
#define simdf_f2icast(x)    _mm256_castps_si256(x) // compile time cast
#endif //SIMD_FLOAT
#endif //AVX2

#include <simde/x86/sse4.1.h>
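// SSE has no horizontal-max instruction, but _mm_minpos_epu16 returns the
// smallest of eight unsigned 16-bit lanes. The helpers below therefore compute
// the maximum as the complement of the minimum of the complemented values;
// the 8-bit variant first folds the two bytes of every 16-bit lane into its
// low byte before the reduction.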
inline uint16_t simd_hmax16_sse(const __m128i buffer) {
    __m128i tmp1 = _mm_subs_epu16(_mm_set1_epi16((short)65535), buffer);
    __m128i tmp3 = _mm_minpos_epu16(tmp1);
    return (65535 - _mm_cvtsi128_si32(tmp3));
}

inline uint8_t simd_hmax8_sse(const __m128i buffer) {
    __m128i tmp1 = _mm_subs_epu8(_mm_set1_epi8((char)255), buffer);
    __m128i tmp2 = _mm_min_epu8(tmp1, _mm_srli_epi16(tmp1, 8));
    __m128i tmp3 = _mm_minpos_epu16(tmp2);
    return (uint8_t)(255 - _mm_cvtsi128_si32(tmp3));
}

// double support
#ifndef SIMD_DOUBLE
#define SIMD_DOUBLE
#define ALIGN_DOUBLE        SSE_ALIGN_DOUBLE
#define VECSIZE_DOUBLE      SSE_VECSIZE_DOUBLE
typedef __m128d simd_double;
#define simdf64_add(x,y)    _mm_add_pd(x,y)
#define simdf64_sub(x,y)    _mm_sub_pd(x,y)
#define simdf64_mul(x,y)    _mm_mul_pd(x,y)
#define simdf64_div(x,y)    _mm_div_pd(x,y)
#define simdf64_max(x,y)    _mm_max_pd(x,y)
#define simdf64_load(x)     _mm_load_pd(x)
#define simdf64_store(x,y)  _mm_store_pd(x,y)
#define simdf64_set(x)      _mm_set1_pd(x)
#define simdf64_setzero(x)  _mm_setzero_pd()
#define simdf64_gt(x,y)     _mm_cmpgt_pd(x,y)
#define simdf64_lt(x,y)     _mm_cmplt_pd(x,y)
#define simdf64_or(x,y)     _mm_or_pd(x,y)
#define simdf64_and(x,y)    _mm_and_pd(x,y)
#define simdf64_andnot(x,y) _mm_andnot_pd(x,y)
#define simdf64_xor(x,y)    _mm_xor_pd(x,y)
#endif //SIMD_DOUBLE

// float support
#ifndef SIMD_FLOAT
#define SIMD_FLOAT
#define ALIGN_FLOAT         SSE_ALIGN_FLOAT
#define VECSIZE_FLOAT       SSE_VECSIZE_FLOAT
typedef __m128  simd_float;
#define simdf32_add(x,y)    _mm_add_ps(x,y)
#define simdf32_sub(x,y)    _mm_sub_ps(x,y)
#define simdf32_mul(x,y)    _mm_mul_ps(x,y)
#define simdf32_div(x,y)    _mm_div_ps(x,y)
#define simdf32_rcp(x)      _mm_rcp_ps(x)
#define simdf32_max(x,y)    _mm_max_ps(x,y)
#define simdf32_min(x,y)    _mm_min_ps(x,y)
#define simdf32_load(x)     _mm_load_ps(x)
#define simdf32_store(x,y)  _mm_store_ps(x,y)
#define simdf32_set(x)      _mm_set1_ps(x)
#define simdf32_setzero(x)  _mm_setzero_ps()
#define simdf32_gt(x,y)     _mm_cmpgt_ps(x,y)
#define simdf32_eq(x,y)     _mm_cmpeq_ps(x,y)
#define simdf32_lt(x,y)     _mm_cmplt_ps(x,y)
#define simdf32_or(x,y)     _mm_or_ps(x,y)
#define simdf32_and(x,y)    _mm_and_ps(x,y)
#define simdf32_andnot(x,y) _mm_andnot_ps(x,y)
#define simdf32_xor(x,y)    _mm_xor_ps(x,y)
#define simdf32_f2i(x) 	    _mm_cvtps_epi32(x)  // convert s.p. float to integer
#define simdf_f2icast(x)    _mm_castps_si128(x) // compile time cast
#endif //SIMD_FLOAT

// integer support
#ifndef SIMD_INT
#define SIMD_INT
inline unsigned short extract_epi16(__m128i v, int pos) {
    switch(pos){
        case 0: return _mm_extract_epi16(v, 0);
        case 1: return _mm_extract_epi16(v, 1);
        case 2: return _mm_extract_epi16(v, 2);
        case 3: return _mm_extract_epi16(v, 3);
        case 4: return _mm_extract_epi16(v, 4);
        case 5: return _mm_extract_epi16(v, 5);
        case 6: return _mm_extract_epi16(v, 6);
        case 7: return _mm_extract_epi16(v, 7);
    }
    return 0;
}
#define ALIGN_INT           SSE_ALIGN_INT
#define VECSIZE_INT         SSE_VECSIZE_INT
typedef __m128i simd_int;
#define simdi32_add(x,y)    _mm_add_epi32(x,y)
#define simdi16_add(x,y)    _mm_add_epi16(x,y)
#define simdi16_adds(x,y)   _mm_adds_epi16(x,y)
#define simdui8_adds(x,y)   _mm_adds_epu8(x,y)
#define simdi32_sub(x,y)    _mm_sub_epi32(x,y)
#define simdui16_subs(x,y)  _mm_subs_epu16(x,y)
#define simdui8_subs(x,y)   _mm_subs_epu8(x,y)
#define simdi32_mul(x,y)    _mm_mullo_epi32(x,y) // SSE4.1
#define simdi32_max(x,y)    _mm_max_epi32(x,y) // SSE4.1
#define simdi16_max(x,y)    _mm_max_epi16(x,y)
#define simdi32_insert(x,y,z) _mm_insert_epi32(x,y,z)
#define simdi32_extract(x,y) _mm_extract_epi32(x,y)
#define simdi16_hmax(x)     simd_hmax16_sse(x)
#define simdui8_max(x,y)    _mm_max_epu8(x,y)
#define simdi8_hmax(x)      simd_hmax8_sse(x)
#define simdi_load(x)       _mm_load_si128(x)
#define simdi_loadu(x)      _mm_loadu_si128(x)
#define simdi_streamload(x) _mm_stream_load_si128(x)
#define simdi_storeu(x,y)   _mm_storeu_si128(x,y)
#define simdi_store(x,y)    _mm_store_si128(x,y)
#define simdi32_set(x)      _mm_set1_epi32(x)
#define simdi16_set(x)      _mm_set1_epi16(x)
#define simdi8_set(x)       _mm_set1_epi8(x)
#define simdi32_shuffle(x,y) _mm_shuffle_epi32(x,y)
#define simdi16_shuffle(x,y) _mm_shuffle_epi16(x,y)
#define simdi8_shuffle(x,y)  _mm_shuffle_epi8(x,y)
#define simdi_setzero()     _mm_setzero_si128()
#define simdi8_blend(x,y,z) _mm_blendv_epi8(x,y,z)
#define simdi32_gt(x,y)     _mm_cmpgt_epi32(x,y)
#define simdi8_gt(x,y)      _mm_cmpgt_epi8(x,y)
#define simdi32_eq(x,y)     _mm_cmpeq_epi32(x,y)
#define simdi16_eq(x,y)     _mm_cmpeq_epi16(x,y)
#define simdi8_eq(x,y)      _mm_cmpeq_epi8(x,y)
#define simdi32_lt(x,y)     _mm_cmplt_epi32(x,y)
#define simdi16_lt(x,y)     _mm_cmplt_epi16(x,y)
#define simdi8_lt(x,y)      _mm_cmplt_epi8(x,y)
#define simdi16_gt(x,y)     _mm_cmpgt_epi16(x,y)
#define simdi_or(x,y)       _mm_or_si128(x,y)
#define simdi_and(x,y)      _mm_and_si128(x,y)
#define simdi_andnot(x,y)   _mm_andnot_si128(x,y)
#define simdi_xor(x,y)      _mm_xor_si128(x,y)
#define simdi8_shiftl(x,y)  _mm_slli_si128(x,y)
#define simdi8_shiftr(x,y)  _mm_srli_si128(x,y)
#define SIMD_MOVEMASK_MAX   0xffff
#define simdi8_movemask(x)  _mm_movemask_epi8(x)
#define simdi16_extract(x,y) extract_epi16(x,y)
#define simdi32_pack(x,y)   _mm_packs_epi32(x,y)
#define simdi16_pack(x,y)   _mm_packs_epi16(x,y)
#define simdi16_slli(x,y)	_mm_slli_epi16(x,y) // shift integers in a left by y
#define simdi16_srli(x,y)	_mm_srli_epi16(x,y) // shift integers in a right by y
#define simdi32_slli(x,y)	_mm_slli_epi32(x,y) // shift integers in a left by y
#define simdi32_srli(x,y)	_mm_srli_epi32(x,y) // shift integers in a right by y
#define simdi32_i2f(x) 	    _mm_cvtepi32_ps(x)  // convert integer to s.p. float
#define simdi_i2fcast(x)    _mm_castsi128_ps(x)
#endif //SIMD_INT

inline void *mem_align(size_t boundary, size_t size) {
    void *pointer;
    if (posix_memalign(&pointer, boundary, size) != 0) {
#define MEM_ALIGN_ERROR "mem_align could not allocate memory.\n"
        fwrite(MEM_ALIGN_ERROR, sizeof(MEM_ALIGN_ERROR) - 1, 1, stderr);
#undef MEM_ALIGN_ERROR
        exit(3);
    }
    return pointer;
}
#ifdef SIMD_FLOAT
inline simd_float * malloc_simd_float(const size_t size) {
    return (simd_float *) mem_align(ALIGN_FLOAT, size);
}
#endif
#ifdef SIMD_DOUBLE
inline simd_double * malloc_simd_double(const size_t size) {
    return (simd_double *) mem_align(ALIGN_DOUBLE, size);
}
#endif
#ifdef SIMD_INT
inline simd_int * malloc_simd_int(const size_t size) {
    return (simd_int *) mem_align(ALIGN_INT, size);
}
#endif
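// A minimal usage sketch for the aligned allocators (hypothetical buffer name;
// assumes SIMD_FLOAT is defined). Memory obtained through mem_align comes from
// posix_memalign and is released with free():
//
//   float* buf = (float*) malloc_simd_float(1024 * sizeof(float));
//   for (size_t i = 0; i < 1024; i += VECSIZE_FLOAT) {
//       simdf32_store(buf + i, simdf32_set(0.0f));
//   }
//   free(buf);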

template <typename T>
T** malloc_matrix(int dim1, int dim2) {
#define ICEIL(x_int, fac_int) ((x_int + fac_int - 1) / fac_int) * fac_int
    // Compute mem sizes rounded up to nearest multiple of ALIGN_FLOAT
    size_t size_pointer_array = ICEIL(dim1*sizeof(T*), ALIGN_FLOAT);
    size_t dim2_padded = ICEIL(dim2*sizeof(T), ALIGN_FLOAT)/sizeof(T);

    T** matrix = (T**) mem_align( ALIGN_FLOAT, size_pointer_array + dim1*dim2_padded*sizeof(T) );
    if (matrix == NULL)
        return matrix;

    T* ptr = (T*) (matrix + (size_pointer_array/sizeof(T*)) );
    for (int i=0; i<dim1; ++i) {
        matrix[i] = ptr;
        ptr += dim2_padded;
    }
#undef ICEIL
    return matrix;
}
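// malloc_matrix places the row-pointer array and the payload in one aligned
// allocation, with every row padded to a multiple of ALIGN_FLOAT bytes. A
// minimal usage sketch (hypothetical dimensions); a single free() of the
// returned pointer releases the whole matrix:
//
//   float** m = malloc_matrix<float>(20, 400);
//   m[0][0] = 1.0f;
//   free(m);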


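// ScalarProd20 computes the dot product of the first 20 floats of qi and tj
// (five 128-bit loads of four floats each) and assumes both pointers are
// 16-byte aligned. The AVX variant and the plain scalar fallback are kept
// below as commented-out references.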
inline float ScalarProd20(const float* qi, const float* tj) {
//#ifdef AVX
//  float __attribute__((aligned(ALIGN_FLOAT))) res;
//  __m256 P; // query 128bit SSE2 register holding 4 floats
//  __m256 S; // aux register
//  __m256 R; // result
//  __m256* Qi = (__m256*) qi;
//  __m256* Tj = (__m256*) tj;
//
//  R = _mm256_mul_ps(*(Qi++),*(Tj++));
//  P = _mm256_mul_ps(*(Qi++),*(Tj++));
//  S = _mm256_mul_ps(*Qi,*Tj); // floats A, B, C, D, ?, ?, ? ,?
//  R = _mm256_add_ps(R,P);     // floats 0, 1, 2, 3, 4, 5, 6, 7
//  P = _mm256_permute2f128_ps(R, R, 0x01); // swap hi and lo 128 bits: 4, 5, 6, 7, 0, 1, 2, 3
//  R = _mm256_add_ps(R,P);     // 0+4, 1+5, 2+6, 3+7, 0+4, 1+5, 2+6, 3+7
//  R = _mm256_add_ps(R,S);     // 0+4+A, 1+5+B, 2+6+C, 3+7+D, ?, ?, ? ,?
//  R = _mm256_hadd_ps(R,R);    // 04A15B, 26C37D, ?, ?, 04A15B, 26C37D, ?, ?
//  R = _mm256_hadd_ps(R,R);    // 01234567ABCD, ?, 01234567ABCD, ?, 01234567ABCD, ?, 01234567ABCD, ?
//  _mm256_store_ps(&res, R);
//  return res;
//#else
//
//
//TODO fix this
    float __attribute__((aligned(16))) res;
    __m128 P; // query 128bit SSE2 register holding 4 floats
    __m128 R; // result
    __m128* Qi = (__m128*) qi;
    __m128* Tj = (__m128*) tj;

    __m128 P1 = _mm_mul_ps(*(Qi),*(Tj));
    __m128 P2 = _mm_mul_ps(*(Qi+1),*(Tj+1));
    __m128 R1 = _mm_add_ps(P1, P2);

    __m128 P3 = _mm_mul_ps(*(Qi + 2), *(Tj + 2));
    __m128 P4 = _mm_mul_ps(*(Qi + 3), *(Tj + 3));
    __m128 R2 = _mm_add_ps(P3, P4);
    __m128 P5 = _mm_mul_ps(*(Qi+4), *(Tj+4));

    R = _mm_add_ps(R1, R2);
    R = _mm_add_ps(R,P5);

//    R = _mm_hadd_ps(R,R);
//    R = _mm_hadd_ps(R,R);
    P = _mm_shuffle_ps(R, R, _MM_SHUFFLE(2,0,2,0));
    R = _mm_shuffle_ps(R, R, _MM_SHUFFLE(3,1,3,1));
    R = _mm_add_ps(R,P);
    P = _mm_shuffle_ps(R, R, _MM_SHUFFLE(2,0,2,0));
    R = _mm_shuffle_ps(R, R, _MM_SHUFFLE(3,1,3,1));
    R = _mm_add_ps(R,P);
    _mm_store_ss(&res, R);
    return res;
//#endif
//    return tj[0] * qi[0] + tj[1] * qi[1] + tj[2] * qi[2] + tj[3] * qi[3]
//            + tj[4] * qi[4] + tj[5] * qi[5] + tj[6] * qi[6] + tj[7] * qi[7]
//            + tj[8] * qi[8] + tj[9] * qi[9] + tj[10] * qi[10] + tj[11] * qi[11]
//            + tj[12] * qi[12] + tj[13] * qi[13] + tj[14] * qi[14]
//            + tj[15] * qi[15] + tj[16] * qi[16] + tj[17] * qi[17]
//            + tj[18] * qi[18] + tj[19] * qi[19];
}

#endif //SIMD_H