////////////////////////////////////////////////////////////////
//
// this code was taken from http://shibatch.sourceforge.net/
// Many thanks to the author of original version: Naoki Shibata
//
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
// This version contains modifications made by Ingo Weyrich
//
////////////////////////////////////////////////////////////////

#ifndef __SSE2__
#error Please specify -msse2.
#endif

#ifdef __GNUC__
#define INLINE __inline
#else
#define INLINE inline
#endif

#include <x86intrin.h>

#include <stdint.h>
#include <math.h> // for INFINITY, used by the vmask_isinf family below

typedef __m128d vdouble;
typedef __m128i vint;
typedef __m128i vmask;

typedef __m128 vfloat;
typedef __m128i vint2;

//
#define LVF(x) _mm_load_ps((float*)&x)
#define LVFU(x) _mm_loadu_ps(&x)
#define STVF(x,y) _mm_store_ps(&x,y)
#define STVFU(x,y) _mm_storeu_ps(&x,y)
#define LVI(x) _mm_load_si128((__m128i*)&x)

#ifdef __AVX__
#define PERMUTEPS(a,mask) _mm_permute_ps(a,mask)
#else
#define PERMUTEPS(a,mask) _mm_shuffle_ps(a,a,mask)
#endif

static INLINE vfloat LC2VFU(float &a)
{
    // Load 8 floats starting at a and combine a[0], a[2], a[4] and a[6] into a vector of 4 floats
    vfloat a1 = _mm_loadu_ps( &a );
    vfloat a2 = _mm_loadu_ps( (&a) + 4 );
    return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 ));
}
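// A minimal usage sketch (illustrative, not part of the original header):
// gather the even-indexed samples of an interleaved buffer. The names
// `src`, `dst` and `n` are hypothetical; n is assumed to be a multiple of 4
// and src must hold at least 2*n floats.
//
//   void evenSamples(float *src, float *dst, int n) {
//       for (int i = 0; i < n; i += 4) {
//           // dst[i..i+3] = src[2i], src[2i+2], src[2i+4], src[2i+6]
//           STVFU(dst[i], LC2VFU(src[2 * i]));
//       }
//   }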


// Store a vector of 4 floats in a[0],a[2],a[4] and a[6]
#ifdef __SSE4_1__
// SSE4.1 => use _mm_blend_ps instead of _mm_set_epi32 and vself
#define STC2VFU(a,v) {\
    __m128 TST1V = _mm_loadu_ps(&a);\
    __m128 TST2V = _mm_unpacklo_ps(v,v);\
    _mm_storeu_ps(&a, _mm_blend_ps(TST1V,TST2V,5));\
    TST1V = _mm_loadu_ps((&a)+4);\
    TST2V = _mm_unpackhi_ps(v,v);\
    _mm_storeu_ps((&a)+4, _mm_blend_ps(TST1V,TST2V,5));\
}
#else
#define STC2VFU(a,v) {\
    __m128 TST1V = _mm_loadu_ps(&a);\
    __m128 TST2V = _mm_unpacklo_ps(v,v);\
    vmask cmask = _mm_set_epi32(0xffffffff,0,0xffffffff,0);\
    _mm_storeu_ps(&a, vself(cmask,TST1V,TST2V));\
    TST1V = _mm_loadu_ps((&a)+4);\
    TST2V = _mm_unpackhi_ps(v,v);\
    _mm_storeu_ps((&a)+4, vself(cmask,TST1V,TST2V));\
}
#endif
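// Illustrative counterpart to the LC2VFU sketch above (`buf` is a
// hypothetical float array of at least 8 elements). STC2VFU writes the four
// lanes of v to the even indices and leaves the odd indices untouched:
//
//   vfloat v = _mm_set1_ps(1.f);
//   STC2VFU(buf[0], v);   // buf[0] = buf[2] = buf[4] = buf[6] = 1.f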

#define ZEROV _mm_setzero_ps()
#define F2V(a) _mm_set1_ps((a))

static INLINE vint vrint_vi_vd(vdouble vd)
{
    return _mm_cvtpd_epi32(vd);
}
static INLINE vint vtruncate_vi_vd(vdouble vd)
{
    return _mm_cvttpd_epi32(vd);
}
static INLINE vdouble vcast_vd_vi(vint vi)
{
    return _mm_cvtepi32_pd(vi);
}
static INLINE vdouble vcast_vd_d(double d)
{
    return _mm_set_pd(d, d);
}
static INLINE vint vcast_vi_i(int i)
{
    return _mm_set_epi32(0, 0, i, i);
}

static INLINE vmask vreinterpret_vm_vd(vdouble vd)
{
    return (__m128i)vd;
}
static INLINE vdouble vreinterpret_vd_vm(vint vm)
{
    return (__m128d)vm;
}

static INLINE vmask vreinterpret_vm_vf(vfloat vf)
{
    return (__m128i)vf;
}
static INLINE vfloat vreinterpret_vf_vm(vmask vm)
{
    return (__m128)vm;
}

//

static INLINE vfloat vcast_vf_f(float f)
{
    return _mm_set_ps(f, f, f, f);
}

// Don't use intrinsics here. Newer gcc versions (>= 4.9, maybe also before 4.9) generate better code when not using intrinsics.
// Example: vaddf(vmulf(a,b),c) will generate an FMA instruction when built for chips with that feature, but only if vaddf and vmulf don't use intrinsics.
static INLINE vfloat vaddf(vfloat x, vfloat y)
{
    return x + y;
}
static INLINE vfloat vsubf(vfloat x, vfloat y)
{
    return x - y;
}
static INLINE vfloat vmulf(vfloat x, vfloat y)
{
    return x * y;
}
static INLINE vfloat vdivf(vfloat x, vfloat y)
{
    return x / y;
}
// Also don't use an intrinsic here: some chips support FMA instructions with 3 or 4 operands.
// 3 operands: a = a*b+c, b = a*b+c, c = a*b+c // destination has to be one of a,b,c
// 4 operands: d = a*b+c // destination does not have to be one of a,b,c
// gcc will pick whichever form fits best when intrinsics aren't used; with intrinsics that choice isn't possible.
static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) {
    return x * y + z;
}
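// Illustrative sketch of the point above (assumption: compiled with -mfma;
// the function name `axpy` is hypothetical). With the plain operator form,
// gcc is free to contract the multiply and add into one instruction:
//
//   vfloat axpy(vfloat a, vfloat x, vfloat y) {
//       return vmlaf(a, x, y);   // typically emits a vfmadd...ps under -mfma
//   }
//
// Had vmlaf been written as _mm_add_ps(_mm_mul_ps(a, x), y), the compiler
// would generally keep the separate multiply and add.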
static INLINE vfloat vrecf(vfloat x)
{
    return vdivf(vcast_vf_f(1.0f), x);
}
static INLINE vfloat vsqrtf(vfloat x)
{
    return _mm_sqrt_ps(x);
}
static INLINE vfloat vmaxf(vfloat x, vfloat y)
{
    return _mm_max_ps(x, y);
}
static INLINE vfloat vminf(vfloat x, vfloat y)
{
    return _mm_min_ps(x, y);
}

//

static INLINE vdouble vadd(vdouble x, vdouble y)
{
    return _mm_add_pd(x, y);
}
static INLINE vdouble vsub(vdouble x, vdouble y)
{
    return _mm_sub_pd(x, y);
}
static INLINE vdouble vmul(vdouble x, vdouble y)
{
    return _mm_mul_pd(x, y);
}
static INLINE vdouble vdiv(vdouble x, vdouble y)
{
    return _mm_div_pd(x, y);
}
static INLINE vdouble vrec(vdouble x)
{
    return _mm_div_pd(_mm_set_pd(1, 1), x);
}
static INLINE vdouble vsqrt(vdouble x)
{
    return _mm_sqrt_pd(x);
}
static INLINE vdouble vmla(vdouble x, vdouble y, vdouble z)
{
    return vadd(vmul(x, y), z);
}

static INLINE vdouble vmax(vdouble x, vdouble y)
{
    return _mm_max_pd(x, y);
}
static INLINE vdouble vmin(vdouble x, vdouble y)
{
    return _mm_min_pd(x, y);
}

static INLINE vdouble vabs(vdouble d)
{
    return (__m128d)_mm_andnot_pd(_mm_set_pd(-0.0, -0.0), d);
}
static INLINE vdouble vneg(vdouble d)
{
    return (__m128d)_mm_xor_pd(_mm_set_pd(-0.0, -0.0), d);
}

//

static INLINE vint vaddi(vint x, vint y)
{
    return _mm_add_epi32(x, y);
}
static INLINE vint vsubi(vint x, vint y)
{
    return _mm_sub_epi32(x, y);
}

static INLINE vint vandi(vint x, vint y)
{
    return _mm_and_si128(x, y);
}
static INLINE vint vandnoti(vint x, vint y)
{
    return _mm_andnot_si128(x, y);
}
static INLINE vint vori(vint x, vint y)
{
    return _mm_or_si128(x, y);
}
static INLINE vint vxori(vint x, vint y)
{
    return _mm_xor_si128(x, y);
}

static INLINE vint vslli(vint x, int c)
{
    return _mm_slli_epi32(x, c);
}
static INLINE vint vsrli(vint x, int c)
{
    return _mm_srli_epi32(x, c);
}
static INLINE vint vsrai(vint x, int c)
{
    return _mm_srai_epi32(x, c);
}

//

static INLINE vmask vandm(vmask x, vmask y)
{
    return _mm_and_si128(x, y);
}
static INLINE vmask vandnotm(vmask x, vmask y)
{
    return _mm_andnot_si128(x, y);
}
static INLINE vmask vorm(vmask x, vmask y)
{
    return _mm_or_si128(x, y);
}
static INLINE vmask vxorm(vmask x, vmask y)
{
    return _mm_xor_si128(x, y);
}
static INLINE vmask vnotm(vmask x)
{
    return _mm_xor_si128(x, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
}

static INLINE vmask vmask_eq(vdouble x, vdouble y)
{
    return (__m128i)_mm_cmpeq_pd(x, y);
}
static INLINE vmask vmask_neq(vdouble x, vdouble y)
{
    return (__m128i)_mm_cmpneq_pd(x, y);
}
static INLINE vmask vmask_lt(vdouble x, vdouble y)
{
    return (__m128i)_mm_cmplt_pd(x, y);
}
static INLINE vmask vmask_le(vdouble x, vdouble y)
{
    return (__m128i)_mm_cmple_pd(x, y);
}
static INLINE vmask vmask_gt(vdouble x, vdouble y)
{
    return (__m128i)_mm_cmpgt_pd(x, y);
}
static INLINE vmask vmask_ge(vdouble x, vdouble y)
{
    return (__m128i)_mm_cmpge_pd(x, y);
}

static INLINE vmask vmaskf_eq(vfloat x, vfloat y)
{
    return (__m128i)_mm_cmpeq_ps(x, y);
}
static INLINE vmask vmaskf_neq(vfloat x, vfloat y)
{
    return (__m128i)_mm_cmpneq_ps(x, y);
}
static INLINE vmask vmaskf_lt(vfloat x, vfloat y)
{
    return (__m128i)_mm_cmplt_ps(x, y);
}
static INLINE vmask vmaskf_le(vfloat x, vfloat y)
{
    return (__m128i)_mm_cmple_ps(x, y);
}
static INLINE vmask vmaskf_gt(vfloat x, vfloat y)
{
    return (__m128i)_mm_cmpgt_ps(x, y);
}
static INLINE vmask vmaskf_ge(vfloat x, vfloat y)
{
    return (__m128i)_mm_cmpge_ps(x, y);
}


static INLINE vmask vmaski_eq(vint x, vint y)
{
    __m128 s = (__m128)_mm_cmpeq_epi32(x, y);
    return (__m128i)_mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 0, 0));
}

static INLINE vdouble vsel(vmask mask, vdouble x, vdouble y)
{
    return (__m128d)vorm(vandm(mask, (__m128i)x), vandnotm(mask, (__m128i)y));
}
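// A minimal sketch of how the comparison masks combine with vsel for a
// branchless per-lane select (illustrative; `clampPositive` is hypothetical):
//
//   vdouble clampPositive(vdouble d) {
//       // lanes where d < 0 take 0.0, the other lanes keep d
//       return vsel(vmask_lt(d, vcast_vd_d(0.0)), vcast_vd_d(0.0), d);
//   }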

static INLINE vint vseli_lt(vdouble d0, vdouble d1, vint x, vint y)
{
    vmask mask = (vmask)_mm_cmpeq_ps(_mm_cvtpd_ps((vdouble)vmask_lt(d0, d1)), _mm_set_ps(0, 0, 0, 0));
    return vori(vandnoti(mask, x), vandi(mask, y));
}

//

static INLINE vint2 vcast_vi2_vm(vmask vm)
{
    return (vint2)vm;
}
static INLINE vmask vcast_vm_vi2(vint2 vi)
{
    return (vmask)vi;
}

static INLINE vint2 vrint_vi2_vf(vfloat vf)
{
    return _mm_cvtps_epi32(vf);
}
static INLINE vint2 vtruncate_vi2_vf(vfloat vf)
{
    return _mm_cvttps_epi32(vf);
}
static INLINE vfloat vcast_vf_vi2(vint2 vi)
{
    return _mm_cvtepi32_ps(vcast_vm_vi2(vi));
}
static INLINE vint2 vcast_vi2_i(int i)
{
    return _mm_set_epi32(i, i, i, i);
}

static INLINE vint2 vaddi2(vint2 x, vint2 y)
{
    return vaddi(x, y);
}
static INLINE vint2 vsubi2(vint2 x, vint2 y)
{
    return vsubi(x, y);
}

static INLINE vint2 vandi2(vint2 x, vint2 y)
{
    return vandi(x, y);
}
static INLINE vint2 vandnoti2(vint2 x, vint2 y)
{
    return vandnoti(x, y);
}
static INLINE vint2 vori2(vint2 x, vint2 y)
{
    return vori(x, y);
}
static INLINE vint2 vxori2(vint2 x, vint2 y)
{
    return vxori(x, y);
}

static INLINE vint2 vslli2(vint2 x, int c)
{
    return vslli(x, c);
}
static INLINE vint2 vsrli2(vint2 x, int c)
{
    return vsrli(x, c);
}
static INLINE vint2 vsrai2(vint2 x, int c)
{
    return vsrai(x, c);
}

static INLINE vmask vmaski2_eq(vint2 x, vint2 y)
{
    return _mm_cmpeq_epi32(x, y);
}
static INLINE vint2 vseli2(vmask m, vint2 x, vint2 y)
{
    return vorm(vandm(m, x), vandnotm(m, y));
}

//

static INLINE double vcast_d_vd(vdouble v)
{
    double s[2];
    _mm_storeu_pd(s, v);
    return s[0];
}

static INLINE float vcast_f_vf(vfloat v)
{
    float s[4];
    _mm_storeu_ps(s, v);
    return s[0];
}

static INLINE vmask vsignbit(vdouble d)
{
    return _mm_and_si128((__m128i)d, _mm_set_epi32(0x80000000, 0x0, 0x80000000, 0x0));
}

static INLINE vdouble vsign(vdouble d)
{
    return (__m128d)_mm_or_si128((__m128i)_mm_set_pd(1, 1), _mm_and_si128((__m128i)d, _mm_set_epi32(0x80000000, 0x0, 0x80000000, 0x0)));
}

static INLINE vdouble vmulsign(vdouble x, vdouble y)
{
    return (__m128d)vxori((__m128i)x, vsignbit(y));
}
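// Scalar equivalent of the three sign helpers above (illustrative only):
// vsignbit extracts the IEEE-754 sign bit, vsign(d) yields copysign(1.0, d),
// and vmulsign(x, y) flips the sign of x wherever y is negative, i.e.
//
//   double mulsign(double x, double y) {
//       return x * copysign(1.0, y);   // same result, without the bit tricks
//   }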

static INLINE vmask vmask_isinf(vdouble d)
{
    return (vmask)_mm_cmpeq_pd(vabs(d), _mm_set_pd(INFINITY, INFINITY));
}

static INLINE vmask vmask_ispinf(vdouble d)
{
    return (vmask)_mm_cmpeq_pd(d, _mm_set_pd(INFINITY, INFINITY));
}

static INLINE vmask vmask_isminf(vdouble d)
{
    return (vmask)_mm_cmpeq_pd(d, _mm_set_pd(-INFINITY, -INFINITY));
}

static INLINE vmask vmask_isnan(vdouble d)
{
    return (vmask)_mm_cmpneq_pd(d, d);
}

static INLINE vdouble visinf(vdouble d)
{
    return (__m128d)_mm_and_si128(vmask_isinf(d), _mm_or_si128(vsignbit(d), (__m128i)_mm_set_pd(1, 1)));
}

static INLINE vdouble visinf2(vdouble d, vdouble m)
{
    return (__m128d)_mm_and_si128(vmask_isinf(d), _mm_or_si128(vsignbit(d), (__m128i)m));
}

//

static INLINE vdouble vpow2i(vint q)
{
    q = _mm_add_epi32(_mm_set_epi32(0x0, 0x0, 0x3ff, 0x3ff), q);
    q = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(1, 3, 0, 3));
    return (__m128d)_mm_slli_epi32(q, 20);
}
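// What vpow2i computes, in scalar form (illustrative sketch): add the
// IEEE-754 exponent bias 0x3ff to q and place the result in the exponent
// field (bits 62..52) of a double whose mantissa is zero, giving exactly 2^q
// for q in the normal exponent range:
//
//   double pow2i(int q) {
//       uint64_t bits = (uint64_t)(q + 0x3ff) << 52;
//       double d;
//       memcpy(&d, &bits, sizeof d);   // requires <string.h>
//       return d;
//   }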

static INLINE vdouble vldexp(vdouble x, vint q)
{
    vint m = _mm_srai_epi32(q, 31);
    m = _mm_slli_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(m, q), 9), m), 7);
    q = _mm_sub_epi32(q, _mm_slli_epi32(m, 2));
    vdouble y = vpow2i(m);
    return vmul(vmul(vmul(vmul(vmul(x, y), y), y), y), vpow2i(q));
}
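// The splitting above avoids overflowing the exponent field for large |q|:
// m is roughly q/4 (computed branchlessly, truncated toward zero via the sign
// mask), so the result is assembled as x * (2^m)^4 * 2^(q-4m), with each
// vpow2i argument kept in range. A rough scalar model of the same idea
// (illustrative only, using the pow2i sketch above):
//
//   double ldexp_sketch(double x, int q) {
//       int m = q / 4;
//       double y = pow2i(m);
//       return x * y * y * y * y * pow2i(q - 4 * m);
//   }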

static INLINE vint vilogbp1(vdouble d)
{
    vint m = vmask_lt(d, vcast_vd_d(4.9090934652977266E-91));
    d = vsel(m, vmul(vcast_vd_d(2.037035976334486E90), d), d);
    __m128i q = _mm_and_si128((__m128i)d, _mm_set_epi32(((1 << 12) - 1) << 20, 0, ((1 << 12) - 1) << 20, 0));
    q = _mm_srli_epi32(q, 20);
    q = vorm(vandm (m, _mm_sub_epi32(q, _mm_set_epi32(300 + 0x3fe, 0, 300 + 0x3fe, 0))),
             vandnotm(m, _mm_sub_epi32(q, _mm_set_epi32( 0x3fe, 0, 0x3fe, 0))));
    q = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(0, 0, 3, 1));
    return q;
}
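// In scalar terms (illustrative): vilogbp1 returns the binary exponent of d
// plus one, i.e. ilogb(d) + 1. Tiny inputs are first scaled by 2^300 (the
// constant 2.037035976334486E90) so denormals get a meaningful exponent, and
// 300 is subtracted back out of the biased exponent afterwards:
//
//   int ilogbp1(double d) {
//       int tiny = d < 4.9090934652977266E-91;
//       if (tiny) d *= 2.037035976334486E90;   // 2^300
//       uint64_t bits;
//       memcpy(&bits, &d, sizeof bits);
//       int q = (int)((bits >> 52) & 0x7ff);   // biased exponent
//       return tiny ? q - (300 + 0x3fe) : q - 0x3fe;
//   }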

static INLINE vdouble vupper(vdouble d)
{
    return (__m128d)_mm_and_si128((__m128i)d, _mm_set_epi32(0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000));
}
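// vupper masks off the low 27 mantissa bits of each double. This is the
// classic Dekker-style split used by double-double arithmetic: the "upper"
// half has few enough significant bits that a product of two upper halves is
// exact. Illustrative use (hypothetical helper; vdouble2 and dd are defined
// just below):
//
//   // exact product: x*y == r.x + r.y
//   vdouble2 mul_dd(vdouble x, vdouble y) {
//       vdouble xh = vupper(x), xl = vsub(x, xh);
//       vdouble yh = vupper(y), yl = vsub(y, yh);
//       vdouble rx = vmul(x, y);
//       vdouble ry = vadd(vadd(vadd(vsub(vmul(xh, yh), rx), vmul(xl, yh)), vmul(xh, yl)), vmul(xl, yl));
//       return dd(rx, ry);
//   }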

//

typedef struct {
    vdouble x, y;
} vdouble2;

static INLINE vdouble2 dd(vdouble h, vdouble l)
{
    vdouble2 ret = {h, l};
    return ret;
}

static INLINE vdouble2 vsel2(vmask mask, vdouble2 x, vdouble2 y)
{
    return dd((__m128d)vorm(vandm(mask, (__m128i)x.x), vandnotm(mask, (__m128i)y.x)),
              (__m128d)vorm(vandm(mask, (__m128i)x.y), vandnotm(mask, (__m128i)y.y)));
}

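// Taking the absolute value of a double-double must treat both halves
// consistently: the low half y carries the sign of the high half x, so abs_d
// clears the sign of x and flips the sign of y exactly when x is negative.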
static INLINE vdouble2 abs_d(vdouble2 x)
{
    return dd((__m128d)_mm_xor_pd(_mm_and_pd(_mm_set_pd(-0.0, -0.0), x.x), x.x),
              (__m128d)_mm_xor_pd(_mm_and_pd(_mm_set_pd(-0.0, -0.0), x.x), x.y));
}
