////////////////////////////////////////////////////////////////
//
//  This code was taken from http://shibatch.sourceforge.net/
//  Many thanks to the author of the original version: Naoki Shibata
//
//   Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)
//
//  This version contains modifications made by Ingo Weyrich
//
////////////////////////////////////////////////////////////////

#ifndef __SSE2__
#error Please specify -msse2.
#endif

#ifdef __GNUC__
#define INLINE __inline
#else
#define INLINE inline
#endif

#include <x86intrin.h>

#include <stdint.h>
#include <math.h> // for INFINITY, used by the vmask_is*inf helpers below

typedef __m128d vdouble;
typedef __m128i vint;
typedef __m128i vmask;

typedef __m128 vfloat;
typedef __m128i vint2;

//
#define LVF(x) _mm_load_ps((float*)&x)
#define LVFU(x) _mm_loadu_ps(&x)
#define STVF(x,y) _mm_store_ps(&x,y)
#define STVFU(x,y) _mm_storeu_ps(&x,y)
#define LVI(x) _mm_load_si128((__m128i*)&x)

#ifdef __AVX__
#define PERMUTEPS(a,mask) _mm_permute_ps(a,mask)
#else
#define PERMUTEPS(a,mask) _mm_shuffle_ps(a,a,mask)
#endif

static INLINE vfloat LC2VFU(float &a)
{
    // Load 8 floats from a and combine a[0], a[2], a[4] and a[6] into a vector of 4 floats
    vfloat a1 = _mm_loadu_ps( &a );
    vfloat a2 = _mm_loadu_ps( (&a) + 4 );
    return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 ));
}
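
// Illustrative example (not from the original code; `buf` and `i` are placeholder names):
// LC2VFU gathers one channel of interleaved data, e.g.
//
//     vfloat evens = LC2VFU(buf[i]); // holds buf[i], buf[i+2], buf[i+4], buf[i+6]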


// Store a vector of 4 floats in a[0],a[2],a[4] and a[6]
#ifdef __SSE4_1__
// SSE4.1 => use _mm_blend_ps instead of _mm_set_epi32 and vself
#define STC2VFU(a,v) {\
                         __m128 TST1V = _mm_loadu_ps(&a);\
                         __m128 TST2V = _mm_unpacklo_ps(v,v);\
                         _mm_storeu_ps(&a, _mm_blend_ps(TST1V,TST2V,5));\
                         TST1V = _mm_loadu_ps((&a)+4);\
                         TST2V = _mm_unpackhi_ps(v,v);\
                         _mm_storeu_ps((&a)+4, _mm_blend_ps(TST1V,TST2V,5));\
                     }
#else
#define STC2VFU(a,v) {\
                         __m128 TST1V = _mm_loadu_ps(&a);\
                         __m128 TST2V = _mm_unpacklo_ps(v,v);\
                         vmask cmask = _mm_set_epi32(0xffffffff,0,0xffffffff,0);\
                         _mm_storeu_ps(&a, vself(cmask,TST1V,TST2V));\
                         TST1V = _mm_loadu_ps((&a)+4);\
                         TST2V = _mm_unpackhi_ps(v,v);\
                         _mm_storeu_ps((&a)+4, vself(cmask,TST1V,TST2V));\
                     }
#endif

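// Example (illustrative sketch; `row` and `width` are placeholder names):
// LC2VFU and STC2VFU are typically used as a pair to read, process and write
// back every other float of an interleaved row, leaving the odd-indexed
// elements untouched:
//
//     for (int x = 0; x + 7 < width; x += 8) {
//         vfloat v = LC2VFU(row[x]);   // row[x], row[x+2], row[x+4], row[x+6]
//         v = vmulf(v, F2V(0.5f));     // process the four gathered values
//         STC2VFU(row[x], v);          // scatter them back into the even slots
//     }
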
#define ZEROV _mm_setzero_ps()
#define F2V(a) _mm_set1_ps((a))

static INLINE vint vrint_vi_vd(vdouble vd)
{
    return _mm_cvtpd_epi32(vd);
}
static INLINE vint vtruncate_vi_vd(vdouble vd)
{
    return _mm_cvttpd_epi32(vd);
}
static INLINE vdouble vcast_vd_vi(vint vi)
{
    return _mm_cvtepi32_pd(vi);
}
static INLINE vdouble vcast_vd_d(double d)
{
    return _mm_set_pd(d, d);
}
static INLINE vint vcast_vi_i(int i)
{
    return _mm_set_epi32(0, 0, i, i);
}

static INLINE vmask vreinterpret_vm_vd(vdouble vd)
{
    return (__m128i)vd;
}
static INLINE vdouble vreinterpret_vd_vm(vint vm)
{
    return (__m128d)vm;
}

static INLINE vmask vreinterpret_vm_vf(vfloat vf)
{
    return (__m128i)vf;
}
static INLINE vfloat vreinterpret_vf_vm(vmask vm)
{
    return (__m128)vm;
}

//

static INLINE vfloat vcast_vf_f(float f)
{
    return _mm_set_ps(f, f, f, f);
}

// Don't use intrinsics here. Newer gcc versions (>= 4.9, maybe also earlier) generate better code when intrinsics are not used.
// Example: vaddf(vmulf(a,b),c) will generate an FMA instruction, when built for chips with that feature, only if vaddf and vmulf don't use intrinsics.
static INLINE vfloat vaddf(vfloat x, vfloat y)
{
    return x + y;
}
static INLINE vfloat vsubf(vfloat x, vfloat y)
{
    return x - y;
}
static INLINE vfloat vmulf(vfloat x, vfloat y)
{
    return x * y;
}
static INLINE vfloat vdivf(vfloat x, vfloat y)
{
    return x / y;
}
// Also don't use intrinsics here: some chips support FMA instructions with 3 and 4 operands.
// 3 operands: a = a*b+c, b = a*b+c, c = a*b+c // the destination has to be one of a, b, c
// 4 operands: d = a*b+c // the destination does not have to be one of a, b, c
// gcc will use whichever form fits best when intrinsics are not used; with intrinsics that's not possible.
static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) {
    return x * y + z;
}
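
// Illustrative note (assumes the translation unit is built with FMA support,
// e.g. -mfma; instruction names are indicative only): written this way, gcc is
// free to contract the multiply-add and to pick the 3- or 4-operand form that
// suits the surrounding code, e.g.
//
//     vfloat r = vmlaf(a, b, c);          // may become a single vfmadd*ps
//     vfloat s = vaddf(vmulf(a, b), c);   // likewise eligible for contraction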
static INLINE vfloat vrecf(vfloat x)
{
    return vdivf(vcast_vf_f(1.0f), x);
}
static INLINE vfloat vsqrtf(vfloat x)
{
    return _mm_sqrt_ps(x);
}
static INLINE vfloat vmaxf(vfloat x, vfloat y)
{
    return _mm_max_ps(x, y);
}
static INLINE vfloat vminf(vfloat x, vfloat y)
{
    return _mm_min_ps(x, y);
}

//

static INLINE vdouble vadd(vdouble x, vdouble y)
{
    return _mm_add_pd(x, y);
}
static INLINE vdouble vsub(vdouble x, vdouble y)
{
    return _mm_sub_pd(x, y);
}
static INLINE vdouble vmul(vdouble x, vdouble y)
{
    return _mm_mul_pd(x, y);
}
static INLINE vdouble vdiv(vdouble x, vdouble y)
{
    return _mm_div_pd(x, y);
}
static INLINE vdouble vrec(vdouble x)
{
    return _mm_div_pd(_mm_set_pd(1, 1), x);
}
static INLINE vdouble vsqrt(vdouble x)
{
    return _mm_sqrt_pd(x);
}
static INLINE vdouble vmla(vdouble x, vdouble y, vdouble z)
{
    return vadd(vmul(x, y), z);
}

static INLINE vdouble vmax(vdouble x, vdouble y)
{
    return _mm_max_pd(x, y);
}
static INLINE vdouble vmin(vdouble x, vdouble y)
{
    return _mm_min_pd(x, y);
}

static INLINE vdouble vabs(vdouble d)
{
    return (__m128d)_mm_andnot_pd(_mm_set_pd(-0.0, -0.0), d);
}
static INLINE vdouble vneg(vdouble d)
{
    return (__m128d)_mm_xor_pd(_mm_set_pd(-0.0, -0.0), d);
}

//

static INLINE vint vaddi(vint x, vint y)
{
    return _mm_add_epi32(x, y);
}
static INLINE vint vsubi(vint x, vint y)
{
    return _mm_sub_epi32(x, y);
}

static INLINE vint vandi(vint x, vint y)
{
    return _mm_and_si128(x, y);
}
static INLINE vint vandnoti(vint x, vint y)
{
    return _mm_andnot_si128(x, y);
}
static INLINE vint vori(vint x, vint y)
{
    return _mm_or_si128(x, y);
}
static INLINE vint vxori(vint x, vint y)
{
    return _mm_xor_si128(x, y);
}

static INLINE vint vslli(vint x, int c)
{
    return _mm_slli_epi32(x, c);
}
static INLINE vint vsrli(vint x, int c)
{
    return _mm_srli_epi32(x, c);
}
static INLINE vint vsrai(vint x, int c)
{
    return _mm_srai_epi32(x, c);
}

//

static INLINE vmask vandm(vmask x, vmask y)
{
    return _mm_and_si128(x, y);
}
static INLINE vmask vandnotm(vmask x, vmask y)
{
    return _mm_andnot_si128(x, y);
}
static INLINE vmask vorm(vmask x, vmask y)
{
    return _mm_or_si128(x, y);
}
static INLINE vmask vxorm(vmask x, vmask y)
{
    return _mm_xor_si128(x, y);
}
static INLINE vmask vnotm(vmask x)
{
    return _mm_xor_si128(x, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
}

static INLINE vmask vmask_eq(vdouble x, vdouble y)
{
    return (__m128i)_mm_cmpeq_pd(x, y);
}
static INLINE vmask vmask_neq(vdouble x, vdouble y)
{
    return (__m128i)_mm_cmpneq_pd(x, y);
}
static INLINE vmask vmask_lt(vdouble x, vdouble y)
{
    return (__m128i)_mm_cmplt_pd(x, y);
}
static INLINE vmask vmask_le(vdouble x, vdouble y)
{
    return (__m128i)_mm_cmple_pd(x, y);
}
static INLINE vmask vmask_gt(vdouble x, vdouble y)
{
    return (__m128i)_mm_cmpgt_pd(x, y);
}
static INLINE vmask vmask_ge(vdouble x, vdouble y)
{
    return (__m128i)_mm_cmpge_pd(x, y);
}

static INLINE vmask vmaskf_eq(vfloat x, vfloat y)
{
    return (__m128i)_mm_cmpeq_ps(x, y);
}
static INLINE vmask vmaskf_neq(vfloat x, vfloat y)
{
    return (__m128i)_mm_cmpneq_ps(x, y);
}
static INLINE vmask vmaskf_lt(vfloat x, vfloat y)
{
    return (__m128i)_mm_cmplt_ps(x, y);
}
static INLINE vmask vmaskf_le(vfloat x, vfloat y)
{
    return (__m128i)_mm_cmple_ps(x, y);
}
static INLINE vmask vmaskf_gt(vfloat x, vfloat y)
{
    return (__m128i)_mm_cmpgt_ps(x, y);
}
static INLINE vmask vmaskf_ge(vfloat x, vfloat y)
{
    return (__m128i)_mm_cmpge_ps(x, y);
}


static INLINE vmask vmaski_eq(vint x, vint y)
{
    __m128 s = (__m128)_mm_cmpeq_epi32(x, y);
    return (__m128i)_mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 0, 0));
}

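// vsel: bitwise select for doubles - returns the bits of x where the mask is set
// and the bits of y where it is clear (masks come from the vmask_* comparisons above).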
static INLINE vdouble vsel(vmask mask, vdouble x, vdouble y)
{
    return (__m128d)vorm(vandm(mask, (__m128i)x), vandnotm(mask, (__m128i)y));
}

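// vseli_lt: integer select driven by a double comparison - effectively
// returns (d0 < d1) ? x : y per lane for the two 32-bit results.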
static INLINE vint vseli_lt(vdouble d0, vdouble d1, vint x, vint y)
{
    vmask mask = (vmask)_mm_cmpeq_ps(_mm_cvtpd_ps((vdouble)vmask_lt(d0, d1)), _mm_set_ps(0, 0, 0, 0));
    return vori(vandnoti(mask, x), vandi(mask, y));
}

//

static INLINE vint2 vcast_vi2_vm(vmask vm)
{
    return (vint2)vm;
}
static INLINE vmask vcast_vm_vi2(vint2 vi)
{
    return (vmask)vi;
}

static INLINE vint2 vrint_vi2_vf(vfloat vf)
{
    return _mm_cvtps_epi32(vf);
}
static INLINE vint2 vtruncate_vi2_vf(vfloat vf)
{
    return _mm_cvttps_epi32(vf);
}
static INLINE vfloat vcast_vf_vi2(vint2 vi)
{
    return _mm_cvtepi32_ps(vcast_vm_vi2(vi));
}
static INLINE vint2 vcast_vi2_i(int i)
{
    return _mm_set_epi32(i, i, i, i);
}

static INLINE vint2 vaddi2(vint2 x, vint2 y)
{
    return vaddi(x, y);
}
static INLINE vint2 vsubi2(vint2 x, vint2 y)
{
    return vsubi(x, y);
}

static INLINE vint2 vandi2(vint2 x, vint2 y)
{
    return vandi(x, y);
}
static INLINE vint2 vandnoti2(vint2 x, vint2 y)
{
    return vandnoti(x, y);
}
static INLINE vint2 vori2(vint2 x, vint2 y)
{
    return vori(x, y);
}
static INLINE vint2 vxori2(vint2 x, vint2 y)
{
    return vxori(x, y);
}

static INLINE vint2 vslli2(vint2 x, int c)
{
    return vslli(x, c);
}
static INLINE vint2 vsrli2(vint2 x, int c)
{
    return vsrli(x, c);
}
static INLINE vint2 vsrai2(vint2 x, int c)
{
    return vsrai(x, c);
}

static INLINE vmask vmaski2_eq(vint2 x, vint2 y)
{
    return _mm_cmpeq_epi32(x, y);
}
static INLINE vint2 vseli2(vmask m, vint2 x, vint2 y)
{
    return vorm(vandm(m, x), vandnotm(m, y));
}

//

static INLINE double vcast_d_vd(vdouble v)
{
    double s[2];
    _mm_storeu_pd(s, v);
    return s[0];
}

static INLINE float vcast_f_vf(vfloat v)
{
    float s[4];
    _mm_storeu_ps(s, v);
    return s[0];
}

static INLINE vmask vsignbit(vdouble d)
{
    return _mm_and_si128((__m128i)d, _mm_set_epi32(0x80000000, 0x0, 0x80000000, 0x0));
}

static INLINE vdouble vsign(vdouble d)
{
    return (__m128d)_mm_or_si128((__m128i)_mm_set_pd(1, 1), _mm_and_si128((__m128i)d, _mm_set_epi32(0x80000000, 0x0, 0x80000000, 0x0)));
}

static INLINE vdouble vmulsign(vdouble x, vdouble y)
{
    return (__m128d)vxori((__m128i)x, vsignbit(y));
}

static INLINE vmask vmask_isinf(vdouble d)
{
    return (vmask)_mm_cmpeq_pd(vabs(d), _mm_set_pd(INFINITY, INFINITY));
}

static INLINE vmask vmask_ispinf(vdouble d)
{
    return (vmask)_mm_cmpeq_pd(d, _mm_set_pd(INFINITY, INFINITY));
}

static INLINE vmask vmask_isminf(vdouble d)
{
    return (vmask)_mm_cmpeq_pd(d, _mm_set_pd(-INFINITY, -INFINITY));
}

static INLINE vmask vmask_isnan(vdouble d)
{
    return (vmask)_mm_cmpneq_pd(d, d);
}

static INLINE vdouble visinf(vdouble d)
{
    return (__m128d)_mm_and_si128(vmask_isinf(d), _mm_or_si128(vsignbit(d), (__m128i)_mm_set_pd(1, 1)));
}

static INLINE vdouble visinf2(vdouble d, vdouble m)
{
    return (__m128d)_mm_and_si128(vmask_isinf(d), _mm_or_si128(vsignbit(d), (__m128i)m));
}

//

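// vpow2i: builds 2^q per lane by writing q + 1023 into the exponent field of
// each double (no mantissa bits set) - the vector analogue of ldexp(1.0, q).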
static INLINE vdouble vpow2i(vint q)
{
    q = _mm_add_epi32(_mm_set_epi32(0x0, 0x0, 0x3ff, 0x3ff), q);
    q = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(1, 3, 0, 3));
    return (__m128d)_mm_slli_epi32(q, 20);
}

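// vldexp: x * 2^q, the vector analogue of ldexp(). The exponent q is split into
// four equal chunks plus a remainder so that each partial power of two stays
// representable and no intermediate result overflows or underflows prematurely.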
static INLINE vdouble vldexp(vdouble x, vint q)
{
    vint m = _mm_srai_epi32(q, 31);
    m = _mm_slli_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(m, q), 9), m), 7);
    q = _mm_sub_epi32(q, _mm_slli_epi32(m, 2));
    vdouble y = vpow2i(m);
    return vmul(vmul(vmul(vmul(vmul(x, y), y), y), y), vpow2i(q));
}

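// vilogbp1: extracts the exponent of d plus one (ilogb(d) + 1) into the low two
// integer lanes. Inputs below 2^-300 are pre-scaled by 2^300 so that denormals
// are handled, and the bias is adjusted accordingly.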
static INLINE vint vilogbp1(vdouble d)
{
    vint m = vmask_lt(d, vcast_vd_d(4.9090934652977266E-91));
    d = vsel(m, vmul(vcast_vd_d(2.037035976334486E90), d), d);
    __m128i q = _mm_and_si128((__m128i)d, _mm_set_epi32(((1 << 12) - 1) << 20, 0, ((1 << 12) - 1) << 20, 0));
    q = _mm_srli_epi32(q, 20);
    q = vorm(vandm   (m, _mm_sub_epi32(q, _mm_set_epi32(300 + 0x3fe, 0, 300 + 0x3fe, 0))),
             vandnotm(m, _mm_sub_epi32(q, _mm_set_epi32(      0x3fe, 0,       0x3fe, 0))));
    q = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(0, 0, 3, 1));
    return q;
}

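// vupper: clears the low 27 mantissa bits of each double - the usual splitting
// step used to get the "upper" half of d for Dekker-style double-double products.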
static INLINE vdouble vupper(vdouble d)
{
    return (__m128d)_mm_and_si128((__m128i)d, _mm_set_epi32(0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000));
}

//

typedef struct {
    vdouble x, y;
} vdouble2;

static INLINE vdouble2 dd(vdouble h, vdouble l)
{
    vdouble2 ret = {h, l};
    return ret;
}

static INLINE vdouble2 vsel2(vmask mask, vdouble2 x, vdouble2 y)
{
    return dd((__m128d)vorm(vandm(mask, (__m128i)x.x), vandnotm(mask, (__m128i)y.x)),
              (__m128d)vorm(vandm(mask, (__m128i)x.y), vandnotm(mask, (__m128i)y.y)));
}

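// abs_d: absolute value of a double-double - flips the sign of both the high
// and the low part wherever the high part is negative, keeping the pair consistent.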
static INLINE vdouble2 abs_d(vdouble2 x)
{
    return dd((__m128d)_mm_xor_pd(_mm_and_pd(_mm_set_pd(-0.0, -0.0), x.x), x.x),
              (__m128d)_mm_xor_pd(_mm_and_pd(_mm_set_pd(-0.0, -0.0), x.x), x.y));
}