////////////////////////////////////////////////////////////////
//
//  this code was taken from http://shibatch.sourceforge.net/
//  Many thanks to the author of the original version: Naoki Shibata
//
//   Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)
//
//  This version contains modifications made by Ingo Weyrich
//
////////////////////////////////////////////////////////////////


#ifndef SLEEFSSEAVX
#define SLEEFSSEAVX

#include <assert.h>
#include "rt_math.h"
#ifdef __SSE2__
#include "helpersse2.h"

#ifdef ENABLE_AVX
#include "helperavx.h"
#endif

#ifdef __GNUC__
#define INLINE __inline
#else
#define INLINE inline
#endif

#define PI4_A .7853981554508209228515625
#define PI4_B .794662735614792836713604629039764404296875e-8
#define PI4_C .306161699786838294306516483068750264552437361480769e-16
#define M_4_PI 1.273239544735162542821171882678754627704620361328125
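
// PI4_A + PI4_B + PI4_C is a three-way split of pi/4 (and PI4_Af..PI4_Df below
// a four-way split in single precision). Each part has enough trailing zero
// bits that q * PI4_x can be subtracted exactly, which is the Cody-Waite style
// argument reduction used by the trigonometric functions further down.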

#define L2U .69314718055966295651160180568695068359375
#define L2L .28235290563031577122588448175013436025525412068e-12
#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931

#define PI4_Af 0.78515625f
#define PI4_Bf 0.00024127960205078125f
#define PI4_Cf 6.3329935073852539062e-07f
#define PI4_Df 4.9604681473525147339e-10f

#define L2Uf 0.693145751953125f
#define L2Lf 1.428606765330187045e-06f
#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f

#define INFINITYf ((float)librtprocess::RT_INFINITY)
#define NANf ((float)librtprocess::RT_NAN)

static INLINE vdouble vadd3(vdouble v0, vdouble v1, vdouble v2) {
  return vadd(vadd(v0, v1), v2);
}

static INLINE vdouble vadd4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
  return vadd3(vadd(v0, v1), v2, v3);
}

static INLINE vdouble vadd5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
  return vadd4(vadd(v0, v1), v2, v3, v4);
}

static INLINE vdouble vadd6(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
  return vadd5(vadd(v0, v1), v2, v3, v4, v5);
}

static INLINE vdouble vadd7(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) {
  return vadd6(vadd(v0, v1), v2, v3, v4, v5, v6);
}

static INLINE vdouble vsub3(vdouble v0, vdouble v1, vdouble v2) {
  return vsub(vsub(v0, v1), v2);
}

static INLINE vdouble vsub4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
  return vsub3(vsub(v0, v1), v2, v3);
}

static INLINE vdouble vsub5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
  return vsub4(vsub(v0, v1), v2, v3, v4);
}

//

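// The vdouble2 helpers below implement double-double ("dd") arithmetic: a
// value is carried as an unevaluated sum x + y, where y holds the rounding
// error of x. add_ss() is the classic Fast2Sum (it assumes |x| >= |y|), while
// the add2_* variants use the branch-free TwoSum formulation that is exact for
// any ordering of the operands.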
static INLINE vdouble2 normalize_d(vdouble2 t) {
  vdouble2 s;

  s.x = vadd(t.x, t.y);
  s.y = vadd(vsub(t.x, s.x), t.y);

  return s;
}

static INLINE vdouble2 scale_d(vdouble2 d, vdouble s) {
  vdouble2 r = {vmul(d.x, s), vmul(d.y, s)};
  return r;
}

static INLINE vdouble2 add_ss(vdouble x, vdouble y) {
  vdouble2 r;

  r.x = vadd(x, y);
  r.y = vadd(vsub(x, r.x), y);

  return r;
}

static INLINE vdouble2 add2_ss(vdouble x, vdouble y) {
  vdouble2 r;

  r.x = vadd(x, y);
  vdouble v = vsub(r.x, x);
  r.y = vadd(vsub(x, vsub(r.x, v)), vsub(y, v));

  return r;
}

static INLINE vdouble2 add_ds(vdouble2 x, vdouble y) {
  vdouble2 r;

  r.x = vadd(x.x, y);
  r.y = vadd3(vsub(x.x, r.x), y, x.y);

  return r;
}

static INLINE vdouble2 add2_ds(vdouble2 x, vdouble y) {
  vdouble2 r;

  r.x = vadd(x.x, y);
  vdouble v = vsub(r.x, x.x);
  r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y, v));
  r.y = vadd(r.y, x.y);

  return r;
}

static INLINE vdouble2 add_sd(vdouble x, vdouble2 y) {
  vdouble2 r;

  r.x = vadd(x, y.x);
  r.y = vadd3(vsub(x, r.x), y.x, y.y);

  return r;
}

static INLINE vdouble2 add_dd(vdouble2 x, vdouble2 y) {
  // |x| >= |y|

  vdouble2 r;

  r.x = vadd(x.x, y.x);
  r.y = vadd4(vsub(x.x, r.x), y.x, x.y, y.y);

  return r;
}

static INLINE vdouble2 add2_dd(vdouble2 x, vdouble2 y) {
  vdouble2 r;

  r.x  = vadd(x.x, y.x);
  vdouble v = vsub(r.x, x.x);
  r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y.x, v));
  r.y = vadd(r.y, vadd(x.y, y.y));

  return r;
}

static INLINE vdouble2 div_dd(vdouble2 n, vdouble2 d) {
  vdouble t = vrec(d.x);
  vdouble dh  = vupper(d.x), dl  = vsub(d.x,  dh);
  vdouble th  = vupper(t  ), tl  = vsub(t  ,  th);
  vdouble nhh = vupper(n.x), nhl = vsub(n.x, nhh);

  vdouble2 q;

  q.x = vmul(n.x, t);

  vdouble u = vadd5(vsub(vmul(nhh, th), q.x), vmul(nhh, tl), vmul(nhl, th), vmul(nhl, tl),
                    vmul(q.x, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl))));

  q.y = vadd(vmul(t, vsub(n.y, vmul(q.x, d.y))), u);

  return q;
}

static INLINE vdouble2 mul_ss(vdouble x, vdouble y) {
  vdouble xh = vupper(x), xl = vsub(x, xh);
  vdouble yh = vupper(y), yl = vsub(y, yh);
  vdouble2 r;

  r.x = vmul(x, y);
  r.y = vadd5(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl));

  return r;
}

static INLINE vdouble2 mul_ds(vdouble2 x, vdouble y) {
  vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
  vdouble yh = vupper(y  ), yl = vsub(y  , yh);
  vdouble2 r;

  r.x = vmul(x.x, y);
  r.y = vadd6(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.y, y));

  return r;
}

static INLINE vdouble2 mul_dd(vdouble2 x, vdouble2 y) {
  vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
  vdouble yh = vupper(y.x), yl = vsub(y.x, yh);
  vdouble2 r;

  r.x = vmul(x.x, y.x);
  r.y = vadd7(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.x, y.y), vmul(x.y, y.x));

  return r;
}

static INLINE vdouble2 squ_d(vdouble2 x) {
  vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
  vdouble2 r;

  r.x = vmul(x.x, x.x);
  r.y = vadd5(vmul(xh, xh), vneg(r.x), vmul(vadd(xh, xh), xl), vmul(xl, xl), vmul(x.x, vadd(x.y, x.y)));

  return r;
}

static INLINE vdouble2 rec_s(vdouble d) {
  vdouble t = vrec(d);
  vdouble dh = vupper(d), dl = vsub(d, dh);
  vdouble th = vupper(t), tl = vsub(t, th);
  vdouble2 q;

  q.x = t;
  q.y = vmul(t, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl)));

  return q;
}

static INLINE vdouble2 sqrt_d(vdouble2 d) {
  vdouble t = vsqrt(vadd(d.x, d.y));
  return scale_d(mul_dd(add2_dd(d, mul_ss(t, t)), rec_s(t)), vcast_vd_d(0.5));
}

//

static INLINE vdouble xldexp(vdouble x, vint q) { return vldexp(x, q); }

static INLINE vint xilogb(vdouble d) {
  vdouble e = vcast_vd_vi(vsubi(vilogbp1(vabs(d)), vcast_vi_i(1)));
  e = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-2147483648.0), e);
  e = vsel(vmask_eq(vabs(d), vcast_vd_d(librtprocess::RT_INFINITY)), vcast_vd_d(2147483647), e);
  return vrint_vi_vd(e);
}

static INLINE vdouble xsin(vdouble d) {
  vint q;
  vdouble u, s;

  q = vrint_vi_vd(vmul(d, vcast_vd_d(librtprocess::RT_1_PI)));

  u = vcast_vd_vi(q);
  d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*4)));
  d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*4)));
  d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*4)));

  s = vmul(d, d);

  d = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vneg(d), d);

  u = vcast_vd_d(-7.97255955009037868891952e-18);
  u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
  u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
  u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
  u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
  u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
  u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
  u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
  u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));

  u = vmla(s, vmul(u, d), d);

  return u;
}
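
// Usage sketch (illustrative): the x* functions operate on whole SIMD vectors,
// e.g.
//   vdouble a = vcast_vd_d(0.25);   // broadcast 0.25 to all lanes
//   vdouble s = xsin(a);            // per-lane sin()
// The reduction above (q = rint(d/pi), then subtracting q*PI4_A*4, q*PI4_B*4
// and q*PI4_C*4 in turn) brings d into a range where the polynomial in
// s = d*d is accurate; the sign flip for odd q restores the correct half-wave.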

static INLINE vdouble xcos(vdouble d) {
  vint q;
  vdouble u, s;

  q = vrint_vi_vd(vsub(vmul(d, vcast_vd_d(librtprocess::RT_1_PI)), vcast_vd_d(0.5)));
  q = vaddi(vaddi(q, q), vcast_vi_i(1));

  u = vcast_vd_vi(q);
  d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2)));
  d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*2)));
  d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*2)));

  s = vmul(d, d);

  d = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(0)), vneg(d), d);

  u = vcast_vd_d(-7.97255955009037868891952e-18);
  u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
  u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
  u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
  u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
  u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
  u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
  u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
  u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));

  u = vmla(s, vmul(u, d), d);

  return u;
}

static INLINE vdouble2 xsincos(vdouble d) {
  vint q;
  vmask m;
  vdouble u, s, t, rx, ry;
  vdouble2 r;

  q = vrint_vi_vd(vmul(d, vcast_vd_d(librtprocess::RT_2_PI)));

  s = d;

  u = vcast_vd_vi(q);
  s = vmla(u, vcast_vd_d(-PI4_A*2), s);
  s = vmla(u, vcast_vd_d(-PI4_B*2), s);
  s = vmla(u, vcast_vd_d(-PI4_C*2), s);

  t = s;

  s = vmul(s, s);

  u = vcast_vd_d(1.58938307283228937328511e-10);
  u = vmla(u, s, vcast_vd_d(-2.50506943502539773349318e-08));
  u = vmla(u, s, vcast_vd_d(2.75573131776846360512547e-06));
  u = vmla(u, s, vcast_vd_d(-0.000198412698278911770864914));
  u = vmla(u, s, vcast_vd_d(0.0083333333333191845961746));
  u = vmla(u, s, vcast_vd_d(-0.166666666666666130709393));
  u = vmul(vmul(u, s), t);

  rx = vadd(t, u);

  u = vcast_vd_d(-1.13615350239097429531523e-11);
  u = vmla(u, s, vcast_vd_d(2.08757471207040055479366e-09));
  u = vmla(u, s, vcast_vd_d(-2.75573144028847567498567e-07));
  u = vmla(u, s, vcast_vd_d(2.48015872890001867311915e-05));
  u = vmla(u, s, vcast_vd_d(-0.00138888888888714019282329));
  u = vmla(u, s, vcast_vd_d(0.0416666666666665519592062));
  u = vmla(u, s, vcast_vd_d(-0.5));

  ry = vadd(vcast_vd_d(1), vmul(s, u));

  m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(0));
  r.x = vsel(m, rx, ry);
  r.y = vsel(m, ry, rx);

  m = vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2));
  r.x = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x)));

  m = vmaski_eq(vandi(vaddi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2));
  r.y = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y)));

  m = vmask_isinf(d);
  r.x = vsel(m, vcast_vd_d(librtprocess::RT_NAN), r.x);
  r.y = vsel(m, vcast_vd_d(librtprocess::RT_NAN), r.y);

  return r;
}

static INLINE vdouble xtan(vdouble d) {
  vint q;
  vdouble u, s, x;
  vmask m;

  q = vrint_vi_vd(vmul(d, vcast_vd_d(librtprocess::RT_2_PI)));

  u = vcast_vd_vi(q);
  x = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2)));
  x = vadd(x, vmul(u, vcast_vd_d(-PI4_B*2)));
  x = vadd(x, vmul(u, vcast_vd_d(-PI4_C*2)));

  s = vmul(x, x);

  m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1));
  x = vsel(m, vneg(x), x);

  u = vcast_vd_d(1.01419718511083373224408e-05);
  u = vmla(u, s, vcast_vd_d(-2.59519791585924697698614e-05));
  u = vmla(u, s, vcast_vd_d(5.23388081915899855325186e-05));
  u = vmla(u, s, vcast_vd_d(-3.05033014433946488225616e-05));
  u = vmla(u, s, vcast_vd_d(7.14707504084242744267497e-05));
  u = vmla(u, s, vcast_vd_d(8.09674518280159187045078e-05));
  u = vmla(u, s, vcast_vd_d(0.000244884931879331847054404));
  u = vmla(u, s, vcast_vd_d(0.000588505168743587154904506));
  u = vmla(u, s, vcast_vd_d(0.00145612788922812427978848));
  u = vmla(u, s, vcast_vd_d(0.00359208743836906619142924));
  u = vmla(u, s, vcast_vd_d(0.00886323944362401618113356));
  u = vmla(u, s, vcast_vd_d(0.0218694882853846389592078));
  u = vmla(u, s, vcast_vd_d(0.0539682539781298417636002));
  u = vmla(u, s, vcast_vd_d(0.133333333333125941821962));
  u = vmla(u, s, vcast_vd_d(0.333333333333334980164153));

  u = vmla(s, vmul(u, x), x);

  u = vsel(m, vrec(u), u);

  u = vsel(vmask_isinf(d), vcast_vd_d(librtprocess::RT_NAN), u);

  return u;
}

static INLINE vdouble atan2k(vdouble y, vdouble x) {
  vdouble s, t, u;
  vint q;
  vmask p;

  q = vseli_lt(x, vcast_vd_d(0), vcast_vi_i(-2), vcast_vi_i(0));
  x = vabs(x);

  q = vseli_lt(x, y, vaddi(q, vcast_vi_i(1)), q);
  p = vmask_lt(x, y);
  s = vsel(p, vneg(x), y);
  t = vmax(x, y);

  s = vdiv(s, t);
  t = vmul(s, s);

  u = vcast_vd_d(-1.88796008463073496563746e-05);
  u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
  u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
  u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
  u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
  u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
  u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
  u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
  u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
  u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
  u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
  u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
  u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
  u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
  u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
  u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
  u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
  u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
  u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));

  t = vadd(s, vmul(s, vmul(t, u)));
  t = vadd(t, vmul(vcast_vd_vi(q), vcast_vd_d(librtprocess::RT_PI/2)));

  return t;
}

static INLINE vdouble xatan2(vdouble y, vdouble x) {
  vdouble r = atan2k(vabs(y), x);

  r = vmulsign(r, x);
  r = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))), vsub(vcast_vd_d(librtprocess::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(librtprocess::RT_PI/2), x))), r);
  r = vsel(vmask_isinf(y), vsub(vcast_vd_d(librtprocess::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(librtprocess::RT_PI/4), x))), r);
  r = vsel(vmask_eq(y, vcast_vd_d(0)), vsel(vmask_eq(vsign(x), vcast_vd_d(-1.0)), vcast_vd_d(librtprocess::RT_PI), vcast_vd_d(0)), r);

  return vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(librtprocess::RT_NAN), vmulsign(r, y));
}

static INLINE vdouble xasin(vdouble d) {
  vdouble x, y;
  x = vadd(vcast_vd_d(1), d);
  y = vsub(vcast_vd_d(1), d);
  x = vmul(x, y);
  x = vsqrt(x);
  x = vsel(vmask_isnan(x), vcast_vd_d(librtprocess::RT_NAN), atan2k(vabs(d), x));
  return vmulsign(x, d);
}

static INLINE vdouble xacos(vdouble d) {
  vdouble x, y;
  x = vadd(vcast_vd_d(1), d);
  y = vsub(vcast_vd_d(1), d);
  x = vmul(x, y);
  x = vsqrt(x);
  x = vmulsign(atan2k(x, vabs(d)), d);
  y = (vdouble)vandm(vmask_lt(d, vcast_vd_d(0)), (vmask)vcast_vd_d(librtprocess::RT_PI));
  x = vadd(x, y);
  return x;
}

static INLINE vdouble xatan(vdouble s) {
  vdouble t, u;
  vint q;

  q = vseli_lt(s, vcast_vd_d(0), vcast_vi_i(2), vcast_vi_i(0));
  s = vabs(s);

  q = vseli_lt(vcast_vd_d(1), s, vaddi(q, vcast_vi_i(1)), q);
  s = vsel(vmask_lt(vcast_vd_d(1), s), vdiv(vcast_vd_d(1), s), s);

  t = vmul(s, s);

  u = vcast_vd_d(-1.88796008463073496563746e-05);
  u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
  u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
  u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
  u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
  u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
  u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
  u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
  u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
  u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
  u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
  u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
  u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
  u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
  u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
  u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
  u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
  u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
  u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));

  t = vadd(s, vmul(s, vmul(t, u)));

  t = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vsub(vcast_vd_d(librtprocess::RT_PI/2), t), t);
  t = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2)), vneg(t), t);

  return t;
}

static INLINE vdouble xlog(vdouble d) {
  vdouble x, x2;
  vdouble t, m;
  vint e;

  e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
  m = vldexp(d, vsubi(vcast_vi_i(0), e));

  x = vdiv(vadd(vcast_vd_d(-1), m), vadd(vcast_vd_d(1), m));
  x2 = vmul(x, x);

  t = vcast_vd_d(0.148197055177935105296783);
  t = vmla(t, x2, vcast_vd_d(0.153108178020442575739679));
  t = vmla(t, x2, vcast_vd_d(0.181837339521549679055568));
  t = vmla(t, x2, vcast_vd_d(0.22222194152736701733275));
  t = vmla(t, x2, vcast_vd_d(0.285714288030134544449368));
  t = vmla(t, x2, vcast_vd_d(0.399999999989941956712869));
  t = vmla(t, x2, vcast_vd_d(0.666666666666685503450651));
  t = vmla(t, x2, vcast_vd_d(2));

  x = vadd(vmul(x, t), vmul(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e)));

  x = vsel(vmask_ispinf(d), vcast_vd_d(librtprocess::RT_INFINITY), x);
  x = vsel(vmask_gt(vcast_vd_d(0), d), vcast_vd_d(librtprocess::RT_NAN), x);
  x = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-librtprocess::RT_INFINITY), x);

  return x;
}

static INLINE vdouble xexp(vdouble d) {
  vint q = vrint_vi_vd(vmul(d, vcast_vd_d(R_LN2)));
  vdouble s, u;

  s = vadd(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
  s = vadd(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));

  u = vcast_vd_d(2.08860621107283687536341e-09);
  u = vmla(u, s, vcast_vd_d(2.51112930892876518610661e-08));
  u = vmla(u, s, vcast_vd_d(2.75573911234900471893338e-07));
  u = vmla(u, s, vcast_vd_d(2.75572362911928827629423e-06));
  u = vmla(u, s, vcast_vd_d(2.4801587159235472998791e-05));
  u = vmla(u, s, vcast_vd_d(0.000198412698960509205564975));
  u = vmla(u, s, vcast_vd_d(0.00138888888889774492207962));
  u = vmla(u, s, vcast_vd_d(0.00833333333331652721664984));
  u = vmla(u, s, vcast_vd_d(0.0416666666666665047591422));
  u = vmla(u, s, vcast_vd_d(0.166666666666666851703837));
  u = vmla(u, s, vcast_vd_d(0.5));

  u = vadd(vcast_vd_d(1), vadd(s, vmul(vmul(s, s), u)));

  u = vldexp(u, q);

  u = vsel(vmask_isminf(d), vcast_vd_d(0), u);

  return u;
}

static INLINE vdouble2 logk(vdouble d) {
  vdouble2 x, x2;
  vdouble t, m;
  vint e;

  e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
  m = vldexp(d, vsubi(vcast_vi_i(0), e));

  x = div_dd(add2_ss(vcast_vd_d(-1), m), add2_ss(vcast_vd_d(1), m));
  x2 = squ_d(x);
  x2 = normalize_d(x2);

  t = vcast_vd_d(0.134601987501262130076155);
  t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
  t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
  t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
  t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
  t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
  t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
  t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));

  return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
                        vcast_vd_vi(e)),
                 add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
}
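
// logk() above and expk() below are the double-double counterparts of
// log()/exp(): logk() returns its result as a vdouble2 (head plus error term)
// and expk() consumes one, so that combinations such as exp(y * log(x)) in
// xpow() retain close to full double precision.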

static INLINE vdouble expk(vdouble2 d) {
  vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
  vint q = vrint_vi_vd(u);
  vdouble2 s, t;

  s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
  s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));

  q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));

  s = normalize_d(s);

  u = vcast_vd_d(2.51069683420950419527139e-08);
  u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
  u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
  u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
  u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
  u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
  u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
  u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
  u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
  u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));

  t = add_dd(s, mul_ds(squ_d(s), u));

  t = add_sd(vcast_vd_d(1), t);
  u = vadd(t.x, t.y);
  u = vldexp(u, q);

  return u;
}

static INLINE vdouble xpow(vdouble x, vdouble y) {
#if 1
  vmask yisint = vmask_eq(vcast_vd_vi(vrint_vi_vd(y)), y);
  vmask yisodd = vandm(vmaski_eq(vandi(vrint_vi_vd(y), vcast_vi_i(1)), vcast_vi_i(1)), yisint);

  vdouble result = expk(mul_ds(logk(vabs(x)), y));

  //result = vsel(vmask_isnan(result), vcast_vd_d(librtprocess::RT_INFINITY), result);

  result = vmul(result,
                vsel(vmask_gt(x, vcast_vd_d(0)),
                     vcast_vd_d(1),
                     vsel(yisint,
                          vsel(yisodd,
                               vcast_vd_d(-1),
                               vcast_vd_d(1)),
                          vcast_vd_d(librtprocess::RT_NAN))));

  vdouble efx = vreinterpret_vd_vm(vxorm(vreinterpret_vm_vd(vsub(vabs(x), vcast_vd_d(1))), vsignbit(y)));

  result = vsel(vmask_isinf(y),
                vsel(vmask_lt(efx, vcast_vd_d(0)),
                     vcast_vd_d(0),
                     vsel(vmask_eq(efx, vcast_vd_d(0)),
                          vcast_vd_d(1.0),
                          vcast_vd_d(librtprocess::RT_INFINITY))),
                result);

  result = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))),
                vmul(vsel(yisodd, vsign(x), vcast_vd_d(1)),
                     vsel(vmask_lt(vsel(vmask_eq(x, vcast_vd_d(0)), vneg(y), y), vcast_vd_d(0)),
                          vcast_vd_d(0),
                          vcast_vd_d(librtprocess::RT_INFINITY))),
                result);

  result = vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(librtprocess::RT_NAN), result);

  result = vsel(vorm(vmask_eq(y, vcast_vd_d(0)), vmask_eq(x, vcast_vd_d(1))), vcast_vd_d(1), result);

  return result;
#else
  return expk(mul_ds(logk(x), y));
#endif
}
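
// The core of xpow() is expk(mul_ds(logk(|x|), y)), i.e. pow(x, y) =
// exp(y * log|x|) evaluated in double-double; the surrounding vsel() chains
// only patch up the sign (odd integer y with negative x) and the IEEE special
// cases (y == 0, x == 1, infinities, zeros and NaNs).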

static INLINE vdouble2 expk2(vdouble2 d) {
  vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
  vint q = vrint_vi_vd(u);
  vdouble2 s, t;

  s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
  s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));

  q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));

  s = normalize_d(s);

  u = vcast_vd_d(2.51069683420950419527139e-08);
  u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
  u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
  u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
  u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
  u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
  u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
  u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
  u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
  u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));

  t = add_dd(s, mul_ds(squ_d(s), u));

  t = add_sd(vcast_vd_d(1), t);

  return dd(vldexp(t.x, q), vldexp(t.y, q));
}

static INLINE vdouble xsinh(vdouble x) {
  vdouble y = vabs(x);
  vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
  d = add2_dd(d, div_dd(dd(vcast_vd_d(-1), vcast_vd_d(0)), d));
  y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));

  y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(librtprocess::RT_INFINITY), y);
  y = vmulsign(y, x);
  y = vsel(vmask_isnan(x), vcast_vd_d(librtprocess::RT_NAN), y);

  return y;
}

static INLINE vdouble xcosh(vdouble x) {
  vdouble2 d = expk2(dd(x, vcast_vd_d(0)));
  d = add2_dd(d, div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d));
  vdouble y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));

  y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(librtprocess::RT_INFINITY), y);
  y = vsel(vmask_isnan(x), vcast_vd_d(librtprocess::RT_NAN), y);

  return y;
}

static INLINE vdouble xtanh(vdouble x) {
  vdouble y = vabs(x);
  vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
  vdouble2 e = div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d);
  d = div_dd(add2_dd(d, scale_d(e, vcast_vd_d(-1))), add2_dd(d, e));
  y = d.x + d.y;

  y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(1.0), y);
  y = vmulsign(y, x);
  y = vsel(vmask_isnan(x), vcast_vd_d(librtprocess::RT_NAN), y);

  return y;
}

static INLINE vdouble2 logk2(vdouble2 d) {
  vdouble2 x, x2, m;
  vdouble t;
  vint e;

  d = normalize_d(d);
  e = vilogbp1(vmul(d.x, vcast_vd_d(0.7071)));
  m = scale_d(d, vldexp(vcast_vd_d(1), vsubi(vcast_vi_i(0), e)));

  x = div_dd(add2_ds(m, vcast_vd_d(-1)), add2_ds(m, vcast_vd_d(1)));
  x2 = squ_d(x);
  x2 = normalize_d(x2);

  t = vcast_vd_d(0.134601987501262130076155);
  t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
  t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
  t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
  t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
  t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
  t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
  t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));

  return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
                        vcast_vd_vi(e)),
                 add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
}

static INLINE vdouble xasinh(vdouble x) {
  vdouble y = vabs(x);
  vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(y, y),  vcast_vd_d(1))), y));
  y = vadd(d.x, d.y);

  y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(librtprocess::RT_INFINITY), y);
  y = vmulsign(y, x);
  y = vsel(vmask_isnan(x), vcast_vd_d(librtprocess::RT_NAN), y);

  return y;
}

static INLINE vdouble xacosh(vdouble x) {
  vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(x, x), vcast_vd_d(-1))), x));
  vdouble y = vadd(d.x, d.y);

  y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(librtprocess::RT_INFINITY), y);
  y = vsel(vmask_eq(x, vcast_vd_d(1.0)), vcast_vd_d(0.0), y);
  y = vsel(vmask_lt(x, vcast_vd_d(1.0)), vcast_vd_d(librtprocess::RT_NAN), y);
  y = vsel(vmask_isnan(x), vcast_vd_d(librtprocess::RT_NAN), y);

  return y;
}

static INLINE vdouble xatanh(vdouble x) {
  vdouble y = vabs(x);
  vdouble2 d = logk2(div_dd(add2_ss(vcast_vd_d(1), y), add2_ss(vcast_vd_d(1), -y)));
  y = vsel(vmask_gt(y, vcast_vd_d(1.0)), vcast_vd_d(librtprocess::RT_NAN), vsel(vmask_eq(y, vcast_vd_d(1.0)), vcast_vd_d(librtprocess::RT_INFINITY), vmul(vadd(d.x, d.y), vcast_vd_d(0.5))));

  y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(librtprocess::RT_NAN), y);
  y = vmulsign(y, x);
  y = vsel(vmask_isnan(x), vcast_vd_d(librtprocess::RT_NAN), y);

  return y;
}

static INLINE vdouble xcbrt(vdouble d) {
  vdouble x, y, q = vcast_vd_d(1.0);
  vint e, qu, re;
  vdouble t;

  e = vilogbp1(vabs(d));
  d = vldexp(d, vsubi(vcast_vi_i(0), e));

  t = vadd(vcast_vd_vi(e), vcast_vd_d(6144));
  qu = vtruncate_vi_vd(vdiv(t, vcast_vd_d(3)));
  re = vtruncate_vi_vd(vsub(t, vmul(vcast_vd_vi(qu), vcast_vd_d(3))));

  q = vsel(vmaski_eq(re, vcast_vi_i(1)), vcast_vd_d(1.2599210498948731647672106), q);
  q = vsel(vmaski_eq(re, vcast_vi_i(2)), vcast_vd_d(1.5874010519681994747517056), q);
  q = vldexp(q, vsubi(qu, vcast_vi_i(2048)));

  q = vmulsign(q, d);

  d = vabs(d);

  x = vcast_vd_d(-0.640245898480692909870982);
  x = vmla(x, d, vcast_vd_d(2.96155103020039511818595));
  x = vmla(x, d, vcast_vd_d(-5.73353060922947843636166));
  x = vmla(x, d, vcast_vd_d(6.03990368989458747961407));
  x = vmla(x, d, vcast_vd_d(-3.85841935510444988821632));
  x = vmla(x, d, vcast_vd_d(2.2307275302496609725722));

  y = vmul(x, x); y = vmul(y, y); x = vsub(x, vmul(vmla(d, y, vneg(x)), vcast_vd_d(1.0 / 3.0)));
  y = vmul(vmul(d, x), x);
  y = vmul(vsub(y, vmul(vmul(vcast_vd_d(2.0 / 3.0), y), vmla(y, x, vcast_vd_d(-1.0)))), q);

  return y;
}

static INLINE vdouble xexp2(vdouble a) {
  vdouble u = expk(mul_ds(dd(vcast_vd_d(0.69314718055994528623), vcast_vd_d(2.3190468138462995584e-17)), a));
  u = vsel(vmask_ispinf(a), vcast_vd_d(librtprocess::RT_INFINITY), u);
  u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
  return u;
}

static INLINE vdouble xexp10(vdouble a) {
  vdouble u = expk(mul_ds(dd(vcast_vd_d(2.3025850929940459011), vcast_vd_d(-2.1707562233822493508e-16)), a));
  u = vsel(vmask_ispinf(a), vcast_vd_d(librtprocess::RT_INFINITY), u);
  u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
  return u;
}

static INLINE vdouble xexpm1(vdouble a) {
  vdouble2 d = add2_ds(expk2(dd(a, vcast_vd_d(0))), vcast_vd_d(-1.0));
  vdouble x = d.x + d.y;
  x = vsel(vmask_ispinf(a), vcast_vd_d(librtprocess::RT_INFINITY), x);
  x = vsel(vmask_isminf(a), vcast_vd_d(-1), x);
  return x;
}

static INLINE vdouble xlog10(vdouble a) {
  vdouble2 d = mul_dd(logk(a), dd(vcast_vd_d(0.43429448190325176116), vcast_vd_d(6.6494347733425473126e-17)));
  vdouble x = d.x + d.y;

  x = vsel(vmask_ispinf(a), vcast_vd_d(librtprocess::RT_INFINITY), x);
  x = vsel(vmask_gt(vcast_vd_d(0), a), vcast_vd_d(librtprocess::RT_NAN), x);
  x = vsel(vmask_eq(a, vcast_vd_d(0)), vcast_vd_d(-librtprocess::RT_INFINITY), x);

  return x;
}

static INLINE vdouble xlog1p(vdouble a) {
  vdouble2 d = logk2(add2_ss(a, vcast_vd_d(1)));
  vdouble x = d.x + d.y;

  x = vsel(vmask_ispinf(a), vcast_vd_d(librtprocess::RT_INFINITY), x);
  x = vsel(vmask_gt(vcast_vd_d(-1), a), vcast_vd_d(librtprocess::RT_NAN), x);
  x = vsel(vmask_eq(a, vcast_vd_d(-1)), vcast_vd_d(-librtprocess::RT_INFINITY), x);

  return x;
}

//

typedef struct {
  vfloat x, y;
} vfloat2;

static INLINE vfloat vabsf(vfloat f) { return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f); }
static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vcast_vf_f(-0.0f)); }

#ifdef __SSE4_1__
	// only one instruction when using SSE4.1
	static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
		return _mm_blendv_ps(y,x,(vfloat)mask);
	}

	static INLINE vint vselc(vmask mask, vint x, vint y) {
		return _mm_blendv_epi8(y,x,mask);
	}

#else
	// three instructions when using SSE2
	static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
		return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
	}

	static INLINE vint vselc(vmask mask, vint x, vint y) {
		return vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
	}
#endif

static INLINE vfloat vselfzero(vmask mask, vfloat x) {
    // returns value of x if corresponding mask bits are 1, else returns 0
    // faster than vself(mask, x, ZEROV)
    return _mm_and_ps((vfloat)mask, x);
}
static INLINE vfloat vselfnotzero(vmask mask, vfloat x) {
    // returns value of x if corresponding mask bits are 0, else returns 0
    // faster than vself(mask, ZEROV, x)
    return _mm_andnot_ps((vfloat)mask, x);
}

static INLINE vint vselizero(vmask mask, vint x) {
    // returns value of x if corresponding mask bits are 1, else returns 0
    // faster than vselc(mask, x, ZEROV)
    return _mm_and_si128(mask, x);
}
static INLINE vint vselinotzero(vmask mask, vint x) {
    // returns value of x if corresponding mask bits are 0, else returns 0
    // faster than vselc(mask, ZEROV, x)
    return _mm_andnot_si128(mask, x);
}

static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) {
  vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1));
  return vori2(vandi2(m2, x), vandnoti2(m2, y));
}

static INLINE vmask vsignbitf(vfloat f) {
  return vandm((vmask)f, (vmask)vcast_vf_f(-0.0f));
}

static INLINE vfloat vmulsignf(vfloat x, vfloat y) {
  return (vfloat)vxorm((vmask)x, vsignbitf(y));
}

static INLINE vfloat vsignf(vfloat f) {
  return (vfloat)vorm((vmask)vcast_vf_f(1.0f), vandm((vmask)vcast_vf_f(-0.0f), (vmask)f));
}

static INLINE vmask vmaskf_isinf(vfloat d) { return vmaskf_eq(vabsf(d), vcast_vf_f(INFINITYf)); }
static INLINE vmask vmaskf_ispinf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(INFINITYf)); }
static INLINE vmask vmaskf_isminf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(-INFINITYf)); }
static INLINE vmask vmaskf_isnan(vfloat d) { return vmaskf_neq(d, d); }
// the following is equivalent to vorm(vmaskf_isnan(a), vmaskf_isnan(b)), but faster
static INLINE vmask vmaskf_isnan(vfloat a, vfloat b) { return (vmask)_mm_cmpunord_ps(a, b); }
static INLINE vfloat visinf2f(vfloat d, vfloat m) { return (vfloat)vandm(vmaskf_isinf(d), vorm(vsignbitf(d), (vmask)m)); }
static INLINE vfloat visinff(vfloat d) { return visinf2f(d, vcast_vf_f(1.0f)); }

static INLINE vint2 vilogbp1f(vfloat d) {
  vmask m = vmaskf_lt(d, vcast_vf_f(5.421010862427522E-20f));
  d = vself(m, vmulf(vcast_vf_f(1.8446744073709552E19f), d), d);
  vint2 q = vandi2(vsrli2(vcast_vi2_vm(vreinterpret_vm_vf(d)), 23), vcast_vi2_i(0xff));
  q = vsubi2(q, vseli2(m, vcast_vi2_i(64 + 0x7e), vcast_vi2_i(0x7e)));
  return q;
}

static INLINE vfloat vldexpf(vfloat x, vint2 q) {
  vfloat u;
  vint2 m = vsrai2(q, 31);
  m = vslli2(vsubi2(vsrai2(vaddi2(m, q), 6), m), 4);
  q = vsubi2(q, vslli2(m, 2));
  u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(m, vcast_vi2_i(0x7f)), 23)));
  x = vmulf(vmulf(vmulf(vmulf(x, u), u), u), u);
  u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(q, vcast_vi2_i(0x7f)), 23)));
  return vmulf(x, u);
}

static INLINE vfloat xsinf(vfloat d) {
  vint2 q;
  vfloat u, s;

  q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)librtprocess::RT_1_PI)));

  u = vcast_vf_vi2(q);
  d = vmlaf(u, vcast_vf_f(-PI4_Af*4), d);
  d = vmlaf(u, vcast_vf_f(-PI4_Bf*4), d);
  d = vmlaf(u, vcast_vf_f(-PI4_Cf*4), d);
  d = vmlaf(u, vcast_vf_f(-PI4_Df*4), d);

  s = vmulf(d, d);

  d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vnegf(d), d);

  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
  u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
  u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));

  u = vmlaf(s, vmulf(u, d), d);

  return u;
}

static INLINE vfloat xcosf(vfloat d) {
  vint2 q;
  vfloat u, s;

  q = vrint_vi2_vf(vsubf(vmulf(d, vcast_vf_f((float)librtprocess::RT_1_PI)), vcast_vf_f(0.5f)));
  q = vaddi2(vaddi2(q, q), vcast_vi2_i(1));

  u = vcast_vf_vi2(q);
  d = vmlaf(u, vcast_vf_f(-PI4_Af*2), d);
  d = vmlaf(u, vcast_vf_f(-PI4_Bf*2), d);
  d = vmlaf(u, vcast_vf_f(-PI4_Cf*2), d);
  d = vmlaf(u, vcast_vf_f(-PI4_Df*2), d);

  s = vmulf(d, d);

  d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), d, vnegf(d));

  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
  u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
  u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));

  u = vmlaf(s, vmulf(u, d), d);

  return u;
}

static INLINE vfloat2 xsincosf(vfloat d) {
  vint2 q;
  vmask m;
  vfloat u, s, t, rx, ry;
  vfloat2 r;

  q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)librtprocess::RT_2_PI)));

  s = d;

  u = vcast_vf_vi2(q);
  s = vmlaf(u, vcast_vf_f(-PI4_Af*2), s);
  s = vmlaf(u, vcast_vf_f(-PI4_Bf*2), s);
  s = vmlaf(u, vcast_vf_f(-PI4_Cf*2), s);
  s = vmlaf(u, vcast_vf_f(-PI4_Df*2), s);

  t = s;

  s = vmulf(s, s);

  u = vcast_vf_f(-0.000195169282960705459117889f);
  u = vmlaf(u, s, vcast_vf_f(0.00833215750753879547119141f));
  u = vmlaf(u, s, vcast_vf_f(-0.166666537523269653320312f));
  u = vmulf(vmulf(u, s), t);

  rx = vaddf(t, u);

  u = vcast_vf_f(-2.71811842367242206819355e-07f);
  u = vmlaf(u, s, vcast_vf_f(2.47990446951007470488548e-05f));
  u = vmlaf(u, s, vcast_vf_f(-0.00138888787478208541870117f));
  u = vmlaf(u, s, vcast_vf_f(0.0416666641831398010253906f));
  u = vmlaf(u, s, vcast_vf_f(-0.5));

  ry = vaddf(vcast_vf_f(1), vmulf(s, u));

  m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
  r.x = vself(m, rx, ry);
  r.y = vself(m, ry, rx);

  m = vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
  r.x = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.x)));

  m = vmaski2_eq(vandi2(vaddi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
  r.y = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y)));

  m = vmaskf_isinf(d);
  r.x = vself(m, vcast_vf_f(librtprocess::RT_NAN), r.x);
  r.y = vself(m, vcast_vf_f(librtprocess::RT_NAN), r.y);

  return r;
}

static INLINE vfloat xtanf(vfloat d) {
  vint2 q;
  vmask m;
  vfloat u, s, x;

  q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)(2 * librtprocess::RT_1_PI))));

  x = d;

  u = vcast_vf_vi2(q);
  x = vmlaf(u, vcast_vf_f(-PI4_Af*2), x);
  x = vmlaf(u, vcast_vf_f(-PI4_Bf*2), x);
  x = vmlaf(u, vcast_vf_f(-PI4_Cf*2), x);
  x = vmlaf(u, vcast_vf_f(-PI4_Df*2), x);

  s = vmulf(x, x);

  m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
  x = vself(m, vnegf(x), x);

  u = vcast_vf_f(0.00927245803177356719970703f);
  u = vmlaf(u, s, vcast_vf_f(0.00331984995864331722259521f));
  u = vmlaf(u, s, vcast_vf_f(0.0242998078465461730957031f));
  u = vmlaf(u, s, vcast_vf_f(0.0534495301544666290283203f));
  u = vmlaf(u, s, vcast_vf_f(0.133383005857467651367188f));
  u = vmlaf(u, s, vcast_vf_f(0.333331853151321411132812f));

  u = vmlaf(s, vmulf(u, x), x);

  u = vself(m, vrecf(u), u);

  u = vself(vmaskf_isinf(d), vcast_vf_f(NANf), u);

  return u;
}

static INLINE vfloat xatanf(vfloat s) {
  vfloat t, u;
  vint2 q;

  q = vseli2_lt(s, vcast_vf_f(0.0f), vcast_vi2_i(2), vcast_vi2_i(0));
  s = vabsf(s);

  q = vseli2_lt(vcast_vf_f(1.0f), s, vaddi2(q, vcast_vi2_i(1)), q);
  s = vself(vmaskf_lt(vcast_vf_f(1.0f), s), vdivf(vcast_vf_f(1.0f), s), s);

  t = vmulf(s, s);

  u = vcast_vf_f(0.00282363896258175373077393f);
  u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
  u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
  u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
  u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
  u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
  u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
  u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));

  t = vaddf(s, vmulf(s, vmulf(t, u)));

  t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vsubf(vcast_vf_f((float)(librtprocess::RT_PI/2)), t), t);
  t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), vnegf(t), t);

  return t;
}

static INLINE vfloat atan2kf(vfloat y, vfloat x) {
  vfloat s, t, u;
  vint2 q;
  vmask p;

  q = vseli2_lt(x, vcast_vf_f(0.0f), vcast_vi2_i(-2), vcast_vi2_i(0));
  x = vabsf(x);

  q = vseli2_lt(x, y, vaddi2(q, vcast_vi2_i(1)), q);
  p = vmaskf_lt(x, y);
  s = vself(p, vnegf(x), y);
  t = vmaxf(x, y);

  s = vdivf(s, t);
  t = vmulf(s, s);

  u = vcast_vf_f(0.00282363896258175373077393f);
  u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
  u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
  u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
  u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
  u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
  u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
  u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));

  t = vaddf(s, vmulf(s, vmulf(t, u)));
  t = vaddf(t, vmulf(vcast_vf_vi2(q), vcast_vf_f((float)(librtprocess::RT_PI/2))));

  return t;
}

static INLINE vfloat xatan2f(vfloat y, vfloat x) {
  vfloat r = atan2kf(vabsf(y), x);

  r = vmulsignf(r, x);
  r = vself(vorm(vmaskf_isinf(x), vmaskf_eq(x, vcast_vf_f(0.0f))), vsubf(vcast_vf_f((float)(librtprocess::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(librtprocess::RT_PI/2)), x))), r);
  r = vself(vmaskf_isinf(y), vsubf(vcast_vf_f((float)(librtprocess::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(librtprocess::RT_PI/4)), x))), r);
  r = vself(vmaskf_eq(y, vcast_vf_f(0.0f)), vselfzero(vmaskf_eq(vsignf(x), vcast_vf_f(-1.0f)), vcast_vf_f((float)librtprocess::RT_PI)), r);

  return vself(vmaskf_isnan(x, y), vcast_vf_f(NANf), vmulsignf(r, y));
}

static INLINE vfloat xasinf(vfloat d) {
  vfloat x, y;
  x = vaddf(vcast_vf_f(1.0f), d);
  y = vsubf(vcast_vf_f(1.0f), d);
  x = vmulf(x, y);
  x = vsqrtf(x);
  x = vself(vmaskf_isnan(x), vcast_vf_f(NANf), atan2kf(vabsf(d), x));
  return vmulsignf(x, d);
}

static INLINE vfloat xacosf(vfloat d) {
  vfloat x, y;
  x = vaddf(vcast_vf_f(1.0f), d);
  y = vsubf(vcast_vf_f(1.0f), d);
  x = vmulf(x, y);
  x = vsqrtf(x);
  x = vmulsignf(atan2kf(x, vabsf(d)), d);
  y = (vfloat)vandm(vmaskf_lt(d, vcast_vf_f(0.0f)), (vmask)vcast_vf_f((float)librtprocess::RT_PI));
  x = vaddf(x, y);
  return x;
}

static INLINE vfloat xlogf(vfloat d) {
  vfloat x, x2, t, m;
  vint2 e;

  e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
  m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));

  x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
  x2 = vmulf(x, x);

  t = vcast_vf_f(0.2371599674224853515625f);
  t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
  t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
  t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
  t = vmlaf(t, x2, vcast_vf_f(2.0f));

  x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));

  x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x);
  x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(NANf), x);
  x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(-INFINITYf), x);

  return x;
}

static INLINE vfloat xlogf0(vfloat d) {
  vfloat x, x2, t, m;
  vint2 e;

  e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
  m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));

  x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
  x2 = vmulf(x, x);

  t = vcast_vf_f(0.2371599674224853515625f);
  t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
  t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
  t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
  t = vmlaf(t, x2, vcast_vf_f(2.0f));

  x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));

  x = vself(vmaskf_ispinf(d), vcast_vf_f(0), x);
  x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(0), x);
  x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(0), x);

  return x;
}

static INLINE vfloat xlogfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > 0, e.g. when filling a lookup table
  vfloat x, x2, t, m;
  vint2 e;

  e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
  m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));

  x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
  x2 = vmulf(x, x);

  t = vcast_vf_f(0.2371599674224853515625f);
  t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
  t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
  t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
  t = vmlaf(t, x2, vcast_vf_f(2.0f));

  return vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));

}

static INLINE vfloat xexpf(vfloat d) {
  vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
  vfloat s, u;

  s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d);
  s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);

  u = vcast_vf_f(0.00136324646882712841033936f);
  u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
  u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
  u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
  u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));

  u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));

  u = vldexpf(u, q);

  // return zero for arguments below -104.0, where expf() underflows to zero anyway
  return vselfnotzero(vmaskf_gt(vcast_vf_f(-104.f), d), u);
}

static INLINE vfloat xexpfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > -104.f, e.g. when filling a lookup table
  vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
  vfloat s, u;

  s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d);
  s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);

  u = vcast_vf_f(0.00136324646882712841033936f);
  u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
  u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
  u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
  u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));

  u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));

  return vldexpf(u, q);
}

static INLINE vfloat xcbrtf(vfloat d) {
  vfloat x, y, q = vcast_vf_f(1.0), t;
  vint2 e, qu, re;

  e = vilogbp1f(vabsf(d));
  d = vldexpf(d, vsubi2(vcast_vi2_i(0), e));

  t = vaddf(vcast_vf_vi2(e), vcast_vf_f(6144));
  qu = vtruncate_vi2_vf(vdivf(t, vcast_vf_f(3)));
  re = vtruncate_vi2_vf(vsubf(t, vmulf(vcast_vf_vi2(qu), vcast_vf_f(3))));

  q = vself(vmaski2_eq(re, vcast_vi2_i(1)), vcast_vf_f(1.2599210498948731647672106f), q);
  q = vself(vmaski2_eq(re, vcast_vi2_i(2)), vcast_vf_f(1.5874010519681994747517056f), q);
  q = vldexpf(q, vsubi2(qu, vcast_vi2_i(2048)));

  q = vmulsignf(q, d);
  d = vabsf(d);

  x = vcast_vf_f(-0.601564466953277587890625f);
  x = vmlaf(x, d, vcast_vf_f(2.8208892345428466796875f));
  x = vmlaf(x, d, vcast_vf_f(-5.532182216644287109375f));
  x = vmlaf(x, d, vcast_vf_f(5.898262500762939453125f));
  x = vmlaf(x, d, vcast_vf_f(-3.8095417022705078125f));
  x = vmlaf(x, d, vcast_vf_f(2.2241256237030029296875f));

  y = vmulf(vmulf(d, x), x);
  y = vmulf(vsubf(y, vmulf(vmulf(vcast_vf_f(2.0f / 3.0f), y), vmlaf(y, x, vcast_vf_f(-1.0f)))), q);

  return y;
}

static INLINE vfloat LIMV( vfloat a, vfloat b, vfloat c ) {
    return vmaxf( b, vminf(a,c));
}

static INLINE vfloat SQRV(vfloat a){
    return a * a;
}

static inline void vswap( vmask condition, vfloat &a, vfloat &b) {
    // conditionally swap the elements of two vfloats
    vfloat temp = vself(condition, a, b); // a where the condition is set, b elsewhere
    a = vself(condition, b, a);           // b where the condition is set, a elsewhere
    b = temp;
}
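
// Example (illustrative): swap a and b only in lanes where a > b, so that
// afterwards a holds the per-lane minimum and b the per-lane maximum:
//   vswap(vmaskf_gt(a, b), a, b);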

static inline float vhadd( vfloat a ) {
    // returns a[0] + a[1] + a[2] + a[3]
    a += _mm_movehl_ps(a, a);
    return _mm_cvtss_f32(_mm_add_ss(a, _mm_shuffle_ps(a, a, 1)));
}
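
// How it works: _mm_movehl_ps folds the upper two lanes onto the lower two
// (giving {a0+a2, a1+a3, ...}), then the shuffle moves lane 1 into lane 0 so a
// single scalar add produces (a0+a2) + (a1+a3).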

static INLINE vfloat vmul2f(vfloat a){
    // fastest way to multiply by 2
    return a + a;
}

static INLINE vfloat vintpf(vfloat a, vfloat b, vfloat c) {
    // calculate a * b + (1 - a) * c (interpolate two values)
    // the following identities hold:
    // vintpf(a, b+x, c+x) = vintpf(a, b, c) + x
    // vintpf(a, b*x, c*x) = vintpf(a, b, c) * x
    return a * (b-c) + c;
}
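
// Example (illustrative): blend two values with a per-lane weight w in [0,1];
// w == 1 selects hi, w == 0 selects lo, anything in between interpolates:
//   vfloat blended = vintpf(w, hi, lo);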

static INLINE vfloat vdup(vfloat a){
    // returns { a[0],a[0],a[1],a[1] }
    return _mm_unpacklo_ps( a, a );
}

static INLINE vfloat vaddc2vfu(float &a)
{
    // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
    vfloat a1 = _mm_loadu_ps( &a );
    vfloat a2 = _mm_loadu_ps( (&a) + 4 );
    return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )) + _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 3,1,3,1 ));
}
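
// Example (illustrative): pairwise horizontal reduction of an 8-element row,
// e.g. when downscaling a buffer by a factor of two in one dimension:
//   float row[8] = {1, 2, 3, 4, 5, 6, 7, 8};
//   vfloat sums = vaddc2vfu(row[0]);   // {3, 7, 11, 15}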

static INLINE vfloat vadivapb (vfloat a, vfloat b) {
    return a / (a+b);
}

static INLINE void vconvertrgbrgbrgbrgb2rrrrggggbbbb (const float * src, vfloat &rv, vfloat &gv, vfloat &bv) { // cool function name, isn't it ? :P
    // converts a sequence of 4 float RGB triplets to 3 red, green and blue quadruples
    rv = _mm_setr_ps(src[0],src[3],src[6],src[9]);
    gv = _mm_setr_ps(src[1],src[4],src[7],src[10]);
    bv = _mm_setr_ps(src[2],src[5],src[8],src[11]);
}
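
// Example (illustrative): deinterleave 4 packed RGB pixels so each channel can
// be processed with one SIMD operation (src must point to at least 12 floats):
//   vfloat r, g, b;
//   vconvertrgbrgbrgbrgb2rrrrggggbbbb(src, r, g, b);
//   r = vmul2f(r);   // then process each channel vector as a whole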

#endif // __SSE2__
#endif // SLEEFSSEAVX