////////////////////////////////////////////////////////////////
//
//  This code was taken from http://shibatch.sourceforge.net/
//  Many thanks to the author of the original version: Naoki Shibata
//
//  This version contains modifications made by Ingo Weyrich
//
////////////////////////////////////////////////////////////////


#ifndef SLEEFSSEAVX
#define SLEEFSSEAVX

#include <assert.h>
#include "rt_math.h"
#ifdef __SSE2__
#include "helpersse2.h"

#ifdef ENABLE_AVX
#include "helperavx.h"
#endif

#ifdef __GNUC__
#define INLINE __inline
#else
#define INLINE inline
#endif

#define PI4_A .7853981554508209228515625
#define PI4_B .794662735614792836713604629039764404296875e-8
#define PI4_C .306161699786838294306516483068750264552437361480769e-16
#define M_4_PI 1.273239544735162542821171882678754627704620361328125

#define L2U .69314718055966295651160180568695068359375
#define L2L .28235290563031577122588448175013436025525412068e-12
#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931

#define PI4_Af 0.78515625f
#define PI4_Bf 0.00024127960205078125f
#define PI4_Cf 6.3329935073852539062e-07f
#define PI4_Df 4.9604681473525147339e-10f

#define L2Uf 0.693145751953125f
#define L2Lf 1.428606765330187045e-06f
#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f

#define INFINITYf ((float)rtengine::RT_INFINITY)
#define NANf ((float)rtengine::RT_NAN)

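// Note on the constants above: PI4_A + PI4_B + PI4_C is a split (Cody-Waite style)
// representation of pi/4, and L2U + L2L together approximate ln(2) beyond double
// precision. The trig/exp routines below subtract q times each part separately, e.g.
//     d = d - q*(PI4_A*4); d = d - q*(PI4_B*4); d = d - q*(PI4_C*4);   // ~ d - q*pi
// so argument reduction stays accurate for moderately large q. PI4_Af..PI4_Df and
// L2Uf/L2Lf play the same role for the single-precision routines.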
static INLINE vdouble vadd3(vdouble v0, vdouble v1, vdouble v2) {
    return vadd(vadd(v0, v1), v2);
}

static INLINE vdouble vadd4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
    return vadd3(vadd(v0, v1), v2, v3);
}

static INLINE vdouble vadd5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
    return vadd4(vadd(v0, v1), v2, v3, v4);
}

static INLINE vdouble vadd6(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
    return vadd5(vadd(v0, v1), v2, v3, v4, v5);
}

static INLINE vdouble vadd7(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) {
    return vadd6(vadd(v0, v1), v2, v3, v4, v5, v6);
}

static INLINE vdouble vsub3(vdouble v0, vdouble v1, vdouble v2) {
    return vsub(vsub(v0, v1), v2);
}

static INLINE vdouble vsub4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
    return vsub3(vsub(v0, v1), v2, v3);
}

static INLINE vdouble vsub5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
    return vsub4(vsub(v0, v1), v2, v3, v4);
}

//

static INLINE vdouble2 normalize_d(vdouble2 t) {
85     vdouble2 s;
86 
87     s.x = vadd(t.x, t.y);
88     s.y = vadd(vsub(t.x, s.x), t.y);
89 
90     return s;
91 }
92 
static INLINE vdouble2 scale_d(vdouble2 d, vdouble s) {
94     vdouble2 r = {vmul(d.x, s), vmul(d.y, s)};
95     return r;
96 }
97 
static INLINE vdouble2 add_ss(vdouble x, vdouble y) {
99     vdouble2 r;
100 
101     r.x = vadd(x, y);
102     r.y = vadd(vsub(x, r.x), y);
103 
104     return r;
105 }
106 
static INLINE vdouble2 add2_ss(vdouble x, vdouble y) {
108     vdouble2 r;
109 
110     r.x = vadd(x, y);
111     vdouble v = vsub(r.x, x);
112     r.y = vadd(vsub(x, vsub(r.x, v)), vsub(y, v));
113 
114     return r;
115 }
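// add_ss()/add2_ss() are error-free additions on packed doubles. A scalar sketch of
// the general two-sum computed by add2_ss():
//     s = a + b;                        // rounded sum
//     t = s - a;                        // portion of b absorbed into s
//     e = (a - (s - t)) + (b - t);      // exact rounding error, so s + e == a + b
// add_ss() is the cheaper "fast two-sum", valid only when |a| >= |b|;
// normalize_d() above re-establishes that ordering for an existing pair.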
116 
static INLINE vdouble2 add_ds(vdouble2 x, vdouble y) {
118     vdouble2 r;
119 
120     r.x = vadd(x.x, y);
121     r.y = vadd3(vsub(x.x, r.x), y, x.y);
122 
123     return r;
124 }
125 
static INLINE vdouble2 add2_ds(vdouble2 x, vdouble y) {
127     vdouble2 r;
128 
129     r.x = vadd(x.x, y);
130     vdouble v = vsub(r.x, x.x);
131     r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y, v));
132     r.y = vadd(r.y, x.y);
133 
134     return r;
135 }
136 
static INLINE vdouble2 add_sd(vdouble x, vdouble2 y) {
138     vdouble2 r;
139 
140     r.x = vadd(x, y.x);
141     r.y = vadd3(vsub(x, r.x), y.x, y.y);
142 
143     return r;
144 }
145 
static INLINE vdouble2 add_dd(vdouble2 x, vdouble2 y) {
147     // |x| >= |y|
148 
149     vdouble2 r;
150 
151     r.x = vadd(x.x, y.x);
152     r.y = vadd4(vsub(x.x, r.x), y.x, x.y, y.y);
153 
154     return r;
155 }
156 
static INLINE vdouble2 add2_dd(vdouble2 x, vdouble2 y) {
158     vdouble2 r;
159 
160     r.x  = vadd(x.x, y.x);
161     vdouble v = vsub(r.x, x.x);
162     r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y.x, v));
163     r.y = vadd(r.y, vadd(x.y, y.y));
164 
165     return r;
166 }
167 
static INLINE vdouble2 div_dd(vdouble2 n, vdouble2 d) {
169     vdouble t = vrec(d.x);
170     vdouble dh  = vupper(d.x), dl  = vsub(d.x,  dh);
171     vdouble th  = vupper(t  ), tl  = vsub(t  ,  th);
172     vdouble nhh = vupper(n.x), nhl = vsub(n.x, nhh);
173 
174     vdouble2 q;
175 
176     q.x = vmul(n.x, t);
177 
178     vdouble u = vadd5(vsub(vmul(nhh, th), q.x), vmul(nhh, tl), vmul(nhl, th), vmul(nhl, tl),
179             vmul(q.x, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl))));
180 
181     q.y = vadd(vmul(t, vsub(n.y, vmul(q.x, d.y))), u);
182 
183     return q;
184 }
185 
static INLINE vdouble2 mul_ss(vdouble x, vdouble y) {
187     vdouble xh = vupper(x), xl = vsub(x, xh);
188     vdouble yh = vupper(y), yl = vsub(y, yh);
189     vdouble2 r;
190 
191     r.x = vmul(x, y);
192     r.y = vadd5(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl));
193 
194     return r;
195 }
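// mul_ss() is a Dekker-style exact product: vupper() clears the low mantissa bits,
// so the partial products xh*yh, xh*yl, xl*yh and xl*yl are exact; r.x is the
// rounded product and r.y collects the rounding error, giving x*y == r.x + r.y.
// mul_ds()/mul_dd() below extend the same scheme to double-double operands.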
196 
static INLINE vdouble2 mul_ds(vdouble2 x, vdouble y) {
198     vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
199     vdouble yh = vupper(y  ), yl = vsub(y  , yh);
200     vdouble2 r;
201 
202     r.x = vmul(x.x, y);
203     r.y = vadd6(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.y, y));
204 
205     return r;
206 }
207 
static INLINE vdouble2 mul_dd(vdouble2 x, vdouble2 y) {
209     vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
210     vdouble yh = vupper(y.x), yl = vsub(y.x, yh);
211     vdouble2 r;
212 
213     r.x = vmul(x.x, y.x);
214     r.y = vadd7(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.x, y.y), vmul(x.y, y.x));
215 
216     return r;
217 }
218 
static INLINE vdouble2 squ_d(vdouble2 x) {
220     vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
221     vdouble2 r;
222 
223     r.x = vmul(x.x, x.x);
224     r.y = vadd5(vmul(xh, xh), vneg(r.x), vmul(vadd(xh, xh), xl), vmul(xl, xl), vmul(x.x, vadd(x.y, x.y)));
225 
226     return r;
227 }
228 
static INLINE vdouble2 rec_s(vdouble d) {
230     vdouble t = vrec(d);
231     vdouble dh = vupper(d), dl = vsub(d, dh);
232     vdouble th = vupper(t), tl = vsub(t, th);
233     vdouble2 q;
234 
235     q.x = t;
236     q.y = vmul(t, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl)));
237 
238     return q;
239 }
240 
static INLINE vdouble2 sqrt_d(vdouble2 d) {
242     vdouble t = vsqrt(vadd(d.x, d.y));
243     return scale_d(mul_dd(add2_dd(d, mul_ss(t, t)), rec_s(t)), vcast_vd_d(0.5));
244 }
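// sqrt_d() seeds with a hardware vsqrt() of the rounded sum and then refines it in
// double-double arithmetic, roughly 0.5 * (d + t*t) / t, i.e. one Heron/Newton step
// carried out with the error-free helpers above.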
245 
246 //
247 
static INLINE vdouble xldexp(vdouble x, vint q) { return vldexp(x, q); }
249 
static INLINE vint xilogb(vdouble d) {
251     vdouble e = vcast_vd_vi(vsubi(vilogbp1(vabs(d)), vcast_vi_i(1)));
252     e = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-2147483648.0), e);
253     e = vsel(vmask_eq(vabs(d), vcast_vd_d(rtengine::RT_INFINITY)), vcast_vd_d(2147483647), e);
254     return vrint_vi_vd(e);
255 }
256 
static INLINE vdouble xsin(vdouble d) {
258     vint q;
259     vdouble u, s;
260 
261     q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_1_PI)));
262 
263     u = vcast_vd_vi(q);
264     d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*4)));
265     d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*4)));
266     d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*4)));
267 
268     s = vmul(d, d);
269 
270     d = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vneg(d), d);
271 
272     u = vcast_vd_d(-7.97255955009037868891952e-18);
273     u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
274     u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
275     u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
276     u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
277     u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
278     u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
279     u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
280     u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));
281 
282     u = vmla(s, vmul(u, d), d);
283 
284     return u;
285 }
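// Structure of xsin() (and of xcos()/xtan() below), as a scalar sketch:
//     q = round(d / pi);              // nearest multiple of pi
//     d = d - q*pi;                   // done in three exact pieces (PI4_A/B/C * 4)
//     if (q is odd) d = -d;           // sin(x + k*pi) == (-1)^k * sin(x)
//     s = d*d;  return d + d*s*P(s);  // odd minimax polynomial via the vmla() chain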
286 
static INLINE vdouble xcos(vdouble d) {
288     vint q;
289     vdouble u, s;
290 
291     q = vrint_vi_vd(vsub(vmul(d, vcast_vd_d(rtengine::RT_1_PI)), vcast_vd_d(0.5)));
292     q = vaddi(vaddi(q, q), vcast_vi_i(1));
293 
294     u = vcast_vd_vi(q);
295     d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2)));
296     d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*2)));
297     d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*2)));
298 
299     s = vmul(d, d);
300 
301     d = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(0)), vneg(d), d);
302 
303     u = vcast_vd_d(-7.97255955009037868891952e-18);
304     u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
305     u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
306     u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
307     u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
308     u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
309     u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
310     u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
311     u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));
312 
313     u = vmla(s, vmul(u, d), d);
314 
315     return u;
316 }
317 
static INLINE vdouble2 xsincos(vdouble d) {
319     vint q;
320     vmask m;
321     vdouble u, s, t, rx, ry;
322     vdouble2 r;
323 
324     q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_2_PI)));
325 
326     s = d;
327 
328     u = vcast_vd_vi(q);
329     s = vmla(u, vcast_vd_d(-PI4_A*2), s);
330     s = vmla(u, vcast_vd_d(-PI4_B*2), s);
331     s = vmla(u, vcast_vd_d(-PI4_C*2), s);
332 
333     t = s;
334 
335     s = vmul(s, s);
336 
337     u = vcast_vd_d(1.58938307283228937328511e-10);
338     u = vmla(u, s, vcast_vd_d(-2.50506943502539773349318e-08));
339     u = vmla(u, s, vcast_vd_d(2.75573131776846360512547e-06));
340     u = vmla(u, s, vcast_vd_d(-0.000198412698278911770864914));
341     u = vmla(u, s, vcast_vd_d(0.0083333333333191845961746));
342     u = vmla(u, s, vcast_vd_d(-0.166666666666666130709393));
343     u = vmul(vmul(u, s), t);
344 
345     rx = vadd(t, u);
346 
347     u = vcast_vd_d(-1.13615350239097429531523e-11);
348     u = vmla(u, s, vcast_vd_d(2.08757471207040055479366e-09));
349     u = vmla(u, s, vcast_vd_d(-2.75573144028847567498567e-07));
350     u = vmla(u, s, vcast_vd_d(2.48015872890001867311915e-05));
351     u = vmla(u, s, vcast_vd_d(-0.00138888888888714019282329));
352     u = vmla(u, s, vcast_vd_d(0.0416666666666665519592062));
353     u = vmla(u, s, vcast_vd_d(-0.5));
354 
355     ry = vadd(vcast_vd_d(1), vmul(s, u));
356 
357     m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(0));
358     r.x = vsel(m, rx, ry);
359     r.y = vsel(m, ry, rx);
360 
361     m = vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2));
362     r.x = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x)));
363 
364     m = vmaski_eq(vandi(vaddi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2));
365     r.y = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y)));
366 
367     m = vmask_isinf(d);
368     r.x = vsel(m, vcast_vd_d(rtengine::RT_NAN), r.x);
369     r.y = vsel(m, vcast_vd_d(rtengine::RT_NAN), r.y);
370 
371     return r;
372 }
373 
static INLINE vdouble xtan(vdouble d) {
375     vint q;
376     vdouble u, s, x;
377     vmask m;
378 
379     q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_2_PI)));
380 
381     u = vcast_vd_vi(q);
382     x = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2)));
383     x = vadd(x, vmul(u, vcast_vd_d(-PI4_B*2)));
384     x = vadd(x, vmul(u, vcast_vd_d(-PI4_C*2)));
385 
386     s = vmul(x, x);
387 
388     m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1));
389     x = vsel(m, vneg(x), x);
390 
391     u = vcast_vd_d(1.01419718511083373224408e-05);
392     u = vmla(u, s, vcast_vd_d(-2.59519791585924697698614e-05));
393     u = vmla(u, s, vcast_vd_d(5.23388081915899855325186e-05));
394     u = vmla(u, s, vcast_vd_d(-3.05033014433946488225616e-05));
395     u = vmla(u, s, vcast_vd_d(7.14707504084242744267497e-05));
396     u = vmla(u, s, vcast_vd_d(8.09674518280159187045078e-05));
397     u = vmla(u, s, vcast_vd_d(0.000244884931879331847054404));
398     u = vmla(u, s, vcast_vd_d(0.000588505168743587154904506));
399     u = vmla(u, s, vcast_vd_d(0.00145612788922812427978848));
400     u = vmla(u, s, vcast_vd_d(0.00359208743836906619142924));
401     u = vmla(u, s, vcast_vd_d(0.00886323944362401618113356));
402     u = vmla(u, s, vcast_vd_d(0.0218694882853846389592078));
403     u = vmla(u, s, vcast_vd_d(0.0539682539781298417636002));
404     u = vmla(u, s, vcast_vd_d(0.133333333333125941821962));
405     u = vmla(u, s, vcast_vd_d(0.333333333333334980164153));
406 
407     u = vmla(s, vmul(u, x), x);
408 
409     u = vsel(m, vrec(u), u);
410 
411     u = vsel(vmask_isinf(d), vcast_vd_d(rtengine::RT_NAN), u);
412 
413     return u;
414 }
415 
static INLINE vdouble atan2k(vdouble y, vdouble x) {
417     vdouble s, t, u;
418     vint q;
419     vmask p;
420 
421     q = vseli_lt(x, vcast_vd_d(0), vcast_vi_i(-2), vcast_vi_i(0));
422     x = vabs(x);
423 
424     q = vseli_lt(x, y, vaddi(q, vcast_vi_i(1)), q);
425     p = vmask_lt(x, y);
426     s = vsel (p, vneg(x), y);
427     t = vmax (x, y);
428 
429     s = vdiv(s, t);
430     t = vmul(s, s);
431 
432     u = vcast_vd_d(-1.88796008463073496563746e-05);
433     u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
434     u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
435     u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
436     u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
437     u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
438     u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
439     u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
440     u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
441     u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
442     u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
443     u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
444     u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
445     u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
446     u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
447     u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
448     u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
449     u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
450     u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));
451 
452     t = vadd(s, vmul(s, vmul(t, u)));
453     t = vadd(t, vmul(vcast_vd_vi(q), vcast_vd_d(rtengine::RT_PI/2)));
454 
455     return t;
456 }
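// atan2k() folds the inputs into a single octant: q records the quadrant corrections
// (added back as multiples of pi/2 at the end), s/t is the smaller of |x|,|y| over
// the larger, and the vmla() chain is a minimax polynomial for atan on [0, 1].
// xatan2() below only patches up infinities, zeros and NaNs around this core.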
457 
static INLINE vdouble xatan2(vdouble y, vdouble x) {
459     vdouble r = atan2k(vabs(y), x);
460 
461     r = vmulsign(r, x);
462     r = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))), vsub(vcast_vd_d(rtengine::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(rtengine::RT_PI/2), x))), r);
463     r = vsel(vmask_isinf(y), vsub(vcast_vd_d(rtengine::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(rtengine::RT_PI/4), x))), r);
464     r = vsel(vmask_eq(y, vcast_vd_d(0)), vsel(vmask_eq(vsign(x), vcast_vd_d(-1.0)), vcast_vd_d(rtengine::RT_PI), vcast_vd_d(0)), r);
465 
466     return vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), vmulsign(r, y));
467 }
468 
static INLINE vdouble xasin(vdouble d) {
470     vdouble x, y;
471     x = vadd(vcast_vd_d(1), d);
472     y = vsub(vcast_vd_d(1), d);
473     x = vmul(x, y);
474     x = vsqrt(x);
475     x = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), atan2k(vabs(d), x));
476     return vmulsign(x, d);
477 }
478 
static INLINE vdouble xacos(vdouble d) {
480     vdouble x, y;
481     x = vadd(vcast_vd_d(1), d);
482     y = vsub(vcast_vd_d(1), d);
483     x = vmul(x, y);
484     x = vsqrt(x);
485     x = vmulsign(atan2k(x, vabs(d)), d);
486     y = (vdouble)vandm(vmask_lt(d, vcast_vd_d(0)), (vmask)vcast_vd_d(rtengine::RT_PI));
487     x = vadd(x, y);
488     return x;
489 }
490 
static INLINE vdouble xatan(vdouble s) {
492     vdouble t, u;
493     vint q;
494 
495     q = vseli_lt(s, vcast_vd_d(0), vcast_vi_i(2), vcast_vi_i(0));
496     s = vabs(s);
497 
498     q = vseli_lt(vcast_vd_d(1), s, vaddi(q, vcast_vi_i(1)), q);
499     s = vsel(vmask_lt(vcast_vd_d(1), s), vdiv(vcast_vd_d(1), s), s);
500 
501     t = vmul(s, s);
502 
503     u = vcast_vd_d(-1.88796008463073496563746e-05);
504     u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
505     u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
506     u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
507     u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
508     u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
509     u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
510     u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
511     u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
512     u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
513     u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
514     u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
515     u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
516     u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
517     u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
518     u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
519     u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
520     u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
521     u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));
522 
523     t = vadd(s, vmul(s, vmul(t, u)));
524 
525     t = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vsub(vcast_vd_d(rtengine::RT_PI/2), t), t);
526     t = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2)), vneg(t), t);
527 
528     return t;
529 }
530 
static INLINE vdouble xlog(vdouble d) {
532     vdouble x, x2;
533     vdouble t, m;
534     vint e;
535 
536     e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
537     m = vldexp(d, vsubi(vcast_vi_i(0), e));
538 
539     x = vdiv(vadd(vcast_vd_d(-1), m), vadd(vcast_vd_d(1), m));
540     x2 = vmul(x, x);
541 
542     t = vcast_vd_d(0.148197055177935105296783);
543     t = vmla(t, x2, vcast_vd_d(0.153108178020442575739679));
544     t = vmla(t, x2, vcast_vd_d(0.181837339521549679055568));
545     t = vmla(t, x2, vcast_vd_d(0.22222194152736701733275));
546     t = vmla(t, x2, vcast_vd_d(0.285714288030134544449368));
547     t = vmla(t, x2, vcast_vd_d(0.399999999989941956712869));
548     t = vmla(t, x2, vcast_vd_d(0.666666666666685503450651));
549     t = vmla(t, x2, vcast_vd_d(2));
550 
551     x = vadd(vmul(x, t), vmul(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e)));
552 
553     x = vsel(vmask_ispinf(d), vcast_vd_d(rtengine::RT_INFINITY), x);
554     x = vsel(vmask_gt(vcast_vd_d(0), d), vcast_vd_d(rtengine::RT_NAN), x);
555     x = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-rtengine::RT_INFINITY), x);
556 
557     return x;
558 }
559 
static INLINE vdouble xexp(vdouble d) {
561     vint q = vrint_vi_vd(vmul(d, vcast_vd_d(R_LN2)));
562     vdouble s, u;
563 
564     s = vadd(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
565     s = vadd(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
566 
567     u = vcast_vd_d(2.08860621107283687536341e-09);
568     u = vmla(u, s, vcast_vd_d(2.51112930892876518610661e-08));
569     u = vmla(u, s, vcast_vd_d(2.75573911234900471893338e-07));
570     u = vmla(u, s, vcast_vd_d(2.75572362911928827629423e-06));
571     u = vmla(u, s, vcast_vd_d(2.4801587159235472998791e-05));
572     u = vmla(u, s, vcast_vd_d(0.000198412698960509205564975));
573     u = vmla(u, s, vcast_vd_d(0.00138888888889774492207962));
574     u = vmla(u, s, vcast_vd_d(0.00833333333331652721664984));
575     u = vmla(u, s, vcast_vd_d(0.0416666666666665047591422));
576     u = vmla(u, s, vcast_vd_d(0.166666666666666851703837));
577     u = vmla(u, s, vcast_vd_d(0.5));
578 
579     u = vadd(vcast_vd_d(1), vadd(s, vmul(vmul(s, s), u)));
580 
581     u = vldexp(u, q);
582 
583     u = vsel(vmask_isminf(d), vcast_vd_d(0), u);
584 
585     return u;
586 }
587 
static INLINE vdouble2 logk(vdouble d) {
589     vdouble2 x, x2;
590     vdouble t, m;
591     vint e;
592 
593     e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
594     m = vldexp(d, vsubi(vcast_vi_i(0), e));
595 
596     x = div_dd(add2_ss(vcast_vd_d(-1), m), add2_ss(vcast_vd_d(1), m));
597     x2 = squ_d(x);
598     x2 = normalize_d(x2);
599 
600     t = vcast_vd_d(0.134601987501262130076155);
601     t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
602     t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
603     t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
604     t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
605     t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
606     t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
607     t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));
608 
609     return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
610                 vcast_vd_vi(e)),
611             add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
612 }
613 
static INLINE vdouble expk(vdouble2 d) {
615     vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
616     vint q = vrint_vi_vd(u);
617     vdouble2 s, t;
618 
619     s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
620     s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
621 
622     q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));
623 
624     s = normalize_d(s);
625 
626     u = vcast_vd_d(2.51069683420950419527139e-08);
627     u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
628     u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
629     u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
630     u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
631     u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
632     u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
633     u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
634     u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
635     u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));
636 
637     t = add_dd(s, mul_ds(squ_d(s), u));
638 
639     t = add_sd(vcast_vd_d(1), t);
640     u = vadd(t.x, t.y);
641     u = vldexp(u, q);
642 
643     return u;
644 }
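// logk()/expk() are extended-precision versions of xlog()/xexp(): intermediate
// results are kept as double-double (vdouble2) so that xpow() below can be computed
// as expk(y * logk(|x|)) without losing accuracy for large exponents. Clamping q to
// +/-2047.49 keeps the exponent passed to vldexp() within range.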
645 
static INLINE vdouble xpow(vdouble x, vdouble y) {
647 #if 1
648     vmask yisint = vmask_eq(vcast_vd_vi(vrint_vi_vd(y)), y);
649     vmask yisodd = vandm(vmaski_eq(vandi(vrint_vi_vd(y), vcast_vi_i(1)), vcast_vi_i(1)), yisint);
650 
651     vdouble result = expk(mul_ds(logk(vabs(x)), y));
652 
653     //result = vsel(vmask_isnan(result), vcast_vd_d(rtengine::RT_INFINITY), result);
654 
655     result = vmul(result,
656             vsel(vmask_gt(x, vcast_vd_d(0)),
657                 vcast_vd_d(1),
658                 vsel(yisint,
659                     vsel(yisodd,
660                         vcast_vd_d(-1),
661                         vcast_vd_d(1)),
662                     vcast_vd_d(rtengine::RT_NAN))));
663 
664     vdouble efx = vreinterpret_vd_vm(vxorm(vreinterpret_vm_vd(vsub(vabs(x), vcast_vd_d(1))), vsignbit(y)));
665 
666     result = vsel(vmask_isinf(y),
667             vsel(vmask_lt(efx, vcast_vd_d(0)),
668                 vcast_vd_d(0),
669                 vsel(vmask_eq(efx, vcast_vd_d(0)),
670                     vcast_vd_d(1.0),
671                     vcast_vd_d(rtengine::RT_INFINITY))),
672             result);
673 
674     result = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))),
675             vmul(vsel(yisodd, vsign(x), vcast_vd_d(1)),
676                 vsel(vmask_lt(vsel(vmask_eq(x, vcast_vd_d(0)), vneg(y), y), vcast_vd_d(0)),
677                     vcast_vd_d(0),
678                     vcast_vd_d(rtengine::RT_INFINITY))),
679             result);
680 
681     result = vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), result);
682 
683     result = vsel(vorm(vmask_eq(y, vcast_vd_d(0)), vmask_eq(x, vcast_vd_d(1))), vcast_vd_d(1), result);
684 
685     return result;
686 #else
687     return expk(mul_ds(logk(x), y));
688 #endif
689 }
690 
static INLINE vdouble2 expk2(vdouble2 d) {
692     vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
693     vint q = vrint_vi_vd(u);
694     vdouble2 s, t;
695 
696     s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
697     s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
698 
699     q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));
700 
701     s = normalize_d(s);
702 
703     u = vcast_vd_d(2.51069683420950419527139e-08);
704     u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
705     u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
706     u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
707     u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
708     u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
709     u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
710     u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
711     u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
712     u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));
713 
714     t = add_dd(s, mul_ds(squ_d(s), u));
715 
716     t = add_sd(vcast_vd_d(1), t);
717 
718     return dd(vldexp(t.x, q), vldexp(t.y, q));
719 }
720 
static INLINE vdouble xsinh(vdouble x) {
722     vdouble y = vabs(x);
723     vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
724     d = add2_dd(d, div_dd(dd(vcast_vd_d(-1), vcast_vd_d(0)), d));
725     y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));
726 
727     y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
728     y = vmulsign(y, x);
729     y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
730 
731     return y;
732 }
733 
static INLINE vdouble xcosh(vdouble x) {
735     vdouble2 d = expk2(dd(x, vcast_vd_d(0)));
736     d = add2_dd(d, div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d));
737     vdouble y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));
738 
739     y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
740     y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
741 
742     return y;
743 }
744 
static INLINE vdouble xtanh(vdouble x) {
746     vdouble y = vabs(x);
747     vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
748     vdouble2 e = div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d);
749     d = div_dd(add2_dd(d, scale_d(e, vcast_vd_d(-1))), add2_dd(d, e));
750     y = d.x + d.y;
751 
752     y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(1.0), y);
753     y = vmulsign(y, x);
754     y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
755 
756     return y;
757 }
758 
static INLINE vdouble2 logk2(vdouble2 d) {
760     vdouble2 x, x2, m;
761     vdouble t;
762     vint e;
763 
764     d = normalize_d(d);
765     e = vilogbp1(vmul(d.x, vcast_vd_d(0.7071)));
766     m = scale_d(d, vldexp(vcast_vd_d(1), vsubi(vcast_vi_i(0), e)));
767 
768     x = div_dd(add2_ds(m, vcast_vd_d(-1)), add2_ds(m, vcast_vd_d(1)));
769     x2 = squ_d(x);
770     x2 = normalize_d(x2);
771 
772     t = vcast_vd_d(0.134601987501262130076155);
773     t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
774     t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
775     t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
776     t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
777     t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
778     t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
779     t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));
780 
781     return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
782                 vcast_vd_vi(e)),
783             add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
784 }
785 
static INLINE vdouble xasinh(vdouble x) {
787     vdouble y = vabs(x);
788     vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(y, y),  vcast_vd_d(1))), y));
789     y = vadd(d.x, d.y);
790 
791     y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
792     y = vmulsign(y, x);
793     y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
794 
795     return y;
796 }
797 
static INLINE vdouble xacosh(vdouble x) {
799     vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(x, x), vcast_vd_d(-1))), x));
800     vdouble y = vadd(d.x, d.y);
801 
802     y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
803     y = vsel(vmask_eq(x, vcast_vd_d(1.0)), vcast_vd_d(0.0), y);
804     y = vsel(vmask_lt(x, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_NAN), y);
805     y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
806 
807     return y;
808 }
809 
static INLINE vdouble xatanh(vdouble x) {
811     vdouble y = vabs(x);
812     vdouble2 d = logk2(div_dd(add2_ss(vcast_vd_d(1), y), add2_ss(vcast_vd_d(1), -y)));
813     y = vsel(vmask_gt(y, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_NAN), vsel(vmask_eq(y, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_INFINITY), vmul(vadd(d.x, d.y), vcast_vd_d(0.5))));
814 
815     y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), y);
816     y = vmulsign(y, x);
817     y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
818 
819     return y;
820 }
821 
static INLINE vdouble xcbrt(vdouble d) {
823     vdouble x, y, q = vcast_vd_d(1.0);
824     vint e, qu, re;
825     vdouble t;
826 
827     e = vilogbp1(vabs(d));
828     d = vldexp(d, vsubi(vcast_vi_i(0), e));
829 
830     t = vadd(vcast_vd_vi(e), vcast_vd_d(6144));
831     qu = vtruncate_vi_vd(vdiv(t, vcast_vd_d(3)));
832     re = vtruncate_vi_vd(vsub(t, vmul(vcast_vd_vi(qu), vcast_vd_d(3))));
833 
834     q = vsel(vmaski_eq(re, vcast_vi_i(1)), vcast_vd_d(1.2599210498948731647672106), q);
835     q = vsel(vmaski_eq(re, vcast_vi_i(2)), vcast_vd_d(1.5874010519681994747517056), q);
836     q = vldexp(q, vsubi(qu, vcast_vi_i(2048)));
837 
838     q = vmulsign(q, d);
839 
840     d = vabs(d);
841 
842     x = vcast_vd_d(-0.640245898480692909870982);
843     x = vmla(x, d, vcast_vd_d(2.96155103020039511818595));
844     x = vmla(x, d, vcast_vd_d(-5.73353060922947843636166));
845     x = vmla(x, d, vcast_vd_d(6.03990368989458747961407));
846     x = vmla(x, d, vcast_vd_d(-3.85841935510444988821632));
847     x = vmla(x, d, vcast_vd_d(2.2307275302496609725722));
848 
849     y = vmul(x, x); y = vmul(y, y); x = vsub(x, vmul(vmla(d, y, vneg(x)), vcast_vd_d(1.0 / 3.0)));
850     y = vmul(vmul(d, x), x);
851     y = vmul(vsub(y, vmul(vmul(vcast_vd_d(2.0 / 3.0), y), vmla(y, x, vcast_vd_d(-1.0)))), q);
852 
853     return y;
854 }
855 
static INLINE vdouble xexp2(vdouble a) {
857     vdouble u = expk(mul_ds(dd(vcast_vd_d(0.69314718055994528623), vcast_vd_d(2.3190468138462995584e-17)), a));
858     u = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), u);
859     u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
860     return u;
861 }
862 
static INLINE vdouble xexp10(vdouble a) {
864     vdouble u = expk(mul_ds(dd(vcast_vd_d(2.3025850929940459011), vcast_vd_d(-2.1707562233822493508e-16)), a));
865     u = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), u);
866     u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
867     return u;
868 }
869 
static INLINE vdouble xexpm1(vdouble a) {
871     vdouble2 d = add2_ds(expk2(dd(a, vcast_vd_d(0))), vcast_vd_d(-1.0));
872     vdouble x = d.x + d.y;
873     x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
874     x = vsel(vmask_isminf(a), vcast_vd_d(-1), x);
875     return x;
876 }
877 
static INLINE vdouble xlog10(vdouble a) {
879     vdouble2 d = mul_dd(logk(a), dd(vcast_vd_d(0.43429448190325176116), vcast_vd_d(6.6494347733425473126e-17)));
880     vdouble x = d.x + d.y;
881 
882     x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
883     x = vsel(vmask_gt(vcast_vd_d(0), a), vcast_vd_d(rtengine::RT_NAN), x);
884     x = vsel(vmask_eq(a, vcast_vd_d(0)), vcast_vd_d(-rtengine::RT_INFINITY), x);
885 
886     return x;
887 }
888 
static INLINE vdouble xlog1p(vdouble a) {
890     vdouble2 d = logk2(add2_ss(a, vcast_vd_d(1)));
891     vdouble x = d.x + d.y;
892 
893     x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
894     x = vsel(vmask_gt(vcast_vd_d(-1), a), vcast_vd_d(rtengine::RT_NAN), x);
895     x = vsel(vmask_eq(a, vcast_vd_d(-1)), vcast_vd_d(-rtengine::RT_INFINITY), x);
896 
897     return x;
898 }
899 
900 //
901 
902 typedef struct {
903     vfloat x, y;
904 } vfloat2;
905 
static INLINE vfloat vabsf(vfloat f) { return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f); }
static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vcast_vf_f(-0.0f)); }
908 
909 #ifdef __SSE4_1__
910 // only one instruction when using SSE4.1
static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
912     return _mm_blendv_ps(y,x,(vfloat)mask);
913 }
914 
static INLINE vint vselc(vmask mask, vint x, vint y) {
916     return _mm_blendv_epi8(y,x,mask);
917 }
918 
919 #else
920 // three instructions when using SSE2
static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
922     return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
923 }
924 
static INLINE vint vselc(vmask mask, vint x, vint y) {
926     return vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
927 }
928 #endif
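// vself() is a branchless per-lane select: lanes where the mask is set come from x,
// the rest from y. A typical (purely illustrative) use, clamping negatives to zero:
//     vfloat r = vself(vmaskf_lt(v, vcast_vf_f(0.f)), vcast_vf_f(0.f), v);
// vselc() is the integer counterpart; on SSE4.1 both map onto a single blendv.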
929 
static INLINE vfloat vselfzero(vmask mask, vfloat x) {
931     // returns value of x if corresponding mask bits are 1, else returns 0
932     // faster than vself(mask, x, ZEROV)
933     return _mm_and_ps((vfloat)mask, x);
934 }
static INLINE vfloat vselfnotzero(vmask mask, vfloat x) {
936     // returns value of x if corresponding mask bits are 0, else returns 0
937     // faster than vself(mask, ZEROV, x)
938     return _mm_andnot_ps((vfloat)mask, x);
939 }
940 
static INLINE vint vselizero(vmask mask, vint x) {
942     // returns value of x if corresponding mask bits are 1, else returns 0
943     // faster than vselc(mask, x, ZEROV)
944     return _mm_and_si128(mask, x);
945 }
static INLINE vint vselinotzero(vmask mask, vint x) {
947     // returns value of x if corresponding mask bits are 0, else returns 0
948     // faster than vselc(mask, ZEROV, x)
949     return _mm_andnot_si128(mask, x);
950 }
951 
static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) {
953     vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1));
954     return vori2(vandi2(m2, x), vandnoti2(m2, y));
955 }
956 
static INLINE vmask vsignbitf(vfloat f) {
958     return vandm((vmask)f, (vmask)vcast_vf_f(-0.0f));
959 }
960 
static INLINE vfloat vmulsignf(vfloat x, vfloat y) {
962     return (vfloat)vxorm((vmask)x, vsignbitf(y));
963 }
964 
static INLINE vfloat vsignf(vfloat f) {
966     return (vfloat)vorm((vmask)vcast_vf_f(1.0f), vandm((vmask)vcast_vf_f(-0.0f), (vmask)f));
967 }
968 
static INLINE vmask vmaskf_isinf(vfloat d) { return vmaskf_eq(vabsf(d), vcast_vf_f(INFINITYf)); }
static INLINE vmask vmaskf_ispinf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(INFINITYf)); }
static INLINE vmask vmaskf_isminf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(-INFINITYf)); }
static INLINE vmask vmaskf_isnan(vfloat d) { return vmaskf_neq(d, d); }
// the following is equivalent to vorm(vmaskf_isnan(a), vmaskf_isnan(b)), but faster
static INLINE vmask vmaskf_isnan(vfloat a, vfloat b) { return (vmask)_mm_cmpunord_ps(a, b); }
static INLINE vfloat visinf2f(vfloat d, vfloat m) { return (vfloat)vandm(vmaskf_isinf(d), vorm(vsignbitf(d), (vmask)m)); }
static INLINE vfloat visinff(vfloat d) { return visinf2f(d, vcast_vf_f(1.0f)); }
977 
static INLINE vint2 vilogbp1f(vfloat d) {
979     vmask m = vmaskf_lt(d, vcast_vf_f(5.421010862427522E-20f));
980     d = vself(m, vmulf(vcast_vf_f(1.8446744073709552E19f), d), d);
981     vint2 q = vandi2(vsrli2(vcast_vi2_vm(vreinterpret_vm_vf(d)), 23), vcast_vi2_i(0xff));
982     q = vsubi2(q, vseli2(m, vcast_vi2_i(64 + 0x7e), vcast_vi2_i(0x7e)));
983     return q;
984 }
985 
static INLINE vfloat vldexpf(vfloat x, vint2 q) {
987     vfloat u;
988     vint2 m = vsrai2(q, 31);
989     m = vslli2(vsubi2(vsrai2(vaddi2(m, q), 6), m), 4);
990     q = vsubi2(q, vslli2(m, 2));
991     u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(m, vcast_vi2_i(0x7f)), 23)));
992     x = vmulf(vmulf(vmulf(vmulf(x, u), u), u), u);
993     u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(q, vcast_vi2_i(0x7f)), 23)));
994     return vmulf(x, u);
995 }
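// vldexpf() scales x by 2^q without ever forming 2^q directly (which could overflow
// the exponent field for large |q|): q is split into four equal chunks plus a
// remainder, each chunk is turned into a power of two through the exponent bits,
// and the factors are applied as successive multiplications.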
996 
static INLINE vfloat xsinf(vfloat d) {
998     vint2 q;
999     vfloat u, s;
1000 
1001     q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)rtengine::RT_1_PI)));
1002 
1003     u = vcast_vf_vi2(q);
1004     d = vmlaf(u, vcast_vf_f(-PI4_Af*4), d);
1005     d = vmlaf(u, vcast_vf_f(-PI4_Bf*4), d);
1006     d = vmlaf(u, vcast_vf_f(-PI4_Cf*4), d);
1007     d = vmlaf(u, vcast_vf_f(-PI4_Df*4), d);
1008 
1009     s = vmulf(d, d);
1010 
1011     d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vnegf(d), d);
1012 
1013     u = vcast_vf_f(2.6083159809786593541503e-06f);
1014     u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
1015     u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
1016     u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));
1017 
1018     u = vmlaf(s, vmulf(u, d), d);
1019 
1020     return u;
1021 }
1022 
static INLINE vfloat xcosf(vfloat d) {
1024     vint2 q;
1025     vfloat u, s;
1026 
1027     q = vrint_vi2_vf(vsubf(vmulf(d, vcast_vf_f((float)rtengine::RT_1_PI)), vcast_vf_f(0.5f)));
1028     q = vaddi2(vaddi2(q, q), vcast_vi2_i(1));
1029 
1030     u = vcast_vf_vi2(q);
1031     d = vmlaf(u, vcast_vf_f(-PI4_Af*2), d);
1032     d = vmlaf(u, vcast_vf_f(-PI4_Bf*2), d);
1033     d = vmlaf(u, vcast_vf_f(-PI4_Cf*2), d);
1034     d = vmlaf(u, vcast_vf_f(-PI4_Df*2), d);
1035 
1036     s = vmulf(d, d);
1037 
1038     d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), d, vnegf(d));
1039 
1040     u = vcast_vf_f(2.6083159809786593541503e-06f);
1041     u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
1042     u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
1043     u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));
1044 
1045     u = vmlaf(s, vmulf(u, d), d);
1046 
1047     return u;
1048 }
1049 
static INLINE vfloat2 xsincosf(vfloat d) {
1051     vint2 q;
1052     vmask m;
1053     vfloat u, s, t, rx, ry;
1054     vfloat2 r;
1055 
1056     q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)rtengine::RT_2_PI)));
1057 
1058     s = d;
1059 
1060     u = vcast_vf_vi2(q);
1061     s = vmlaf(u, vcast_vf_f(-PI4_Af*2), s);
1062     s = vmlaf(u, vcast_vf_f(-PI4_Bf*2), s);
1063     s = vmlaf(u, vcast_vf_f(-PI4_Cf*2), s);
1064     s = vmlaf(u, vcast_vf_f(-PI4_Df*2), s);
1065 
1066     t = s;
1067 
1068     s = vmulf(s, s);
1069 
1070     u = vcast_vf_f(-0.000195169282960705459117889f);
1071     u = vmlaf(u, s, vcast_vf_f(0.00833215750753879547119141f));
1072     u = vmlaf(u, s, vcast_vf_f(-0.166666537523269653320312f));
1073     u = vmulf(vmulf(u, s), t);
1074 
1075     rx = vaddf(t, u);
1076 
1077     u = vcast_vf_f(-2.71811842367242206819355e-07f);
1078     u = vmlaf(u, s, vcast_vf_f(2.47990446951007470488548e-05f));
1079     u = vmlaf(u, s, vcast_vf_f(-0.00138888787478208541870117f));
1080     u = vmlaf(u, s, vcast_vf_f(0.0416666641831398010253906f));
1081     u = vmlaf(u, s, vcast_vf_f(-0.5));
1082 
1083     ry = vaddf(vcast_vf_f(1), vmulf(s, u));
1084 
1085     m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
1086     r.x = vself(m, rx, ry);
1087     r.y = vself(m, ry, rx);
1088 
1089     m = vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
1090     r.x = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.x)));
1091 
1092     m = vmaski2_eq(vandi2(vaddi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
1093     r.y = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y)));
1094 
1095     m = vmaskf_isinf(d);
1096     r.x = vself(m, vcast_vf_f(rtengine::RT_NAN), r.x);
1097     r.y = vself(m, vcast_vf_f(rtengine::RT_NAN), r.y);
1098 
1099     return r;
1100 }
1101 
static INLINE vfloat xtanf(vfloat d) {
1103     vint2 q;
1104     vmask m;
1105     vfloat u, s, x;
1106 
1107     q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)(2 * rtengine::RT_1_PI))));
1108 
1109     x = d;
1110 
1111     u = vcast_vf_vi2(q);
1112     x = vmlaf(u, vcast_vf_f(-PI4_Af*2), x);
1113     x = vmlaf(u, vcast_vf_f(-PI4_Bf*2), x);
1114     x = vmlaf(u, vcast_vf_f(-PI4_Cf*2), x);
1115     x = vmlaf(u, vcast_vf_f(-PI4_Df*2), x);
1116 
1117     s = vmulf(x, x);
1118 
1119     m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
1120     x = vself(m, vnegf(x), x);
1121 
1122     u = vcast_vf_f(0.00927245803177356719970703f);
1123     u = vmlaf(u, s, vcast_vf_f(0.00331984995864331722259521f));
1124     u = vmlaf(u, s, vcast_vf_f(0.0242998078465461730957031f));
1125     u = vmlaf(u, s, vcast_vf_f(0.0534495301544666290283203f));
1126     u = vmlaf(u, s, vcast_vf_f(0.133383005857467651367188f));
1127     u = vmlaf(u, s, vcast_vf_f(0.333331853151321411132812f));
1128 
1129     u = vmlaf(s, vmulf(u, x), x);
1130 
1131     u = vself(m, vrecf(u), u);
1132 
1133     u = vself(vmaskf_isinf(d), vcast_vf_f(NANf), u);
1134 
1135     return u;
1136 }
1137 
static INLINE vfloat xatanf(vfloat s) {
1139     vfloat t, u;
1140     vint2 q;
1141 
1142     q = vseli2_lt(s, vcast_vf_f(0.0f), vcast_vi2_i(2), vcast_vi2_i(0));
1143     s = vabsf(s);
1144 
1145     q = vseli2_lt(vcast_vf_f(1.0f), s, vaddi2(q, vcast_vi2_i(1)), q);
1146     s = vself(vmaskf_lt(vcast_vf_f(1.0f), s), vdivf(vcast_vf_f(1.0f), s), s);
1147 
1148     t = vmulf(s, s);
1149 
1150     u = vcast_vf_f(0.00282363896258175373077393f);
1151     u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
1152     u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
1153     u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
1154     u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
1155     u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
1156     u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
1157     u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));
1158 
1159     t = vaddf(s, vmulf(s, vmulf(t, u)));
1160 
1161     t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), t), t);
1162     t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), vnegf(t), t);
1163 
1164     return t;
1165 }
1166 
static INLINE vfloat atan2kf(vfloat y, vfloat x) {
1168     vfloat s, t, u;
1169     vint2 q;
1170     vmask p;
1171 
1172     q = vseli2_lt(x, vcast_vf_f(0.0f), vcast_vi2_i(-2), vcast_vi2_i(0));
1173     x = vabsf(x);
1174 
1175     q = vseli2_lt(x, y, vaddi2(q, vcast_vi2_i(1)), q);
1176     p = vmaskf_lt(x, y);
1177     s = vself(p, vnegf(x), y);
1178     t = vmaxf(x, y);
1179 
1180     s = vdivf(s, t);
1181     t = vmulf(s, s);
1182 
1183     u = vcast_vf_f(0.00282363896258175373077393f);
1184     u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
1185     u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
1186     u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
1187     u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
1188     u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
1189     u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
1190     u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));
1191 
1192     t = vaddf(s, vmulf(s, vmulf(t, u)));
1193     t = vaddf(t, vmulf(vcast_vf_vi2(q), vcast_vf_f((float)(rtengine::RT_PI/2))));
1194 
1195     return t;
1196 }
1197 
static INLINE vfloat xatan2f(vfloat y, vfloat x) {
1199     vfloat r = atan2kf(vabsf(y), x);
1200 
1201     r = vmulsignf(r, x);
1202     r = vself(vorm(vmaskf_isinf(x), vmaskf_eq(x, vcast_vf_f(0.0f))), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(rtengine::RT_PI/2)), x))), r);
1203     r = vself(vmaskf_isinf(y), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(rtengine::RT_PI/4)), x))), r);
1204     r = vself(vmaskf_eq(y, vcast_vf_f(0.0f)), vselfzero(vmaskf_eq(vsignf(x), vcast_vf_f(-1.0f)), vcast_vf_f((float)rtengine::RT_PI)), r);
1205 
1206     return vself(vmaskf_isnan(x, y), vcast_vf_f(NANf), vmulsignf(r, y));
1207 }
1208 
static INLINE vfloat xasinf(vfloat d) {
1210     vfloat x, y;
1211     x = vaddf(vcast_vf_f(1.0f), d);
1212     y = vsubf(vcast_vf_f(1.0f), d);
1213     x = vmulf(x, y);
1214     x = vsqrtf(x);
1215     x = vself(vmaskf_isnan(x), vcast_vf_f(NANf), atan2kf(vabsf(d), x));
1216     return vmulsignf(x, d);
1217 }
1218 
static INLINE vfloat xacosf(vfloat d) {
1220     vfloat x, y;
1221     x = vaddf(vcast_vf_f(1.0f), d);
1222     y = vsubf(vcast_vf_f(1.0f), d);
1223     x = vmulf(x, y);
1224     x = vsqrtf(x);
1225     x = vmulsignf(atan2kf(x, vabsf(d)), d);
1226     y = (vfloat)vandm(vmaskf_lt(d, vcast_vf_f(0.0f)), (vmask)vcast_vf_f((float)rtengine::RT_PI));
1227     x = vaddf(x, y);
1228     return x;
1229 }
1230 
static INLINE vfloat xlogf(vfloat d) {
1232     vfloat x, x2, t, m;
1233     vint2 e;
1234 
1235     e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
1236     m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
1237 
1238     x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
1239     x2 = vmulf(x, x);
1240 
1241     t = vcast_vf_f(0.2371599674224853515625f);
1242     t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
1243     t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
1244     t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
1245     t = vmlaf(t, x2, vcast_vf_f(2.0f));
1246 
1247     x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
1248 
1249     x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x);
1250     x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(NANf), x);
1251     x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(-INFINITYf), x);
1252 
1253     return x;
1254 }
1255 
static INLINE vfloat xlogf1(vfloat d) { // does xlogf(vmaxf(d, 1.f)) but faster
1257     vfloat x, x2, t, m;
1258     vint2 e;
1259 
1260     e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
1261     m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
1262 
1263     x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
1264     x2 = vmulf(x, x);
1265 
1266     t = vcast_vf_f(0.2371599674224853515625f);
1267     t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
1268     t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
1269     t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
1270     t = vmlaf(t, x2, vcast_vf_f(2.0f));
1271 
1272     x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
1273 
1274     x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x);
1275     x = vselfnotzero(vmaskf_le(d, vcast_vf_f(1.f)), x);
1276 
1277     return x;
1278 }
1279 
static INLINE vfloat xlogf0(vfloat d) {
1281     vfloat x, x2, t, m;
1282     vint2 e;
1283 
1284     e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
1285     m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
1286 
1287     x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
1288     x2 = vmulf(x, x);
1289 
1290     t = vcast_vf_f(0.2371599674224853515625f);
1291     t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
1292     t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
1293     t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
1294     t = vmlaf(t, x2, vcast_vf_f(2.0f));
1295 
1296     x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
1297 
1298     x = vself(vmaskf_ispinf(d), vcast_vf_f(0), x);
1299     x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(0), x);
1300     x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(0), x);
1301 
1302     return x;
1303 }
1304 
static INLINE vfloat xlogfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > 0 e.g. when filling a lookup table
1306     vfloat x, x2, t, m;
1307     vint2 e;
1308 
1309     e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
1310     m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
1311 
1312     x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
1313     x2 = vmulf(x, x);
1314 
1315     t = vcast_vf_f(0.2371599674224853515625f);
1316     t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
1317     t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
1318     t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
1319     t = vmlaf(t, x2, vcast_vf_f(2.0f));
1320 
1321     return vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
1322 
1323 }
1324 
static INLINE vfloat xexpf(vfloat d) {
1326     vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
1327     vfloat s, u;
1328 
1329     s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d);
1330     s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);
1331 
1332     u = vcast_vf_f(0.00136324646882712841033936f);
1333     u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
1334     u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
1335     u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
1336     u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
1337 
1338     u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));
1339 
1340     u = vldexpf(u, q);
1341 
1342     // -104.0
1343     return vselfnotzero(vmaskf_gt(vcast_vf_f(-104.f), d), u);
1344 }
1345 
static INLINE vfloat xexpfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > -104.f e.g. when filling a lookup table
1347     vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
1348     vfloat s, u;
1349 
1350     s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d);
1351     s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);
1352 
1353     u = vcast_vf_f(0.00136324646882712841033936f);
1354     u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
1355     u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
1356     u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
1357     u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
1358 
1359     u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));
1360 
1361     return vldexpf(u, q);
1362 }
1363 
static INLINE vfloat xcbrtf(vfloat d) {
1365     vfloat x, y, q = vcast_vf_f(1.0), t;
1366     vint2 e, qu, re;
1367 
1368     e = vilogbp1f(vabsf(d));
1369     d = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
1370 
1371     t = vaddf(vcast_vf_vi2(e), vcast_vf_f(6144));
1372     qu = vtruncate_vi2_vf(vdivf(t, vcast_vf_f(3)));
1373     re = vtruncate_vi2_vf(vsubf(t, vmulf(vcast_vf_vi2(qu), vcast_vf_f(3))));
1374 
1375     q = vself(vmaski2_eq(re, vcast_vi2_i(1)), vcast_vf_f(1.2599210498948731647672106f), q);
1376     q = vself(vmaski2_eq(re, vcast_vi2_i(2)), vcast_vf_f(1.5874010519681994747517056f), q);
1377     q = vldexpf(q, vsubi2(qu, vcast_vi2_i(2048)));
1378 
1379     q = vmulsignf(q, d);
1380     d = vabsf(d);
1381 
1382     x = vcast_vf_f(-0.601564466953277587890625f);
1383     x = vmlaf(x, d, vcast_vf_f(2.8208892345428466796875f));
1384     x = vmlaf(x, d, vcast_vf_f(-5.532182216644287109375f));
1385     x = vmlaf(x, d, vcast_vf_f(5.898262500762939453125f));
1386     x = vmlaf(x, d, vcast_vf_f(-3.8095417022705078125f));
1387     x = vmlaf(x, d, vcast_vf_f(2.2241256237030029296875f));
1388 
1389     y = vmulf(vmulf(d, x), x);
1390     y = vmulf(vsubf(y, vmulf(vmulf(vcast_vf_f(2.0f / 3.0f), y), vmlaf(y, x, vcast_vf_f(-1.0f)))), q);
1391 
1392     return y;
1393 }
1394 
static INLINE vfloat vclampf(vfloat value, vfloat low, vfloat high) {
1396     // clamps value in [low;high], returns low if value is NaN
1397     return vmaxf(vminf(high, value), low);
1398 }
1399 
static INLINE vfloat SQRV(vfloat a){
1401     return a * a;
1402 }
1403 
static inline void vswap( vmask condition, vfloat &a, vfloat &b) {
1405     // conditional swap the elements of two vfloats
1406     vfloat temp = vself(condition, a, b); // the values which fit to condition
1407     a = vself(condition, b, a); // the values which fit to inverted condition
1408     b = temp;
1409 }
1410 
static inline float vhadd( vfloat a ) {
1412     // returns a[0] + a[1] + a[2] + a[3]
1413     a += _mm_movehl_ps(a, a);
1414     return _mm_cvtss_f32(_mm_add_ss(a, _mm_shuffle_ps(a, a, 1)));
1415 }
1416 
static inline float vhmin(vfloat a) {
1418     // returns min(a[0], a[1], a[2], a[3])
1419     a = vminf(a, _mm_movehl_ps(a, a));
1420     return _mm_cvtss_f32(vminf(a, _mm_shuffle_ps(a, a, 1)));
1421 }
1422 
static inline float vhmax(vfloat a) {
1424     // returns max(a[0], a[1], a[2], a[3])
1425     a = vmaxf(a, _mm_movehl_ps(a, a));
1426     return _mm_cvtss_f32(vmaxf(a, _mm_shuffle_ps(a, a, 1)));
1427 }
1428 
static INLINE vfloat vmul2f(vfloat a){
1430     // fastest way to multiply by 2
1431     return a + a;
1432 }
1433 
static INLINE vfloat vintpf(vfloat a, vfloat b, vfloat c) {
1435     // calculate a * b + (1 - a) * c (interpolate two values)
1436     // following is valid:
1437     // vintpf(a, b+x, c+x) = vintpf(a, b, c) + x
1438     // vintpf(a, b*x, c*x) = vintpf(a, b, c) * x
1439     return a * (b-c) + c;
1440 }
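// Purely illustrative use of vintpf(): blending two values with a per-lane weight,
//     vfloat out = vintpf(blend, sharpened, original);
// gives blend*sharpened + (1 - blend)*original per lane (variable names here are
// hypothetical, not taken from calling code).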
1441 
static INLINE vfloat vdup(vfloat a){
1443     // returns { a[0],a[0],a[1],a[1] }
1444     return _mm_unpacklo_ps( a, a );
1445 }
1446 
static INLINE vfloat vaddc2vfu(float &a)
1448 {
1449     // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
1450     vfloat a1 = _mm_loadu_ps( &a );
1451     vfloat a2 = _mm_loadu_ps( (&a) + 4 );
1452     return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )) + _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 3,1,3,1 ));
1453 }
1454 
static INLINE vfloat vadivapb (vfloat a, vfloat b) {
1456     return a / (a+b);
1457 }
1458 
static INLINE void vconvertrgbrgbrgbrgb2rrrrggggbbbb (const float * src, vfloat &rv, vfloat &gv, vfloat &bv) { // cool function name, isn't it ? :P
1460     // converts a sequence of 4 float RGB triplets to 3 red, green and blue quadruples
1461     rv = _mm_setr_ps(src[0],src[3],src[6],src[9]);
1462     gv = _mm_setr_ps(src[1],src[4],src[7],src[10]);
1463     bv = _mm_setr_ps(src[2],src[5],src[8],src[11]);
1464 }
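// Purely illustrative caller (names and buffer layout are hypothetical; assumes at
// least 12 readable floats starting at buf + i):
//     for (int i = 0; i + 11 < n; i += 12) {
//         vfloat r, g, b;
//         vconvertrgbrgbrgbrgb2rrrrggggbbbb(buf + i, r, g, b);
//         // ... process four pixels as planar r/g/b quadruples ...
//     }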
1465 
1466 #if defined( __SSE4_1__ ) && defined( __x86_64__ )
static INLINE vfloat vceilf(vfloat x) {
1468     return _mm_round_ps(x, _MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC);
1469 }
1470 
1471 #else
1472 
static INLINE vfloat vceilf(vfloat x) {
1474     __m128i zerov = _mm_setzero_si128();
1475     zerov = _mm_cmpeq_epi32(zerov, zerov);
1476     const vfloat onev = (vfloat)_mm_slli_epi32(_mm_srli_epi32(zerov, 25), 23); //create vector 1.0f
1477     const vfloat xi = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
1478     return xi + _mm_and_ps(_mm_cmplt_ps(xi, x), onev);
1479 }
1480 #endif
1481 
1482 #endif // __SSE2__
1483 #endif // SLEEFSSEAVX
1484