////////////////////////////////////////////////////////////////
//
//  this code was taken from http://shibatch.sourceforge.net/
//  Many thanks to the author of the original version: Naoki Shibata
//
//   Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file sleef_LICENSE.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)
//
//  This version contains modifications made by Ingo Weyrich
//
////////////////////////////////////////////////////////////////
#pragma once

#include "rt_math.h"
#ifdef __SSE2__
#include "helpersse2.h"

#ifdef ENABLE_AVX
#include "helperavx.h"
#endif

#ifdef __GNUC__
#define INLINE __inline
#else
#define INLINE inline
#endif

#define PI4_A .7853981554508209228515625
#define PI4_B .794662735614792836713604629039764404296875e-8
#define PI4_C .306161699786838294306516483068750264552437361480769e-16
#define M_4_PI 1.273239544735162542821171882678754627704620361328125

#define L2U .69314718055966295651160180568695068359375
#define L2L .28235290563031577122588448175013436025525412068e-12
#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931

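// Note on the constants above: PI4_A + PI4_B + PI4_C ~= pi/4, split so that each part
// carries only the leading bits and the products q * (4 * PI4_x) stay (nearly) exact.
// The trig routines below use them for Cody-Waite style argument reduction, e.g. in xsin():
//     q = rint(d * (1/pi));  d -= q * (4*PI4_A);  d -= q * (4*PI4_B);  d -= q * (4*PI4_C);
// L2U + L2L ~= ln(2) plays the same role for exp(), and the *f constants below are the
// single-precision counterparts (PI4_Df adds one more term because floats carry fewer bits).
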
#define PI4_Af 0.78515625f
#define PI4_Bf 0.00024127960205078125f
#define PI4_Cf 6.3329935073852539062e-07f
#define PI4_Df 4.9604681473525147339e-10f

#define L2Uf 0.693145751953125f
#define L2Lf 1.428606765330187045e-06f
#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f

#define INFINITYf ((float)rtengine::RT_INFINITY)
#define NANf ((float)rtengine::RT_NAN)

static INLINE vdouble vadd3(vdouble v0, vdouble v1, vdouble v2) {
    return vadd(vadd(v0, v1), v2);
}

static INLINE vdouble vadd4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
    return vadd3(vadd(v0, v1), v2, v3);
}

static INLINE vdouble vadd5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
    return vadd4(vadd(v0, v1), v2, v3, v4);
}

static INLINE vdouble vadd6(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
    return vadd5(vadd(v0, v1), v2, v3, v4, v5);
}

static INLINE vdouble vadd7(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) {
    return vadd6(vadd(v0, v1), v2, v3, v4, v5, v6);
}

static INLINE vdouble vsub3(vdouble v0, vdouble v1, vdouble v2) {
    return vsub(vsub(v0, v1), v2);
}

static INLINE vdouble vsub4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
    return vsub3(vsub(v0, v1), v2, v3);
}

static INLINE vdouble vsub5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
    return vsub4(vsub(v0, v1), v2, v3, v4);
}

//

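// The vdouble2 helpers below implement double-double ("compensated") arithmetic: a value
// is carried as an unevaluated sum x + y, with y holding the rounding error of x.
// add_ss() is the fast two-sum (r.x = fl(x + y), r.y recovers the error; the usual
// precondition is |x| >= |y|), add2_ss() is the order-independent two-sum, and mul_ss()
// splits each operand with vupper() so that the partial products sum to the exact product.
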
static INLINE vdouble2 normalize_d(vdouble2 t) {
    vdouble2 s;

    s.x = vadd(t.x, t.y);
    s.y = vadd(vsub(t.x, s.x), t.y);

    return s;
}

static INLINE vdouble2 scale_d(vdouble2 d, vdouble s) {
    vdouble2 r = {vmul(d.x, s), vmul(d.y, s)};
    return r;
}

static INLINE vdouble2 add_ss(vdouble x, vdouble y) {
    vdouble2 r;

    r.x = vadd(x, y);
    r.y = vadd(vsub(x, r.x), y);

    return r;
}

static INLINE vdouble2 add2_ss(vdouble x, vdouble y) {
    vdouble2 r;

    r.x = vadd(x, y);
    vdouble v = vsub(r.x, x);
    r.y = vadd(vsub(x, vsub(r.x, v)), vsub(y, v));

    return r;
}
117 
static INLINE vdouble2 add_ds(vdouble2 x, vdouble y) {
119     vdouble2 r;
120 
121     r.x = vadd(x.x, y);
122     r.y = vadd3(vsub(x.x, r.x), y, x.y);
123 
124     return r;
125 }
126 
static INLINE vdouble2 add2_ds(vdouble2 x, vdouble y) {
128     vdouble2 r;
129 
130     r.x = vadd(x.x, y);
131     vdouble v = vsub(r.x, x.x);
132     r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y, v));
133     r.y = vadd(r.y, x.y);
134 
135     return r;
136 }
137 
static INLINE vdouble2 add_sd(vdouble x, vdouble2 y) {
139     vdouble2 r;
140 
141     r.x = vadd(x, y.x);
142     r.y = vadd3(vsub(x, r.x), y.x, y.y);
143 
144     return r;
145 }
146 
static INLINE vdouble2 add_dd(vdouble2 x, vdouble2 y) {
148     // |x| >= |y|
149 
150     vdouble2 r;
151 
152     r.x = vadd(x.x, y.x);
153     r.y = vadd4(vsub(x.x, r.x), y.x, x.y, y.y);
154 
155     return r;
156 }
157 
static INLINE vdouble2 add2_dd(vdouble2 x, vdouble2 y) {
159     vdouble2 r;
160 
161     r.x  = vadd(x.x, y.x);
162     vdouble v = vsub(r.x, x.x);
163     r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y.x, v));
164     r.y = vadd(r.y, vadd(x.y, y.y));
165 
166     return r;
167 }
168 
static INLINE vdouble2 div_dd(vdouble2 n, vdouble2 d) {
170     vdouble t = vrec(d.x);
171     vdouble dh  = vupper(d.x), dl  = vsub(d.x,  dh);
172     vdouble th  = vupper(t  ), tl  = vsub(t  ,  th);
173     vdouble nhh = vupper(n.x), nhl = vsub(n.x, nhh);
174 
175     vdouble2 q;
176 
177     q.x = vmul(n.x, t);
178 
179     vdouble u = vadd5(vsub(vmul(nhh, th), q.x), vmul(nhh, tl), vmul(nhl, th), vmul(nhl, tl),
180             vmul(q.x, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl))));
181 
182     q.y = vadd(vmul(t, vsub(n.y, vmul(q.x, d.y))), u);
183 
184     return q;
185 }
186 
static INLINE vdouble2 mul_ss(vdouble x, vdouble y) {
188     vdouble xh = vupper(x), xl = vsub(x, xh);
189     vdouble yh = vupper(y), yl = vsub(y, yh);
190     vdouble2 r;
191 
192     r.x = vmul(x, y);
193     r.y = vadd5(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl));
194 
195     return r;
196 }
197 
static INLINE vdouble2 mul_ds(vdouble2 x, vdouble y) {
199     vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
200     vdouble yh = vupper(y  ), yl = vsub(y  , yh);
201     vdouble2 r;
202 
203     r.x = vmul(x.x, y);
204     r.y = vadd6(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.y, y));
205 
206     return r;
207 }
208 
static INLINE vdouble2 mul_dd(vdouble2 x, vdouble2 y) {
210     vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
211     vdouble yh = vupper(y.x), yl = vsub(y.x, yh);
212     vdouble2 r;
213 
214     r.x = vmul(x.x, y.x);
215     r.y = vadd7(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.x, y.y), vmul(x.y, y.x));
216 
217     return r;
218 }
219 
static INLINE vdouble2 squ_d(vdouble2 x) {
221     vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
222     vdouble2 r;
223 
224     r.x = vmul(x.x, x.x);
225     r.y = vadd5(vmul(xh, xh), vneg(r.x), vmul(vadd(xh, xh), xl), vmul(xl, xl), vmul(x.x, vadd(x.y, x.y)));
226 
227     return r;
228 }
229 
static INLINE vdouble2 rec_s(vdouble d) {
231     vdouble t = vrec(d);
232     vdouble dh = vupper(d), dl = vsub(d, dh);
233     vdouble th = vupper(t), tl = vsub(t, th);
234     vdouble2 q;
235 
236     q.x = t;
237     q.y = vmul(t, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl)));
238 
239     return q;
240 }
241 
static INLINE vdouble2 sqrt_d(vdouble2 d) {
    vdouble t = vsqrt(vadd(d.x, d.y));
    return scale_d(mul_dd(add2_dd(d, mul_ss(t, t)), rec_s(t)), vcast_vd_d(0.5));
}

//

static INLINE vdouble xldexp(vdouble x, vint q) { return vldexp(x, q); }
250 
static INLINE vint xilogb(vdouble d) {
252     vdouble e = vcast_vd_vi(vsubi(vilogbp1(vabs(d)), vcast_vi_i(1)));
253     e = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-2147483648.0), e);
254     e = vsel(vmask_eq(vabs(d), vcast_vd_d(rtengine::RT_INFINITY)), vcast_vd_d(2147483647), e);
255     return vrint_vi_vd(e);
256 }
257 
static INLINE vdouble xsin(vdouble d) {
259     vint q;
260     vdouble u, s;
261 
262     q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_1_PI)));
263 
264     u = vcast_vd_vi(q);
265     d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*4)));
266     d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*4)));
267     d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*4)));
268 
269     s = vmul(d, d);
270 
271     d = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vneg(d), d);
272 
273     u = vcast_vd_d(-7.97255955009037868891952e-18);
274     u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
275     u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
276     u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
277     u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
278     u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
279     u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
280     u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
281     u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));
282 
283     u = vmla(s, vmul(u, d), d);
284 
285     return u;
286 }
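
// Example (sketch): xsin() evaluates sin() independently in every lane of a vdouble, e.g.
//     vdouble x = vcast_vd_d(0.5);   // all lanes = 0.5
//     vdouble s = xsin(x);           // all lanes ~= 0.479426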
287 
static INLINE vdouble xcos(vdouble d) {
289     vint q;
290     vdouble u, s;
291 
292     q = vrint_vi_vd(vsub(vmul(d, vcast_vd_d(rtengine::RT_1_PI)), vcast_vd_d(0.5)));
293     q = vaddi(vaddi(q, q), vcast_vi_i(1));
294 
295     u = vcast_vd_vi(q);
296     d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2)));
297     d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*2)));
298     d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*2)));
299 
300     s = vmul(d, d);
301 
302     d = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(0)), vneg(d), d);
303 
304     u = vcast_vd_d(-7.97255955009037868891952e-18);
305     u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
306     u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
307     u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
308     u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
309     u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
310     u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
311     u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
312     u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));
313 
314     u = vmla(s, vmul(u, d), d);
315 
316     return u;
317 }
318 
static INLINE vdouble2 xsincos(vdouble d) {
320     vint q;
321     vmask m;
322     vdouble u, s, t, rx, ry;
323     vdouble2 r;
324 
325     q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_2_PI)));
326 
327     s = d;
328 
329     u = vcast_vd_vi(q);
330     s = vmla(u, vcast_vd_d(-PI4_A*2), s);
331     s = vmla(u, vcast_vd_d(-PI4_B*2), s);
332     s = vmla(u, vcast_vd_d(-PI4_C*2), s);
333 
334     t = s;
335 
336     s = vmul(s, s);
337 
338     u = vcast_vd_d(1.58938307283228937328511e-10);
339     u = vmla(u, s, vcast_vd_d(-2.50506943502539773349318e-08));
340     u = vmla(u, s, vcast_vd_d(2.75573131776846360512547e-06));
341     u = vmla(u, s, vcast_vd_d(-0.000198412698278911770864914));
342     u = vmla(u, s, vcast_vd_d(0.0083333333333191845961746));
343     u = vmla(u, s, vcast_vd_d(-0.166666666666666130709393));
344     u = vmul(vmul(u, s), t);
345 
346     rx = vadd(t, u);
347 
348     u = vcast_vd_d(-1.13615350239097429531523e-11);
349     u = vmla(u, s, vcast_vd_d(2.08757471207040055479366e-09));
350     u = vmla(u, s, vcast_vd_d(-2.75573144028847567498567e-07));
351     u = vmla(u, s, vcast_vd_d(2.48015872890001867311915e-05));
352     u = vmla(u, s, vcast_vd_d(-0.00138888888888714019282329));
353     u = vmla(u, s, vcast_vd_d(0.0416666666666665519592062));
354     u = vmla(u, s, vcast_vd_d(-0.5));
355 
356     ry = vadd(vcast_vd_d(1), vmul(s, u));
357 
358     m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(0));
359     r.x = vsel(m, rx, ry);
360     r.y = vsel(m, ry, rx);
361 
362     m = vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2));
363     r.x = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x)));
364 
365     m = vmaski_eq(vandi(vaddi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2));
366     r.y = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y)));
367 
368     m = vmask_isinf(d);
369     r.x = vsel(m, vcast_vd_d(rtengine::RT_NAN), r.x);
370     r.y = vsel(m, vcast_vd_d(rtengine::RT_NAN), r.y);
371 
372     return r;
373 }
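
// xsincos() shares one argument reduction between both results: s is reduced to roughly
// [-pi/4, pi/4], the sine and cosine polynomials are both evaluated, and the low bits of q
// decide which value lands in r.x (sin) / r.y (cos) and which of them gets its sign flipped.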
374 
static INLINE vdouble xtan(vdouble d) {
376     vint q;
377     vdouble u, s, x;
378     vmask m;
379 
380     q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_2_PI)));
381 
382     u = vcast_vd_vi(q);
383     x = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2)));
384     x = vadd(x, vmul(u, vcast_vd_d(-PI4_B*2)));
385     x = vadd(x, vmul(u, vcast_vd_d(-PI4_C*2)));
386 
387     s = vmul(x, x);
388 
389     m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1));
390     x = vsel(m, vneg(x), x);
391 
392     u = vcast_vd_d(1.01419718511083373224408e-05);
393     u = vmla(u, s, vcast_vd_d(-2.59519791585924697698614e-05));
394     u = vmla(u, s, vcast_vd_d(5.23388081915899855325186e-05));
395     u = vmla(u, s, vcast_vd_d(-3.05033014433946488225616e-05));
396     u = vmla(u, s, vcast_vd_d(7.14707504084242744267497e-05));
397     u = vmla(u, s, vcast_vd_d(8.09674518280159187045078e-05));
398     u = vmla(u, s, vcast_vd_d(0.000244884931879331847054404));
399     u = vmla(u, s, vcast_vd_d(0.000588505168743587154904506));
400     u = vmla(u, s, vcast_vd_d(0.00145612788922812427978848));
401     u = vmla(u, s, vcast_vd_d(0.00359208743836906619142924));
402     u = vmla(u, s, vcast_vd_d(0.00886323944362401618113356));
403     u = vmla(u, s, vcast_vd_d(0.0218694882853846389592078));
404     u = vmla(u, s, vcast_vd_d(0.0539682539781298417636002));
405     u = vmla(u, s, vcast_vd_d(0.133333333333125941821962));
406     u = vmla(u, s, vcast_vd_d(0.333333333333334980164153));
407 
408     u = vmla(s, vmul(u, x), x);
409 
410     u = vsel(m, vrec(u), u);
411 
412     u = vsel(vmask_isinf(d), vcast_vd_d(rtengine::RT_NAN), u);
413 
414     return u;
415 }
416 
static INLINE vdouble atan2k(vdouble y, vdouble x) {
418     vdouble s, t, u;
419     vint q;
420     vmask p;
421 
422     q = vseli_lt(x, vcast_vd_d(0), vcast_vi_i(-2), vcast_vi_i(0));
423     x = vabs(x);
424 
425     q = vseli_lt(x, y, vaddi(q, vcast_vi_i(1)), q);
426     p = vmask_lt(x, y);
427     s = vsel (p, vneg(x), y);
428     t = vmax (x, y);
429 
430     s = vdiv(s, t);
431     t = vmul(s, s);
432 
433     u = vcast_vd_d(-1.88796008463073496563746e-05);
434     u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
435     u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
436     u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
437     u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
438     u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
439     u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
440     u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
441     u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
442     u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
443     u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
444     u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
445     u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
446     u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
447     u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
448     u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
449     u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
450     u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
451     u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));
452 
453     t = vadd(s, vmul(s, vmul(t, u)));
454     t = vadd(t, vmul(vcast_vd_vi(q), vcast_vd_d(rtengine::RT_PI/2)));
455 
456     return t;
457 }
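
// atan2k() is the shared polynomial kernel behind xatan2(), xasin() and xacos(): callers pass
// non-negative arguments (typically |y| and x), q folds the result into the right quadrant,
// and the special cases (zero, infinity, NaN) are patched up afterwards by the callers.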
458 
static INLINE vdouble xatan2(vdouble y, vdouble x) {
460     vdouble r = atan2k(vabs(y), x);
461 
462     r = vmulsign(r, x);
463     r = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))), vsub(vcast_vd_d(rtengine::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(rtengine::RT_PI/2), x))), r);
464     r = vsel(vmask_isinf(y), vsub(vcast_vd_d(rtengine::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(rtengine::RT_PI/4), x))), r);
465     r = vsel(vmask_eq(y, vcast_vd_d(0)), vsel(vmask_eq(vsign(x), vcast_vd_d(-1.0)), vcast_vd_d(rtengine::RT_PI), vcast_vd_d(0)), r);
466 
467     return vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), vmulsign(r, y));
468 }
469 
static INLINE vdouble xasin(vdouble d) {
471     vdouble x, y;
472     x = vadd(vcast_vd_d(1), d);
473     y = vsub(vcast_vd_d(1), d);
474     x = vmul(x, y);
475     x = vsqrt(x);
476     x = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), atan2k(vabs(d), x));
477     return vmulsign(x, d);
478 }
479 
static INLINE vdouble xacos(vdouble d) {
481     vdouble x, y;
482     x = vadd(vcast_vd_d(1), d);
483     y = vsub(vcast_vd_d(1), d);
484     x = vmul(x, y);
485     x = vsqrt(x);
486     x = vmulsign(atan2k(x, vabs(d)), d);
487     y = (vdouble)vandm(vmask_lt(d, vcast_vd_d(0)), (vmask)vcast_vd_d(rtengine::RT_PI));
488     x = vadd(x, y);
489     return x;
490 }
491 
static INLINE vdouble xatan(vdouble s) {
493     vdouble t, u;
494     vint q;
495 
496     q = vseli_lt(s, vcast_vd_d(0), vcast_vi_i(2), vcast_vi_i(0));
497     s = vabs(s);
498 
499     q = vseli_lt(vcast_vd_d(1), s, vaddi(q, vcast_vi_i(1)), q);
500     s = vsel(vmask_lt(vcast_vd_d(1), s), vdiv(vcast_vd_d(1), s), s);
501 
502     t = vmul(s, s);
503 
504     u = vcast_vd_d(-1.88796008463073496563746e-05);
505     u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
506     u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
507     u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
508     u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
509     u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
510     u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
511     u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
512     u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
513     u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
514     u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
515     u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
516     u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
517     u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
518     u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
519     u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
520     u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
521     u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
522     u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));
523 
524     t = vadd(s, vmul(s, vmul(t, u)));
525 
526     t = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vsub(vcast_vd_d(rtengine::RT_PI/2), t), t);
527     t = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2)), vneg(t), t);
528 
529     return t;
530 }
531 
static INLINE vdouble xlog(vdouble d) {
533     vdouble x, x2;
534     vdouble t, m;
535     vint e;
536 
537     e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
538     m = vldexp(d, vsubi(vcast_vi_i(0), e));
539 
540     x = vdiv(vadd(vcast_vd_d(-1), m), vadd(vcast_vd_d(1), m));
541     x2 = vmul(x, x);
542 
543     t = vcast_vd_d(0.148197055177935105296783);
544     t = vmla(t, x2, vcast_vd_d(0.153108178020442575739679));
545     t = vmla(t, x2, vcast_vd_d(0.181837339521549679055568));
546     t = vmla(t, x2, vcast_vd_d(0.22222194152736701733275));
547     t = vmla(t, x2, vcast_vd_d(0.285714288030134544449368));
548     t = vmla(t, x2, vcast_vd_d(0.399999999989941956712869));
549     t = vmla(t, x2, vcast_vd_d(0.666666666666685503450651));
550     t = vmla(t, x2, vcast_vd_d(2));
551 
552     x = vadd(vmul(x, t), vmul(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e)));
553 
554     x = vsel(vmask_ispinf(d), vcast_vd_d(rtengine::RT_INFINITY), x);
555     x = vsel(vmask_gt(vcast_vd_d(0), d), vcast_vd_d(rtengine::RT_NAN), x);
556     x = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-rtengine::RT_INFINITY), x);
557 
558     return x;
559 }
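
// xlog() scales d to m = d * 2^-e with m close to 1 (vilogbp1/vldexp), evaluates the
// atanh-style series in x = (m - 1)/(m + 1), and adds e * ln(2); the three vsel() lines at
// the end restore the special cases (+inf, negative input -> NaN, zero -> -inf).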
560 
static INLINE vdouble xexp(vdouble d) {
562     vint q = vrint_vi_vd(vmul(d, vcast_vd_d(R_LN2)));
563     vdouble s, u;
564 
565     s = vadd(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
566     s = vadd(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
567 
568     u = vcast_vd_d(2.08860621107283687536341e-09);
569     u = vmla(u, s, vcast_vd_d(2.51112930892876518610661e-08));
570     u = vmla(u, s, vcast_vd_d(2.75573911234900471893338e-07));
571     u = vmla(u, s, vcast_vd_d(2.75572362911928827629423e-06));
572     u = vmla(u, s, vcast_vd_d(2.4801587159235472998791e-05));
573     u = vmla(u, s, vcast_vd_d(0.000198412698960509205564975));
574     u = vmla(u, s, vcast_vd_d(0.00138888888889774492207962));
575     u = vmla(u, s, vcast_vd_d(0.00833333333331652721664984));
576     u = vmla(u, s, vcast_vd_d(0.0416666666666665047591422));
577     u = vmla(u, s, vcast_vd_d(0.166666666666666851703837));
578     u = vmla(u, s, vcast_vd_d(0.5));
579 
580     u = vadd(vcast_vd_d(1), vadd(s, vmul(vmul(s, s), u)));
581 
582     u = vldexp(u, q);
583 
584     u = vsel(vmask_isminf(d), vcast_vd_d(0), u);
585 
586     return u;
587 }
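
// xexp() writes d = q * ln(2) + s with |s| <= ln(2)/2, using the split L2U/L2L constants so
// the reduction loses almost no precision, evaluates a polynomial for exp(s), and rescales
// by 2^q with vldexp(); only -inf needs an explicit fixup at the end.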
588 
static INLINE vdouble2 logk(vdouble d) {
590     vdouble2 x, x2;
591     vdouble t, m;
592     vint e;
593 
594     e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
595     m = vldexp(d, vsubi(vcast_vi_i(0), e));
596 
597     x = div_dd(add2_ss(vcast_vd_d(-1), m), add2_ss(vcast_vd_d(1), m));
598     x2 = squ_d(x);
599     x2 = normalize_d(x2);
600 
601     t = vcast_vd_d(0.134601987501262130076155);
602     t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
603     t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
604     t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
605     t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
606     t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
607     t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
608     t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));
609 
610     return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
611                 vcast_vd_vi(e)),
612             add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
613 }
614 
static INLINE vdouble expk(vdouble2 d) {
616     vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
617     vint q = vrint_vi_vd(u);
618     vdouble2 s, t;
619 
620     s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
621     s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
622 
623     q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));
624 
625     s = normalize_d(s);
626 
627     u = vcast_vd_d(2.51069683420950419527139e-08);
628     u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
629     u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
630     u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
631     u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
632     u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
633     u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
634     u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
635     u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
636     u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));
637 
638     t = add_dd(s, mul_ds(squ_d(s), u));
639 
640     t = add_sd(vcast_vd_d(1), t);
641     u = vadd(t.x, t.y);
642     u = vldexp(u, q);
643 
644     return u;
645 }
646 
static INLINE vdouble xpow(vdouble x, vdouble y) {
648 #if 1
649     vmask yisint = vmask_eq(vcast_vd_vi(vrint_vi_vd(y)), y);
650     vmask yisodd = vandm(vmaski_eq(vandi(vrint_vi_vd(y), vcast_vi_i(1)), vcast_vi_i(1)), yisint);
651 
652     vdouble result = expk(mul_ds(logk(vabs(x)), y));
653 
654     //result = vsel(vmask_isnan(result), vcast_vd_d(rtengine::RT_INFINITY), result);
655 
656     result = vmul(result,
657             vsel(vmask_gt(x, vcast_vd_d(0)),
658                 vcast_vd_d(1),
659                 vsel(yisint,
660                     vsel(yisodd,
661                         vcast_vd_d(-1),
662                         vcast_vd_d(1)),
663                     vcast_vd_d(rtengine::RT_NAN))));
664 
665     vdouble efx = vreinterpret_vd_vm(vxorm(vreinterpret_vm_vd(vsub(vabs(x), vcast_vd_d(1))), vsignbit(y)));
666 
667     result = vsel(vmask_isinf(y),
668             vsel(vmask_lt(efx, vcast_vd_d(0)),
669                 vcast_vd_d(0),
670                 vsel(vmask_eq(efx, vcast_vd_d(0)),
671                     vcast_vd_d(1.0),
672                     vcast_vd_d(rtengine::RT_INFINITY))),
673             result);
674 
675     result = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))),
676             vmul(vsel(yisodd, vsign(x), vcast_vd_d(1)),
677                 vsel(vmask_lt(vsel(vmask_eq(x, vcast_vd_d(0)), vneg(y), y), vcast_vd_d(0)),
678                     vcast_vd_d(0),
679                     vcast_vd_d(rtengine::RT_INFINITY))),
680             result);
681 
682     result = vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), result);
683 
684     result = vsel(vorm(vmask_eq(y, vcast_vd_d(0)), vmask_eq(x, vcast_vd_d(1))), vcast_vd_d(1), result);
685 
686     return result;
687 #else
688     return expk(mul_ds(logk(x), y));
689 #endif
690 }
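
// xpow() computes |x|^y as expk(y * logk(|x|)) in double-double precision; the vsel() chain
// afterwards restores the sign for odd integer exponents, NaN for negative x with non-integer
// y, and the remaining special cases (y = +/-inf, x = 0 or +/-inf, NaN inputs, y == 0, x == 1).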
691 
static INLINE vdouble2 expk2(vdouble2 d) {
693     vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
694     vint q = vrint_vi_vd(u);
695     vdouble2 s, t;
696 
697     s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
698     s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
699 
700     q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));
701 
702     s = normalize_d(s);
703 
704     u = vcast_vd_d(2.51069683420950419527139e-08);
705     u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
706     u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
707     u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
708     u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
709     u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
710     u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
711     u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
712     u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
713     u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));
714 
715     t = add_dd(s, mul_ds(squ_d(s), u));
716 
717     t = add_sd(vcast_vd_d(1), t);
718 
719     return dd(vldexp(t.x, q), vldexp(t.y, q));
720 }
721 
static INLINE vdouble xsinh(vdouble x) {
723     vdouble y = vabs(x);
724     vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
725     d = add2_dd(d, div_dd(dd(vcast_vd_d(-1), vcast_vd_d(0)), d));
726     y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));
727 
728     y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
729     y = vmulsign(y, x);
730     y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
731 
732     return y;
733 }
734 
static INLINE vdouble xcosh(vdouble x) {
736     vdouble2 d = expk2(dd(x, vcast_vd_d(0)));
737     d = add2_dd(d, div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d));
738     vdouble y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));
739 
740     y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
741     y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
742 
743     return y;
744 }
745 
static INLINE vdouble xtanh(vdouble x) {
747     vdouble y = vabs(x);
748     vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
749     vdouble2 e = div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d);
750     d = div_dd(add2_dd(d, scale_d(e, vcast_vd_d(-1))), add2_dd(d, e));
751     y = d.x + d.y;
752 
753     y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(1.0), y);
754     y = vmulsign(y, x);
755     y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
756 
757     return y;
758 }
759 
static INLINE vdouble2 logk2(vdouble2 d) {
761     vdouble2 x, x2, m;
762     vdouble t;
763     vint e;
764 
765     d = normalize_d(d);
766     e = vilogbp1(vmul(d.x, vcast_vd_d(0.7071)));
767     m = scale_d(d, vldexp(vcast_vd_d(1), vsubi(vcast_vi_i(0), e)));
768 
769     x = div_dd(add2_ds(m, vcast_vd_d(-1)), add2_ds(m, vcast_vd_d(1)));
770     x2 = squ_d(x);
771     x2 = normalize_d(x2);
772 
773     t = vcast_vd_d(0.134601987501262130076155);
774     t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
775     t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
776     t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
777     t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
778     t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
779     t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
780     t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));
781 
782     return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
783                 vcast_vd_vi(e)),
784             add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
785 }
786 
static INLINE vdouble xasinh(vdouble x) {
788     vdouble y = vabs(x);
789     vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(y, y),  vcast_vd_d(1))), y));
790     y = vadd(d.x, d.y);
791 
792     y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
793     y = vmulsign(y, x);
794     y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
795 
796     return y;
797 }
798 
static INLINE vdouble xacosh(vdouble x) {
800     vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(x, x), vcast_vd_d(-1))), x));
801     vdouble y = vadd(d.x, d.y);
802 
803     y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
804     y = vsel(vmask_eq(x, vcast_vd_d(1.0)), vcast_vd_d(0.0), y);
805     y = vsel(vmask_lt(x, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_NAN), y);
806     y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
807 
808     return y;
809 }
810 
static INLINE vdouble xatanh(vdouble x) {
812     vdouble y = vabs(x);
813     vdouble2 d = logk2(div_dd(add2_ss(vcast_vd_d(1), y), add2_ss(vcast_vd_d(1), -y)));
814     y = vsel(vmask_gt(y, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_NAN), vsel(vmask_eq(y, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_INFINITY), vmul(vadd(d.x, d.y), vcast_vd_d(0.5))));
815 
816     y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), y);
817     y = vmulsign(y, x);
818     y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
819 
820     return y;
821 }
822 
static INLINE vdouble xcbrt(vdouble d) {
824     vdouble x, y, q = vcast_vd_d(1.0);
825     vint e, qu, re;
826     vdouble t;
827 
828     e = vilogbp1(vabs(d));
829     d = vldexp(d, vsubi(vcast_vi_i(0), e));
830 
831     t = vadd(vcast_vd_vi(e), vcast_vd_d(6144));
832     qu = vtruncate_vi_vd(vdiv(t, vcast_vd_d(3)));
833     re = vtruncate_vi_vd(vsub(t, vmul(vcast_vd_vi(qu), vcast_vd_d(3))));
834 
835     q = vsel(vmaski_eq(re, vcast_vi_i(1)), vcast_vd_d(1.2599210498948731647672106), q);
836     q = vsel(vmaski_eq(re, vcast_vi_i(2)), vcast_vd_d(1.5874010519681994747517056), q);
837     q = vldexp(q, vsubi(qu, vcast_vi_i(2048)));
838 
839     q = vmulsign(q, d);
840 
841     d = vabs(d);
842 
843     x = vcast_vd_d(-0.640245898480692909870982);
844     x = vmla(x, d, vcast_vd_d(2.96155103020039511818595));
845     x = vmla(x, d, vcast_vd_d(-5.73353060922947843636166));
846     x = vmla(x, d, vcast_vd_d(6.03990368989458747961407));
847     x = vmla(x, d, vcast_vd_d(-3.85841935510444988821632));
848     x = vmla(x, d, vcast_vd_d(2.2307275302496609725722));
849 
850     y = vmul(x, x); y = vmul(y, y); x = vsub(x, vmul(vmla(d, y, vneg(x)), vcast_vd_d(1.0 / 3.0)));
851     y = vmul(vmul(d, x), x);
852     y = vmul(vsub(y, vmul(vmul(vcast_vd_d(2.0 / 3.0), y), vmla(y, x, vcast_vd_d(-1.0)))), q);
853 
854     return y;
855 }
856 
static INLINE vdouble xexp2(vdouble a) {
858     vdouble u = expk(mul_ds(dd(vcast_vd_d(0.69314718055994528623), vcast_vd_d(2.3190468138462995584e-17)), a));
859     u = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), u);
860     u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
861     return u;
862 }
863 
static INLINE vdouble xexp10(vdouble a) {
865     vdouble u = expk(mul_ds(dd(vcast_vd_d(2.3025850929940459011), vcast_vd_d(-2.1707562233822493508e-16)), a));
866     u = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), u);
867     u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
868     return u;
869 }
870 
static INLINE vdouble xexpm1(vdouble a) {
872     vdouble2 d = add2_ds(expk2(dd(a, vcast_vd_d(0))), vcast_vd_d(-1.0));
873     vdouble x = d.x + d.y;
874     x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
875     x = vsel(vmask_isminf(a), vcast_vd_d(-1), x);
876     return x;
877 }
878 
static INLINE vdouble xlog10(vdouble a) {
880     vdouble2 d = mul_dd(logk(a), dd(vcast_vd_d(0.43429448190325176116), vcast_vd_d(6.6494347733425473126e-17)));
881     vdouble x = d.x + d.y;
882 
883     x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
884     x = vsel(vmask_gt(vcast_vd_d(0), a), vcast_vd_d(rtengine::RT_NAN), x);
885     x = vsel(vmask_eq(a, vcast_vd_d(0)), vcast_vd_d(-rtengine::RT_INFINITY), x);
886 
887     return x;
888 }
889 
static INLINE vdouble xlog1p(vdouble a) {
891     vdouble2 d = logk2(add2_ss(a, vcast_vd_d(1)));
892     vdouble x = d.x + d.y;
893 
894     x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
895     x = vsel(vmask_gt(vcast_vd_d(-1), a), vcast_vd_d(rtengine::RT_NAN), x);
896     x = vsel(vmask_eq(a, vcast_vd_d(-1)), vcast_vd_d(-rtengine::RT_INFINITY), x);
897 
898     return x;
899 }
900 
//

typedef struct {
    vfloat x, y;
} vfloat2;

static INLINE vfloat vabsf(vfloat f) { return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f); }
static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vcast_vf_f(-0.0f)); }

#ifdef __SSE4_1__
// only one instruction when using SSE4.1
static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
    return _mm_blendv_ps(y,x,(vfloat)mask);
}

static INLINE vint vselc(vmask mask, vint x, vint y) {
    return _mm_blendv_epi8(y,x,mask);
}

#else
// three instructions when using SSE2
static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
    return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
}

static INLINE vint vselc(vmask mask, vint x, vint y) {
    return vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
}
#endif
930 
static INLINE vfloat vselfzero(vmask mask, vfloat x) {
    // returns value of x if corresponding mask bits are 1, else returns 0
    // faster than vself(mask, x, ZEROV)
    return _mm_and_ps((vfloat)mask, x);
}
static INLINE vfloat vselfnotzero(vmask mask, vfloat x) {
    // returns value of x if corresponding mask bits are 0, else returns 0
    // faster than vself(mask, ZEROV, x)
    return _mm_andnot_ps((vfloat)mask, x);
}

static INLINE vint vselizero(vmask mask, vint x) {
    // returns value of x if corresponding mask bits are 1, else returns 0
    // faster than vselc(mask, x, ZEROV)
    return _mm_and_si128(mask, x);
}
static INLINE vint vselinotzero(vmask mask, vint x) {
    // returns value of x if corresponding mask bits are 0, else returns 0
    // faster than vselc(mask, ZEROV, x)
    return _mm_andnot_si128(mask, x);
}

static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) {
    vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1));
    return vori2(vandi2(m2, x), vandnoti2(m2, y));
}

static INLINE vmask vsignbitf(vfloat f) {
    return vandm((vmask)f, (vmask)vcast_vf_f(-0.0f));
}

static INLINE vfloat vmulsignf(vfloat x, vfloat y) {
    return (vfloat)vxorm((vmask)x, vsignbitf(y));
}

static INLINE vfloat vsignf(vfloat f) {
    return (vfloat)vorm((vmask)vcast_vf_f(1.0f), vandm((vmask)vcast_vf_f(-0.0f), (vmask)f));
}

static INLINE vmask vmaskf_isinf(vfloat d) { return vmaskf_eq(vabsf(d), vcast_vf_f(INFINITYf)); }
static INLINE vmask vmaskf_ispinf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(INFINITYf)); }
static INLINE vmask vmaskf_isminf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(-INFINITYf)); }
static INLINE vmask vmaskf_isnan(vfloat d) { return vmaskf_neq(d, d); }
// the following is equivalent to vorm(vmaskf_isnan(a), vmaskf_isnan(b)), but faster
static INLINE vmask vmaskf_isnan(vfloat a, vfloat b) { return (vmask)_mm_cmpunord_ps(a, b); }
static INLINE vfloat visinf2f(vfloat d, vfloat m) { return (vfloat)vandm(vmaskf_isinf(d), vorm(vsignbitf(d), (vmask)m)); }
static INLINE vfloat visinff(vfloat d) { return visinf2f(d, vcast_vf_f(1.0f)); }
978 
static INLINE vint2 vilogbp1f(vfloat d) {
980     vmask m = vmaskf_lt(d, vcast_vf_f(5.421010862427522E-20f));
981     d = vself(m, vmulf(vcast_vf_f(1.8446744073709552E19f), d), d);
982     vint2 q = vandi2(vsrli2(vcast_vi2_vm(vreinterpret_vm_vf(d)), 23), vcast_vi2_i(0xff));
983     q = vsubi2(q, vseli2(m, vcast_vi2_i(64 + 0x7e), vcast_vi2_i(0x7e)));
984     return q;
985 }
986 
static INLINE vfloat vldexpf(vfloat x, vint2 q) {
988     vfloat u;
989     vint2 m = vsrai2(q, 31);
990     m = vslli2(vsubi2(vsrai2(vaddi2(m, q), 6), m), 4);
991     q = vsubi2(q, vslli2(m, 2));
992     u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(m, vcast_vi2_i(0x7f)), 23)));
993     x = vmulf(vmulf(vmulf(vmulf(x, u), u), u), u);
994     u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(q, vcast_vi2_i(0x7f)), 23)));
995     return vmulf(x, u);
996 }
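
// vldexpf() builds x * 2^q without over/underflowing the 8-bit exponent field: roughly a
// quarter of q goes into u = 2^m, which is applied four times, and the remainder 2^(q - 4m)
// is applied once, so each intermediate scale factor stays representable.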
997 
static INLINE vfloat xsinf(vfloat d) {
999     vint2 q;
1000     vfloat u, s;
1001 
1002     q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)rtengine::RT_1_PI)));
1003 
1004     u = vcast_vf_vi2(q);
1005     d = vmlaf(u, vcast_vf_f(-PI4_Af*4), d);
1006     d = vmlaf(u, vcast_vf_f(-PI4_Bf*4), d);
1007     d = vmlaf(u, vcast_vf_f(-PI4_Cf*4), d);
1008     d = vmlaf(u, vcast_vf_f(-PI4_Df*4), d);
1009 
1010     s = vmulf(d, d);
1011 
1012     d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vnegf(d), d);
1013 
1014     u = vcast_vf_f(2.6083159809786593541503e-06f);
1015     u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
1016     u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
1017     u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));
1018 
1019     u = vmlaf(s, vmulf(u, d), d);
1020 
1021     return u;
1022 }
1023 
static INLINE vfloat xcosf(vfloat d) {
1025     vint2 q;
1026     vfloat u, s;
1027 
1028     q = vrint_vi2_vf(vsubf(vmulf(d, vcast_vf_f((float)rtengine::RT_1_PI)), vcast_vf_f(0.5f)));
1029     q = vaddi2(vaddi2(q, q), vcast_vi2_i(1));
1030 
1031     u = vcast_vf_vi2(q);
1032     d = vmlaf(u, vcast_vf_f(-PI4_Af*2), d);
1033     d = vmlaf(u, vcast_vf_f(-PI4_Bf*2), d);
1034     d = vmlaf(u, vcast_vf_f(-PI4_Cf*2), d);
1035     d = vmlaf(u, vcast_vf_f(-PI4_Df*2), d);
1036 
1037     s = vmulf(d, d);
1038 
1039     d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), d, vnegf(d));
1040 
1041     u = vcast_vf_f(2.6083159809786593541503e-06f);
1042     u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
1043     u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
1044     u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));
1045 
1046     u = vmlaf(s, vmulf(u, d), d);
1047 
1048     return u;
1049 }
1050 
static INLINE vfloat2 xsincosf(vfloat d) {
1052     vint2 q;
1053     vmask m;
1054     vfloat u, s, t, rx, ry;
1055     vfloat2 r;
1056 
1057     q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)rtengine::RT_2_PI)));
1058 
1059     s = d;
1060 
1061     u = vcast_vf_vi2(q);
1062     s = vmlaf(u, vcast_vf_f(-PI4_Af*2), s);
1063     s = vmlaf(u, vcast_vf_f(-PI4_Bf*2), s);
1064     s = vmlaf(u, vcast_vf_f(-PI4_Cf*2), s);
1065     s = vmlaf(u, vcast_vf_f(-PI4_Df*2), s);
1066 
1067     t = s;
1068 
1069     s = vmulf(s, s);
1070 
1071     u = vcast_vf_f(-0.000195169282960705459117889f);
1072     u = vmlaf(u, s, vcast_vf_f(0.00833215750753879547119141f));
1073     u = vmlaf(u, s, vcast_vf_f(-0.166666537523269653320312f));
1074     u = vmulf(vmulf(u, s), t);
1075 
1076     rx = vaddf(t, u);
1077 
1078     u = vcast_vf_f(-2.71811842367242206819355e-07f);
1079     u = vmlaf(u, s, vcast_vf_f(2.47990446951007470488548e-05f));
1080     u = vmlaf(u, s, vcast_vf_f(-0.00138888787478208541870117f));
1081     u = vmlaf(u, s, vcast_vf_f(0.0416666641831398010253906f));
1082     u = vmlaf(u, s, vcast_vf_f(-0.5));
1083 
1084     ry = vaddf(vcast_vf_f(1), vmulf(s, u));
1085 
1086     m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
1087     r.x = vself(m, rx, ry);
1088     r.y = vself(m, ry, rx);
1089 
1090     m = vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
1091     r.x = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.x)));
1092 
1093     m = vmaski2_eq(vandi2(vaddi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
1094     r.y = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y)));
1095 
1096     m = vmaskf_isinf(d);
1097     r.x = vself(m, vcast_vf_f(rtengine::RT_NAN), r.x);
1098     r.y = vself(m, vcast_vf_f(rtengine::RT_NAN), r.y);
1099 
1100     return r;
1101 }
1102 
static INLINE vfloat xtanf(vfloat d) {
1104     vint2 q;
1105     vmask m;
1106     vfloat u, s, x;
1107 
1108     q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)(2 * rtengine::RT_1_PI))));
1109 
1110     x = d;
1111 
1112     u = vcast_vf_vi2(q);
1113     x = vmlaf(u, vcast_vf_f(-PI4_Af*2), x);
1114     x = vmlaf(u, vcast_vf_f(-PI4_Bf*2), x);
1115     x = vmlaf(u, vcast_vf_f(-PI4_Cf*2), x);
1116     x = vmlaf(u, vcast_vf_f(-PI4_Df*2), x);
1117 
1118     s = vmulf(x, x);
1119 
1120     m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
1121     x = vself(m, vnegf(x), x);
1122 
1123     u = vcast_vf_f(0.00927245803177356719970703f);
1124     u = vmlaf(u, s, vcast_vf_f(0.00331984995864331722259521f));
1125     u = vmlaf(u, s, vcast_vf_f(0.0242998078465461730957031f));
1126     u = vmlaf(u, s, vcast_vf_f(0.0534495301544666290283203f));
1127     u = vmlaf(u, s, vcast_vf_f(0.133383005857467651367188f));
1128     u = vmlaf(u, s, vcast_vf_f(0.333331853151321411132812f));
1129 
1130     u = vmlaf(s, vmulf(u, x), x);
1131 
1132     u = vself(m, vrecf(u), u);
1133 
1134     u = vself(vmaskf_isinf(d), vcast_vf_f(NANf), u);
1135 
1136     return u;
1137 }
1138 
static INLINE vfloat xatanf(vfloat s) {
1140     vfloat t, u;
1141     vint2 q;
1142 
1143     q = vseli2_lt(s, vcast_vf_f(0.0f), vcast_vi2_i(2), vcast_vi2_i(0));
1144     s = vabsf(s);
1145 
1146     q = vseli2_lt(vcast_vf_f(1.0f), s, vaddi2(q, vcast_vi2_i(1)), q);
1147     s = vself(vmaskf_lt(vcast_vf_f(1.0f), s), vdivf(vcast_vf_f(1.0f), s), s);
1148 
1149     t = vmulf(s, s);
1150 
1151     u = vcast_vf_f(0.00282363896258175373077393f);
1152     u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
1153     u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
1154     u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
1155     u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
1156     u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
1157     u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
1158     u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));
1159 
1160     t = vaddf(s, vmulf(s, vmulf(t, u)));
1161 
1162     t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), t), t);
1163     t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), vnegf(t), t);
1164 
1165     return t;
1166 }
1167 
static INLINE vfloat atan2kf(vfloat y, vfloat x) {
1169     vfloat s, t, u;
1170     vint2 q;
1171     vmask p;
1172 
1173     q = vseli2_lt(x, vcast_vf_f(0.0f), vcast_vi2_i(-2), vcast_vi2_i(0));
1174     x = vabsf(x);
1175 
1176     q = vseli2_lt(x, y, vaddi2(q, vcast_vi2_i(1)), q);
1177     p = vmaskf_lt(x, y);
1178     s = vself(p, vnegf(x), y);
1179     t = vmaxf(x, y);
1180 
1181     s = vdivf(s, t);
1182     t = vmulf(s, s);
1183 
1184     u = vcast_vf_f(0.00282363896258175373077393f);
1185     u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
1186     u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
1187     u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
1188     u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
1189     u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
1190     u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
1191     u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));
1192 
1193     t = vaddf(s, vmulf(s, vmulf(t, u)));
1194     t = vaddf(t, vmulf(vcast_vf_vi2(q), vcast_vf_f((float)(rtengine::RT_PI/2))));
1195 
1196     return t;
1197 }
1198 
static INLINE vfloat xatan2f(vfloat y, vfloat x) {
1200     vfloat r = atan2kf(vabsf(y), x);
1201 
1202     r = vmulsignf(r, x);
1203     r = vself(vorm(vmaskf_isinf(x), vmaskf_eq(x, vcast_vf_f(0.0f))), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(rtengine::RT_PI/2)), x))), r);
1204     r = vself(vmaskf_isinf(y), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(rtengine::RT_PI/4)), x))), r);
1205     r = vself(vmaskf_eq(y, vcast_vf_f(0.0f)), vselfzero(vmaskf_eq(vsignf(x), vcast_vf_f(-1.0f)), vcast_vf_f((float)rtengine::RT_PI)), r);
1206 
1207     return vself(vmaskf_isnan(x, y), vcast_vf_f(NANf), vmulsignf(r, y));
1208 }
1209 
static INLINE vfloat xasinf(vfloat d) {
1211     vfloat x, y;
1212     x = vaddf(vcast_vf_f(1.0f), d);
1213     y = vsubf(vcast_vf_f(1.0f), d);
1214     x = vmulf(x, y);
1215     x = vsqrtf(x);
1216     x = vself(vmaskf_isnan(x), vcast_vf_f(NANf), atan2kf(vabsf(d), x));
1217     return vmulsignf(x, d);
1218 }
1219 
static INLINE vfloat xacosf(vfloat d) {
1221     vfloat x, y;
1222     x = vaddf(vcast_vf_f(1.0f), d);
1223     y = vsubf(vcast_vf_f(1.0f), d);
1224     x = vmulf(x, y);
1225     x = vsqrtf(x);
1226     x = vmulsignf(atan2kf(x, vabsf(d)), d);
1227     y = (vfloat)vandm(vmaskf_lt(d, vcast_vf_f(0.0f)), (vmask)vcast_vf_f((float)rtengine::RT_PI));
1228     x = vaddf(x, y);
1229     return x;
1230 }
1231 
static INLINE vfloat xlogf(vfloat d) {
1233     vfloat x, x2, t, m;
1234     vint2 e;
1235 
1236     e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
1237     m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
1238 
1239     x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
1240     x2 = vmulf(x, x);
1241 
1242     t = vcast_vf_f(0.2371599674224853515625f);
1243     t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
1244     t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
1245     t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
1246     t = vmlaf(t, x2, vcast_vf_f(2.0f));
1247 
1248     x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
1249 
1250     x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x);
1251     x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(NANf), x);
1252     x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(-INFINITYf), x);
1253 
1254     return x;
1255 }
1256 
static INLINE vfloat xlogf1(vfloat d) { // does xlogf(vmaxf(d, 1.f)) but faster
1258     vfloat x, x2, t, m;
1259     vint2 e;
1260 
1261     e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
1262     m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
1263 
1264     x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
1265     x2 = vmulf(x, x);
1266 
1267     t = vcast_vf_f(0.2371599674224853515625f);
1268     t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
1269     t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
1270     t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
1271     t = vmlaf(t, x2, vcast_vf_f(2.0f));
1272 
1273     x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
1274 
1275     x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x);
1276     x = vselfnotzero(vmaskf_le(d, vcast_vf_f(1.f)), x);
1277 
1278     return x;
1279 }
1280 
static INLINE vfloat xlogf0(vfloat d) {
1282     vfloat x, x2, t, m;
1283     vint2 e;
1284 
1285     e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
1286     m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
1287 
1288     x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
1289     x2 = vmulf(x, x);
1290 
1291     t = vcast_vf_f(0.2371599674224853515625f);
1292     t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
1293     t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
1294     t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
1295     t = vmlaf(t, x2, vcast_vf_f(2.0f));
1296 
1297     x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
1298 
1299     x = vself(vmaskf_ispinf(d), vcast_vf_f(0), x);
1300     x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(0), x);
1301     x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(0), x);
1302 
1303     return x;
1304 }
1305 
static INLINE vfloat xlogfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > 0 e.g. when filling a lookup table
1307     vfloat x, x2, t, m;
1308     vint2 e;
1309 
1310     e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
1311     m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
1312 
1313     x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
1314     x2 = vmulf(x, x);
1315 
1316     t = vcast_vf_f(0.2371599674224853515625f);
1317     t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
1318     t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
1319     t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
1320     t = vmlaf(t, x2, vcast_vf_f(2.0f));
1321 
1322     return vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
1323 
1324 }
1325 
static INLINE vfloat xexpf(vfloat d) {
1327     vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
1328     vfloat s, u;
1329 
1330     s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d);
1331     s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);
1332 
1333     u = vcast_vf_f(0.00136324646882712841033936f);
1334     u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
1335     u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
1336     u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
1337     u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
1338 
1339     u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));
1340 
1341     u = vldexpf(u, q);
1342 
1343     // -104.0
1344     return vselfnotzero(vmaskf_gt(vcast_vf_f(-104.f), d), u);
1345 }
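
// xexpf() flushes its result to zero (via the final vselfnotzero()) once d < -104, which is
// roughly where expf() underflows to zero in single precision, rather than relying on
// vldexpf() to produce a sensible denormal for such inputs.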
1346 
static INLINE vfloat xexpfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > -104.f e.g. when filling a lookup table
1348     vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
1349     vfloat s, u;
1350 
1351     s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d);
1352     s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);
1353 
1354     u = vcast_vf_f(0.00136324646882712841033936f);
1355     u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
1356     u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
1357     u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
1358     u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
1359 
1360     u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));
1361 
1362     return vldexpf(u, q);
1363 }
1364 
static INLINE vfloat xcbrtf(vfloat d) {
1366     vfloat x, y, q = vcast_vf_f(1.0), t;
1367     vint2 e, qu, re;
1368 
1369     e = vilogbp1f(vabsf(d));
1370     d = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
1371 
1372     t = vaddf(vcast_vf_vi2(e), vcast_vf_f(6144));
1373     qu = vtruncate_vi2_vf(vdivf(t, vcast_vf_f(3)));
1374     re = vtruncate_vi2_vf(vsubf(t, vmulf(vcast_vf_vi2(qu), vcast_vf_f(3))));
1375 
1376     q = vself(vmaski2_eq(re, vcast_vi2_i(1)), vcast_vf_f(1.2599210498948731647672106f), q);
1377     q = vself(vmaski2_eq(re, vcast_vi2_i(2)), vcast_vf_f(1.5874010519681994747517056f), q);
1378     q = vldexpf(q, vsubi2(qu, vcast_vi2_i(2048)));
1379 
1380     q = vmulsignf(q, d);
1381     d = vabsf(d);
1382 
1383     x = vcast_vf_f(-0.601564466953277587890625f);
1384     x = vmlaf(x, d, vcast_vf_f(2.8208892345428466796875f));
1385     x = vmlaf(x, d, vcast_vf_f(-5.532182216644287109375f));
1386     x = vmlaf(x, d, vcast_vf_f(5.898262500762939453125f));
1387     x = vmlaf(x, d, vcast_vf_f(-3.8095417022705078125f));
1388     x = vmlaf(x, d, vcast_vf_f(2.2241256237030029296875f));
1389 
1390     y = vmulf(vmulf(d, x), x);
1391     y = vmulf(vsubf(y, vmulf(vmulf(vcast_vf_f(2.0f / 3.0f), y), vmlaf(y, x, vcast_vf_f(-1.0f)))), q);
1392 
1393     return y;
1394 }
1395 
static INLINE vfloat vclampf(vfloat value, vfloat low, vfloat high) {
    // clamps value in [low;high], returns low if value is NaN
    return vmaxf(vminf(high, value), low);
}

static INLINE vfloat SQRV(vfloat a){
    return a * a;
}
1404 
static inline void vswap( vmask condition, vfloat &a, vfloat &b) {
    // conditionally swaps the elements of two vfloats
    vfloat temp = vself(condition, a, b); // the values which fit the condition
    a = vself(condition, b, a); // the values which fit the inverted condition
    b = temp;
}
1411 
static inline float vhadd( vfloat a ) {
    // returns a[0] + a[1] + a[2] + a[3]
    a += _mm_movehl_ps(a, a);
    return _mm_cvtss_f32(_mm_add_ss(a, _mm_shuffle_ps(a, a, 1)));
}

static inline float vhmin(vfloat a) {
    // returns min(a[0], a[1], a[2], a[3])
    a = vminf(a, _mm_movehl_ps(a, a));
    return _mm_cvtss_f32(vminf(a, _mm_shuffle_ps(a, a, 1)));
}

static inline float vhmax(vfloat a) {
    // returns max(a[0], a[1], a[2], a[3])
    a = vmaxf(a, _mm_movehl_ps(a, a));
    return _mm_cvtss_f32(vmaxf(a, _mm_shuffle_ps(a, a, 1)));
}

static INLINE vfloat vmul2f(vfloat a){
    // fastest way to multiply by 2
    return a + a;
}
1434 
static INLINE vfloat vintpf(vfloat a, vfloat b, vfloat c) {
    // calculate a * b + (1 - a) * c (interpolate two values)
    // following is valid:
    // vintpf(a, b+x, c+x) = vintpf(a, b, c) + x
    // vintpf(a, b*x, c*x) = vintpf(a, b, c) * x
    return a * (b-c) + c;
}

static INLINE vfloat vdup(vfloat a){
    // returns { a[0],a[0],a[1],a[1] }
    return _mm_unpacklo_ps( a, a );
}

static INLINE vfloat vaddc2vfu(float &a)
{
    // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
    vfloat a1 = _mm_loadu_ps( &a );
    vfloat a2 = _mm_loadu_ps( (&a) + 4 );
    return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )) + _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 3,1,3,1 ));
}

static INLINE vfloat vadivapb (vfloat a, vfloat b) {
    return a / (a+b);
}

static INLINE void vconvertrgbrgbrgbrgb2rrrrggggbbbb (const float * src, vfloat &rv, vfloat &gv, vfloat &bv) { // cool function name, isn't it ? :P
    // converts a sequence of 4 float RGB triplets to 3 red, green and blue quadruples
    rv = _mm_setr_ps(src[0],src[3],src[6],src[9]);
    gv = _mm_setr_ps(src[1],src[4],src[7],src[10]);
    bv = _mm_setr_ps(src[2],src[5],src[8],src[11]);
}
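
// Example (sketch, with a hypothetical buffer): deinterleaving four packed RGB pixels:
//     float buf[12] = {r0,g0,b0, r1,g1,b1, r2,g2,b2, r3,g3,b3};
//     vfloat rv, gv, bv;
//     vconvertrgbrgbrgbrgb2rrrrggggbbbb(buf, rv, gv, bv);
//     // rv = {r0,r1,r2,r3}, gv = {g0,g1,g2,g3}, bv = {b0,b1,b2,b3}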
1466 
#if defined( __SSE4_1__ ) && defined( __x86_64__ )
static INLINE vfloat vceilf(vfloat x) {
    return _mm_round_ps(x, _MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC);
}

#else

static INLINE vfloat vceilf(vfloat x) {
    __m128i zerov = _mm_setzero_si128();
    zerov = _mm_cmpeq_epi32(zerov, zerov);
    const vfloat onev = (vfloat)_mm_slli_epi32(_mm_srli_epi32(zerov, 25), 23); //create vector 1.0f
    const vfloat xi = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
    return xi + _mm_and_ps(_mm_cmplt_ps(xi, x), onev);
}
#endif

#endif // __SSE2__