////////////////////////////////////////////////////////////////
//
// This code was taken from http://shibatch.sourceforge.net/
// Many thanks to the author of the original version: Naoki Shibata
//
// Copyright Naoki Shibata and contributors 2010 - 2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file sleef_LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
// This version contains modifications made by Ingo Weyrich
//
////////////////////////////////////////////////////////////////

#ifndef SLEEFSSEAVX
#define SLEEFSSEAVX

#include <assert.h>
#include "rt_math.h"
#ifdef __SSE2__
#include "helpersse2.h"

#ifdef ENABLE_AVX
#include "helperavx.h"
#endif

#ifndef INLINE
#define INLINE extern inline
#endif

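// pi/4 split into three parts whose sum is pi/4 to well beyond double
// precision; each part has trailing zero bits, so multiplying by a small
// integer stays exact (Cody-Waite style argument reduction, used below).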
#define PI4_A .7853981554508209228515625
#define PI4_B .794662735614792836713604629039764404296875e-8
#define PI4_C .306161699786838294306516483068750264552437361480769e-16
#define M_4_PI 1.273239544735162542821171882678754627704620361328125

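// ln(2) split into an upper part with trailing zero bits (L2U) and the
// remainder (L2L), used the same way in the exp-type functions;
// R_LN2 = 1/ln(2).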
#define L2U .69314718055966295651160180568695068359375
#define L2L .28235290563031577122588448175013436025525412068e-12
#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931

#define PI4_Af 0.78515625f
#define PI4_Bf 0.00024127960205078125f
#define PI4_Cf 6.3329935073852539062e-07f
#define PI4_Df 4.9604681473525147339e-10f

#define L2Uf 0.693145751953125f
#define L2Lf 1.428606765330187045e-06f
#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f

#define INFINITYf ((float)rtengine::RT_INFINITY)
#define NANf ((float)rtengine::RT_NAN)

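// Small helpers that sum or subtract several vectors at once; they keep the
// double-double routines further down readable.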
static INLINE vdouble vadd3(vdouble v0, vdouble v1, vdouble v2) {
    return vadd(vadd(v0, v1), v2);
}

static INLINE vdouble vadd4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
    return vadd3(vadd(v0, v1), v2, v3);
}

static INLINE vdouble vadd5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
    return vadd4(vadd(v0, v1), v2, v3, v4);
}

static INLINE vdouble vadd6(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
    return vadd5(vadd(v0, v1), v2, v3, v4, v5);
}

static INLINE vdouble vadd7(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) {
    return vadd6(vadd(v0, v1), v2, v3, v4, v5, v6);
}

static INLINE vdouble vsub3(vdouble v0, vdouble v1, vdouble v2) {
    return vsub(vsub(v0, v1), v2);
}

static INLINE vdouble vsub4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
    return vsub3(vsub(v0, v1), v2, v3);
}

static INLINE vdouble vsub5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
    return vsub4(vsub(v0, v1), v2, v3, v4);
}

//

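// The vdouble2 routines implement double-double arithmetic: a value is kept
// as an unevaluated sum x + y, where y is a small correction term, roughly
// doubling the effective precision. normalize_d re-establishes that invariant.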
static INLINE vdouble2 normalize_d(vdouble2 t) {
    vdouble2 s;

    s.x = vadd(t.x, t.y);
    s.y = vadd(vsub(t.x, s.x), t.y);

    return s;
}

static INLINE vdouble2 scale_d(vdouble2 d, vdouble s) {
    vdouble2 r = {vmul(d.x, s), vmul(d.y, s)};
    return r;
}

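// add_ss is an error-free addition (Fast2Sum): r.x is the rounded sum and r.y
// the exact rounding error, assuming |x| >= |y|; the add2_* variants use the
// branch-free TwoSum and need no ordering assumption.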
static INLINE vdouble2 add_ss(vdouble x, vdouble y) {
    vdouble2 r;

    r.x = vadd(x, y);
    r.y = vadd(vsub(x, r.x), y);

    return r;
}

static INLINE vdouble2 add2_ss(vdouble x, vdouble y) {
    vdouble2 r;

    r.x = vadd(x, y);
    vdouble v = vsub(r.x, x);
    r.y = vadd(vsub(x, vsub(r.x, v)), vsub(y, v));

    return r;
}

static INLINE vdouble2 add_ds(vdouble2 x, vdouble y) {
    vdouble2 r;

    r.x = vadd(x.x, y);
    r.y = vadd3(vsub(x.x, r.x), y, x.y);

    return r;
}

static INLINE vdouble2 add2_ds(vdouble2 x, vdouble y) {
    vdouble2 r;

    r.x = vadd(x.x, y);
    vdouble v = vsub(r.x, x.x);
    r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y, v));
    r.y = vadd(r.y, x.y);

    return r;
}

static INLINE vdouble2 add_sd(vdouble x, vdouble2 y) {
    vdouble2 r;

    r.x = vadd(x, y.x);
    r.y = vadd3(vsub(x, r.x), y.x, y.y);

    return r;
}

static INLINE vdouble2 add_dd(vdouble2 x, vdouble2 y) {
    // |x| >= |y|

    vdouble2 r;

    r.x = vadd(x.x, y.x);
    r.y = vadd4(vsub(x.x, r.x), y.x, x.y, y.y);

    return r;
}

static INLINE vdouble2 add2_dd(vdouble2 x, vdouble2 y) {
    vdouble2 r;

    r.x = vadd(x.x, y.x);
    vdouble v = vsub(r.x, x.x);
    r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y.x, v));
    r.y = vadd(r.y, vadd(x.y, y.y));

    return r;
}

static INLINE vdouble2 div_dd(vdouble2 n, vdouble2 d) {
    vdouble t = vrec(d.x);
    vdouble dh = vupper(d.x), dl = vsub(d.x, dh);
    vdouble th = vupper(t  ), tl = vsub(t  , th);
    vdouble nhh = vupper(n.x), nhl = vsub(n.x, nhh);

    vdouble2 q;

    q.x = vmul(n.x, t);

    vdouble u = vadd5(vsub(vmul(nhh, th), q.x), vmul(nhh, tl), vmul(nhl, th), vmul(nhl, tl),
                      vmul(q.x, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl))));

    q.y = vadd(vmul(t, vsub(n.y, vmul(q.x, d.y))), u);

    return q;
}

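// mul_ss computes an exact product: vupper() masks off the low mantissa bits,
// so the partial products of the split halves are exact (Dekker/Veltkamp
// splitting), and r.y collects the rounding error of r.x.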
static INLINE vdouble2 mul_ss(vdouble x, vdouble y) {
    vdouble xh = vupper(x), xl = vsub(x, xh);
    vdouble yh = vupper(y), yl = vsub(y, yh);
    vdouble2 r;

    r.x = vmul(x, y);
    r.y = vadd5(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl));

    return r;
}

static INLINE vdouble2 mul_ds(vdouble2 x, vdouble y) {
    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
    vdouble yh = vupper(y  ), yl = vsub(y  , yh);
    vdouble2 r;

    r.x = vmul(x.x, y);
    r.y = vadd6(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.y, y));

    return r;
}

static INLINE vdouble2 mul_dd(vdouble2 x, vdouble2 y) {
    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
    vdouble yh = vupper(y.x), yl = vsub(y.x, yh);
    vdouble2 r;

    r.x = vmul(x.x, y.x);
    r.y = vadd7(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.x, y.y), vmul(x.y, y.x));

    return r;
}

static INLINE vdouble2 squ_d(vdouble2 x) {
    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
    vdouble2 r;

    r.x = vmul(x.x, x.x);
    r.y = vadd5(vmul(xh, xh), vneg(r.x), vmul(vadd(xh, xh), xl), vmul(xl, xl), vmul(x.x, vadd(x.y, x.y)));

    return r;
}

static INLINE vdouble2 rec_s(vdouble d) {
    vdouble t = vrec(d);
    vdouble dh = vupper(d), dl = vsub(d, dh);
    vdouble th = vupper(t), tl = vsub(t, th);
    vdouble2 q;

    q.x = t;
    q.y = vmul(t, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl)));

    return q;
}

static INLINE vdouble2 sqrt_d(vdouble2 d) {
    vdouble t = vsqrt(vadd(d.x, d.y));
    return scale_d(mul_dd(add2_dd(d, mul_ss(t, t)), rec_s(t)), vcast_vd_d(0.5));
}

//

static INLINE vdouble xldexp(vdouble x, vint q) { return vldexp(x, q); }

static INLINE vint xilogb(vdouble d) {
    vdouble e = vcast_vd_vi(vsubi(vilogbp1(vabs(d)), vcast_vi_i(1)));
    e = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-2147483648.0), e);
    e = vsel(vmask_eq(vabs(d), vcast_vd_d(rtengine::RT_INFINITY)), vcast_vd_d(2147483647), e);
    return vrint_vi_vd(e);
}

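// The trig functions reduce the argument with the PI4_* constants (subtracting
// q times pi or pi/2 in several exact steps) and then evaluate a minimax
// polynomial on the reduced range; xsincos shares one reduction for both
// results.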
static INLINE vdouble xsin(vdouble d) {
    vint q;
    vdouble u, s;

    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_1_PI)));

    u = vcast_vd_vi(q);
    d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*4)));
    d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*4)));
    d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*4)));

    s = vmul(d, d);

    d = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vneg(d), d);

    u = vcast_vd_d(-7.97255955009037868891952e-18);
    u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
    u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
    u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
    u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
    u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
    u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
    u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
    u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));

    u = vmla(s, vmul(u, d), d);

    return u;
}

static INLINE vdouble xcos(vdouble d) {
    vint q;
    vdouble u, s;

    q = vrint_vi_vd(vsub(vmul(d, vcast_vd_d(rtengine::RT_1_PI)), vcast_vd_d(0.5)));
    q = vaddi(vaddi(q, q), vcast_vi_i(1));

    u = vcast_vd_vi(q);
    d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2)));
    d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*2)));
    d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*2)));

    s = vmul(d, d);

    d = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(0)), vneg(d), d);

    u = vcast_vd_d(-7.97255955009037868891952e-18);
    u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
    u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
    u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
    u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
    u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
    u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
    u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
    u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));

    u = vmla(s, vmul(u, d), d);

    return u;
}

static INLINE vdouble2 xsincos(vdouble d) {
    vint q;
    vmask m;
    vdouble u, s, t, rx, ry;
    vdouble2 r;

    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_2_PI)));

    s = d;

    u = vcast_vd_vi(q);
    s = vmla(u, vcast_vd_d(-PI4_A*2), s);
    s = vmla(u, vcast_vd_d(-PI4_B*2), s);
    s = vmla(u, vcast_vd_d(-PI4_C*2), s);

    t = s;

    s = vmul(s, s);

    u = vcast_vd_d(1.58938307283228937328511e-10);
    u = vmla(u, s, vcast_vd_d(-2.50506943502539773349318e-08));
    u = vmla(u, s, vcast_vd_d(2.75573131776846360512547e-06));
    u = vmla(u, s, vcast_vd_d(-0.000198412698278911770864914));
    u = vmla(u, s, vcast_vd_d(0.0083333333333191845961746));
    u = vmla(u, s, vcast_vd_d(-0.166666666666666130709393));
    u = vmul(vmul(u, s), t);

    rx = vadd(t, u);

    u = vcast_vd_d(-1.13615350239097429531523e-11);
    u = vmla(u, s, vcast_vd_d(2.08757471207040055479366e-09));
    u = vmla(u, s, vcast_vd_d(-2.75573144028847567498567e-07));
    u = vmla(u, s, vcast_vd_d(2.48015872890001867311915e-05));
    u = vmla(u, s, vcast_vd_d(-0.00138888888888714019282329));
    u = vmla(u, s, vcast_vd_d(0.0416666666666665519592062));
    u = vmla(u, s, vcast_vd_d(-0.5));

    ry = vadd(vcast_vd_d(1), vmul(s, u));

    m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(0));
    r.x = vsel(m, rx, ry);
    r.y = vsel(m, ry, rx);

    m = vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2));
    r.x = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x)));

    m = vmaski_eq(vandi(vaddi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2));
    r.y = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y)));

    m = vmask_isinf(d);
    r.x = vsel(m, vcast_vd_d(rtengine::RT_NAN), r.x);
    r.y = vsel(m, vcast_vd_d(rtengine::RT_NAN), r.y);

    return r;
}

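// Example (sketch): vdouble2 sc = xsincos(d);  // sc.x = sin(d), sc.y = cos(d)
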
static INLINE vdouble xtan(vdouble d) {
    vint q;
    vdouble u, s, x;
    vmask m;

    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_2_PI)));

    u = vcast_vd_vi(q);
    x = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2)));
    x = vadd(x, vmul(u, vcast_vd_d(-PI4_B*2)));
    x = vadd(x, vmul(u, vcast_vd_d(-PI4_C*2)));

    s = vmul(x, x);

    m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1));
    x = vsel(m, vneg(x), x);

    u = vcast_vd_d(1.01419718511083373224408e-05);
    u = vmla(u, s, vcast_vd_d(-2.59519791585924697698614e-05));
    u = vmla(u, s, vcast_vd_d(5.23388081915899855325186e-05));
    u = vmla(u, s, vcast_vd_d(-3.05033014433946488225616e-05));
    u = vmla(u, s, vcast_vd_d(7.14707504084242744267497e-05));
    u = vmla(u, s, vcast_vd_d(8.09674518280159187045078e-05));
    u = vmla(u, s, vcast_vd_d(0.000244884931879331847054404));
    u = vmla(u, s, vcast_vd_d(0.000588505168743587154904506));
    u = vmla(u, s, vcast_vd_d(0.00145612788922812427978848));
    u = vmla(u, s, vcast_vd_d(0.00359208743836906619142924));
    u = vmla(u, s, vcast_vd_d(0.00886323944362401618113356));
    u = vmla(u, s, vcast_vd_d(0.0218694882853846389592078));
    u = vmla(u, s, vcast_vd_d(0.0539682539781298417636002));
    u = vmla(u, s, vcast_vd_d(0.133333333333125941821962));
    u = vmla(u, s, vcast_vd_d(0.333333333333334980164153));

    u = vmla(s, vmul(u, x), x);

    u = vsel(m, vrec(u), u);

    u = vsel(vmask_isinf(d), vcast_vd_d(rtengine::RT_NAN), u);

    return u;
}

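// atan2 kernel: reduces (y, x) to one octant, counts quarter turns in q, and
// evaluates a polynomial for atan on [0, 1]; the result is the polynomial
// value plus q*pi/2. Special cases (0, inf, NaN) are handled by the callers.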
static INLINE vdouble atan2k(vdouble y, vdouble x) {
    vdouble s, t, u;
    vint q;
    vmask p;

    q = vseli_lt(x, vcast_vd_d(0), vcast_vi_i(-2), vcast_vi_i(0));
    x = vabs(x);

    q = vseli_lt(x, y, vaddi(q, vcast_vi_i(1)), q);
    p = vmask_lt(x, y);
    s = vsel(p, vneg(x), y);
    t = vmax(x, y);

    s = vdiv(s, t);
    t = vmul(s, s);

    u = vcast_vd_d(-1.88796008463073496563746e-05);
    u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
    u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
    u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
    u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
    u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
    u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
    u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
    u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
    u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
    u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
    u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
    u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
    u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
    u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
    u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
    u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
    u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
    u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));

    t = vadd(s, vmul(s, vmul(t, u)));
    t = vadd(t, vmul(vcast_vd_vi(q), vcast_vd_d(rtengine::RT_PI/2)));

    return t;
}

static INLINE vdouble xatan2(vdouble y, vdouble x) {
    vdouble r = atan2k(vabs(y), x);

    r = vmulsign(r, x);
    r = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))), vsub(vcast_vd_d(rtengine::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(rtengine::RT_PI/2), x))), r);
    r = vsel(vmask_isinf(y), vsub(vcast_vd_d(rtengine::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(rtengine::RT_PI/4), x))), r);
    r = vsel(vmask_eq(y, vcast_vd_d(0)), vsel(vmask_eq(vsign(x), vcast_vd_d(-1.0)), vcast_vd_d(rtengine::RT_PI), vcast_vd_d(0)), r);

    return vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), vmulsign(r, y));
}

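// asin(d) = atan2(|d|, sqrt((1+d)(1-d))), with the sign of d restored at the
// end; acos uses the same identity with the arguments swapped.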
static INLINE vdouble xasin(vdouble d) {
    vdouble x, y;
    x = vadd(vcast_vd_d(1), d);
    y = vsub(vcast_vd_d(1), d);
    x = vmul(x, y);
    x = vsqrt(x);
    x = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), atan2k(vabs(d), x));
    return vmulsign(x, d);
}

static INLINE vdouble xacos(vdouble d) {
    vdouble x, y;
    x = vadd(vcast_vd_d(1), d);
    y = vsub(vcast_vd_d(1), d);
    x = vmul(x, y);
    x = vsqrt(x);
    x = vmulsign(atan2k(x, vabs(d)), d);
    y = (vdouble)vandm(vmask_lt(d, vcast_vd_d(0)), (vmask)vcast_vd_d(rtengine::RT_PI));
    x = vadd(x, y);
    return x;
}

static INLINE vdouble xatan(vdouble s) {
    vdouble t, u;
    vint q;

    q = vseli_lt(s, vcast_vd_d(0), vcast_vi_i(2), vcast_vi_i(0));
    s = vabs(s);

    q = vseli_lt(vcast_vd_d(1), s, vaddi(q, vcast_vi_i(1)), q);
    s = vsel(vmask_lt(vcast_vd_d(1), s), vdiv(vcast_vd_d(1), s), s);

    t = vmul(s, s);

    u = vcast_vd_d(-1.88796008463073496563746e-05);
    u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
    u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
    u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
    u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
    u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
    u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
    u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
    u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
    u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
    u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
    u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
    u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
    u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
    u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
    u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
    u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
    u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
    u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));

    t = vadd(s, vmul(s, vmul(t, u)));

    t = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vsub(vcast_vd_d(rtengine::RT_PI/2), t), t);
    t = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2)), vneg(t), t);

    return t;
}

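// log(d): write d = m * 2^e with m near 1 (the 0.7071 factor centers m in
// [sqrt(0.5), sqrt(2))), evaluate an atanh-style series in x = (m-1)/(m+1),
// and add e*ln(2).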
static INLINE vdouble xlog(vdouble d) {
    vdouble x, x2;
    vdouble t, m;
    vint e;

    e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
    m = vldexp(d, vsubi(vcast_vi_i(0), e));

    x = vdiv(vadd(vcast_vd_d(-1), m), vadd(vcast_vd_d(1), m));
    x2 = vmul(x, x);

    t = vcast_vd_d(0.148197055177935105296783);
    t = vmla(t, x2, vcast_vd_d(0.153108178020442575739679));
    t = vmla(t, x2, vcast_vd_d(0.181837339521549679055568));
    t = vmla(t, x2, vcast_vd_d(0.22222194152736701733275));
    t = vmla(t, x2, vcast_vd_d(0.285714288030134544449368));
    t = vmla(t, x2, vcast_vd_d(0.399999999989941956712869));
    t = vmla(t, x2, vcast_vd_d(0.666666666666685503450651));
    t = vmla(t, x2, vcast_vd_d(2));

    x = vadd(vmul(x, t), vmul(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e)));

    x = vsel(vmask_ispinf(d), vcast_vd_d(rtengine::RT_INFINITY), x);
    x = vsel(vmask_gt(vcast_vd_d(0), d), vcast_vd_d(rtengine::RT_NAN), x);
    x = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-rtengine::RT_INFINITY), x);

    return x;
}

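// exp(d): q = round(d/ln(2)); s = d - q*ln(2) using the split L2U/L2L so the
// subtraction is exact; polynomial for e^s; finally scale by 2^q via vldexp.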
static INLINE vdouble xexp(vdouble d) {
    vint q = vrint_vi_vd(vmul(d, vcast_vd_d(R_LN2)));
    vdouble s, u;

    s = vadd(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
    s = vadd(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));

    u = vcast_vd_d(2.08860621107283687536341e-09);
    u = vmla(u, s, vcast_vd_d(2.51112930892876518610661e-08));
    u = vmla(u, s, vcast_vd_d(2.75573911234900471893338e-07));
    u = vmla(u, s, vcast_vd_d(2.75572362911928827629423e-06));
    u = vmla(u, s, vcast_vd_d(2.4801587159235472998791e-05));
    u = vmla(u, s, vcast_vd_d(0.000198412698960509205564975));
    u = vmla(u, s, vcast_vd_d(0.00138888888889774492207962));
    u = vmla(u, s, vcast_vd_d(0.00833333333331652721664984));
    u = vmla(u, s, vcast_vd_d(0.0416666666666665047591422));
    u = vmla(u, s, vcast_vd_d(0.166666666666666851703837));
    u = vmla(u, s, vcast_vd_d(0.5));

    u = vadd(vcast_vd_d(1), vadd(s, vmul(vmul(s, s), u)));

    u = vldexp(u, q);

    u = vsel(vmask_isminf(d), vcast_vd_d(0), u);

    return u;
}

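// logk and expk are double-double versions of log and exp. They exist mainly
// so that xpow can compute x^y = expk(y * logk(x)) with nearly correct
// rounding.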
static INLINE vdouble2 logk(vdouble d) {
    vdouble2 x, x2;
    vdouble t, m;
    vint e;

    e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
    m = vldexp(d, vsubi(vcast_vi_i(0), e));

    x = div_dd(add2_ss(vcast_vd_d(-1), m), add2_ss(vcast_vd_d(1), m));
    x2 = squ_d(x);
    x2 = normalize_d(x2);

    t = vcast_vd_d(0.134601987501262130076155);
    t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
    t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
    t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
    t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
    t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
    t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
    t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));

    return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
                          vcast_vd_vi(e)),
                   add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
}

static INLINE vdouble expk(vdouble2 d) {
    vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
    vint q = vrint_vi_vd(u);
    vdouble2 s, t;

    s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
    s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));

    q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));

    s = normalize_d(s);

    u = vcast_vd_d(2.51069683420950419527139e-08);
    u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
    u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
    u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
    u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
    u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
    u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
    u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
    u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
    u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));

    t = add_dd(s, mul_ds(squ_d(s), u));

    t = add_sd(vcast_vd_d(1), t);
    u = vadd(t.x, t.y);
    u = vldexp(u, q);

    return u;
}

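// pow via expk(logk(|x|) * y); the remaining selects patch up the sign for
// odd integer y and the IEEE special cases (infinities, zero, NaN, y == 0,
// x == 1).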
static INLINE vdouble xpow(vdouble x, vdouble y) {
#if 1
    vmask yisint = vmask_eq(vcast_vd_vi(vrint_vi_vd(y)), y);
    vmask yisodd = vandm(vmaski_eq(vandi(vrint_vi_vd(y), vcast_vi_i(1)), vcast_vi_i(1)), yisint);

    vdouble result = expk(mul_ds(logk(vabs(x)), y));

    //result = vsel(vmask_isnan(result), vcast_vd_d(rtengine::RT_INFINITY), result);

    result = vmul(result,
                  vsel(vmask_gt(x, vcast_vd_d(0)),
                       vcast_vd_d(1),
                       vsel(yisint,
                            vsel(yisodd,
                                 vcast_vd_d(-1),
                                 vcast_vd_d(1)),
                            vcast_vd_d(rtengine::RT_NAN))));

    vdouble efx = vreinterpret_vd_vm(vxorm(vreinterpret_vm_vd(vsub(vabs(x), vcast_vd_d(1))), vsignbit(y)));

    result = vsel(vmask_isinf(y),
                  vsel(vmask_lt(efx, vcast_vd_d(0)),
                       vcast_vd_d(0),
                       vsel(vmask_eq(efx, vcast_vd_d(0)),
                            vcast_vd_d(1.0),
                            vcast_vd_d(rtengine::RT_INFINITY))),
                  result);

    result = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))),
                  vmul(vsel(yisodd, vsign(x), vcast_vd_d(1)),
                       vsel(vmask_lt(vsel(vmask_eq(x, vcast_vd_d(0)), vneg(y), y), vcast_vd_d(0)),
                            vcast_vd_d(0),
                            vcast_vd_d(rtengine::RT_INFINITY))),
                  result);

    result = vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), result);

    result = vsel(vorm(vmask_eq(y, vcast_vd_d(0)), vmask_eq(x, vcast_vd_d(1))), vcast_vd_d(1), result);

    return result;
#else
    return expk(mul_ds(logk(x), y));
#endif
}

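// expk2 is like expk but returns the result as a double-double. The hyperbolic
// functions below are built on it: sinh x = (e^x - e^-x)/2, cosh x =
// (e^x + e^-x)/2, tanh x = (e^x - e^-x)/(e^x + e^-x).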
static INLINE vdouble2 expk2(vdouble2 d) {
    vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
    vint q = vrint_vi_vd(u);
    vdouble2 s, t;

    s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
    s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));

    q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));

    s = normalize_d(s);

    u = vcast_vd_d(2.51069683420950419527139e-08);
    u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
    u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
    u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
    u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
    u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
    u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
    u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
    u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
    u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));

    t = add_dd(s, mul_ds(squ_d(s), u));

    t = add_sd(vcast_vd_d(1), t);

    return dd(vldexp(t.x, q), vldexp(t.y, q));
}

static INLINE vdouble xsinh(vdouble x) {
    vdouble y = vabs(x);
    vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
    d = add2_dd(d, div_dd(dd(vcast_vd_d(-1), vcast_vd_d(0)), d));
    y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));

    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
    y = vmulsign(y, x);
    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);

    return y;
}

static INLINE vdouble xcosh(vdouble x) {
    vdouble2 d = expk2(dd(x, vcast_vd_d(0)));
    d = add2_dd(d, div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d));
    vdouble y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));

    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);

    return y;
}

static INLINE vdouble xtanh(vdouble x) {
    vdouble y = vabs(x);
    vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
    vdouble2 e = div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d);
    d = div_dd(add2_dd(d, scale_d(e, vcast_vd_d(-1))), add2_dd(d, e));
    y = d.x + d.y;

    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(1.0), y);
    y = vmulsign(y, x);
    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);

    return y;
}

static INLINE vdouble2 logk2(vdouble2 d) {
    vdouble2 x, x2, m;
    vdouble t;
    vint e;

    d = normalize_d(d);
    e = vilogbp1(vmul(d.x, vcast_vd_d(0.7071)));
    m = scale_d(d, vldexp(vcast_vd_d(1), vsubi(vcast_vi_i(0), e)));

    x = div_dd(add2_ds(m, vcast_vd_d(-1)), add2_ds(m, vcast_vd_d(1)));
    x2 = squ_d(x);
    x2 = normalize_d(x2);

    t = vcast_vd_d(0.134601987501262130076155);
    t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
    t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
    t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
    t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
    t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
    t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
    t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));

    return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
                          vcast_vd_vi(e)),
                   add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
}

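// Inverse hyperbolics via their log identities, evaluated in double-double:
// asinh x = log(x + sqrt(x^2 + 1)), acosh x = log(x + sqrt(x^2 - 1)),
// atanh x = log((1 + x)/(1 - x))/2.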
static INLINE vdouble xasinh(vdouble x) {
    vdouble y = vabs(x);
    vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(y, y), vcast_vd_d(1))), y));
    y = vadd(d.x, d.y);

    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
    y = vmulsign(y, x);
    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);

    return y;
}

static INLINE vdouble xacosh(vdouble x) {
    vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(x, x), vcast_vd_d(-1))), x));
    vdouble y = vadd(d.x, d.y);

    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
    y = vsel(vmask_eq(x, vcast_vd_d(1.0)), vcast_vd_d(0.0), y);
    y = vsel(vmask_lt(x, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_NAN), y);
    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);

    return y;
}

static INLINE vdouble xatanh(vdouble x) {
    vdouble y = vabs(x);
    vdouble2 d = logk2(div_dd(add2_ss(vcast_vd_d(1), y), add2_ss(vcast_vd_d(1), -y)));
    y = vsel(vmask_gt(y, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_NAN), vsel(vmask_eq(y, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_INFINITY), vmul(vadd(d.x, d.y), vcast_vd_d(0.5))));

    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), y);
    y = vmulsign(y, x);
    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);

    return y;
}

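// cbrt: factor out the exponent (d = m * 2^e), fold e mod 3 into a constant
// correction factor (cbrt(2) or cbrt(4)), estimate the reciprocal cube root
// with a polynomial, refine with a Newton step, and assemble the result.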
static INLINE vdouble xcbrt(vdouble d) {
    vdouble x, y, q = vcast_vd_d(1.0);
    vint e, qu, re;
    vdouble t;

    e = vilogbp1(vabs(d));
    d = vldexp(d, vsubi(vcast_vi_i(0), e));

    t = vadd(vcast_vd_vi(e), vcast_vd_d(6144));
    qu = vtruncate_vi_vd(vdiv(t, vcast_vd_d(3)));
    re = vtruncate_vi_vd(vsub(t, vmul(vcast_vd_vi(qu), vcast_vd_d(3))));

    q = vsel(vmaski_eq(re, vcast_vi_i(1)), vcast_vd_d(1.2599210498948731647672106), q);
    q = vsel(vmaski_eq(re, vcast_vi_i(2)), vcast_vd_d(1.5874010519681994747517056), q);
    q = vldexp(q, vsubi(qu, vcast_vi_i(2048)));

    q = vmulsign(q, d);

    d = vabs(d);

    x = vcast_vd_d(-0.640245898480692909870982);
    x = vmla(x, d, vcast_vd_d(2.96155103020039511818595));
    x = vmla(x, d, vcast_vd_d(-5.73353060922947843636166));
    x = vmla(x, d, vcast_vd_d(6.03990368989458747961407));
    x = vmla(x, d, vcast_vd_d(-3.85841935510444988821632));
    x = vmla(x, d, vcast_vd_d(2.2307275302496609725722));

    y = vmul(x, x); y = vmul(y, y); x = vsub(x, vmul(vmla(d, y, vneg(x)), vcast_vd_d(1.0 / 3.0)));
    y = vmul(vmul(d, x), x);
    y = vmul(vsub(y, vmul(vmul(vcast_vd_d(2.0 / 3.0), y), vmla(y, x, vcast_vd_d(-1.0)))), q);

    return y;
}

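// exp2 and exp10 reuse expk with ln(2) and ln(10) as double-double constants;
// expm1, log10 and log1p are built the same way on expk2, logk and logk2.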
static INLINE vdouble xexp2(vdouble a) {
    vdouble u = expk(mul_ds(dd(vcast_vd_d(0.69314718055994528623), vcast_vd_d(2.3190468138462995584e-17)), a));
    u = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), u);
    u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
    return u;
}

static INLINE vdouble xexp10(vdouble a) {
    vdouble u = expk(mul_ds(dd(vcast_vd_d(2.3025850929940459011), vcast_vd_d(-2.1707562233822493508e-16)), a));
    u = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), u);
    u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
    return u;
}

static INLINE vdouble xexpm1(vdouble a) {
    vdouble2 d = add2_ds(expk2(dd(a, vcast_vd_d(0))), vcast_vd_d(-1.0));
    vdouble x = d.x + d.y;
    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
    x = vsel(vmask_isminf(a), vcast_vd_d(-1), x);
    return x;
}

static INLINE vdouble xlog10(vdouble a) {
    vdouble2 d = mul_dd(logk(a), dd(vcast_vd_d(0.43429448190325176116), vcast_vd_d(6.6494347733425473126e-17)));
    vdouble x = d.x + d.y;

    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
    x = vsel(vmask_gt(vcast_vd_d(0), a), vcast_vd_d(rtengine::RT_NAN), x);
    x = vsel(vmask_eq(a, vcast_vd_d(0)), vcast_vd_d(-rtengine::RT_INFINITY), x);

    return x;
}

static INLINE vdouble xlog1p(vdouble a) {
    vdouble2 d = logk2(add2_ss(a, vcast_vd_d(1)));
    vdouble x = d.x + d.y;

    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
    x = vsel(vmask_gt(vcast_vd_d(-1), a), vcast_vd_d(rtengine::RT_NAN), x);
    x = vsel(vmask_eq(a, vcast_vd_d(-1)), vcast_vd_d(-rtengine::RT_INFINITY), x);

    return x;
}

//

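// Single-precision versions follow; vfloat2 pairs the two results of
// xsincosf like vdouble2 does for xsincos above.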
typedef struct {
    vfloat x, y;
} vfloat2;

static INLINE vfloat vabsf(vfloat f) { return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f); }
static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vcast_vf_f(-0.0f)); }

#ifdef __SSE4_1__
// only one instruction when using SSE4.1
static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
    return _mm_blendv_ps(y, x, (vfloat)mask);
}

static INLINE vint vselc(vmask mask, vint x, vint y) {
    return _mm_blendv_epi8(y, x, mask);
}

#else
// three instructions when using SSE2
static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
    return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
}

static INLINE vint vselc(vmask mask, vint x, vint y) {
    return vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
}
#endif
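
// Example (sketch): vfloat m = vself(vmaskf_gt(a, b), a, b);  // per-lane max(a, b)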

static INLINE vfloat vselfzero(vmask mask, vfloat x) {
    // returns value of x if corresponding mask bits are 1, else returns 0
    // faster than vself(mask, x, ZEROV)
    return _mm_and_ps((vfloat)mask, x);
}
static INLINE vfloat vselfnotzero(vmask mask, vfloat x) {
    // returns value of x if corresponding mask bits are 0, else returns 0
    // faster than vself(mask, ZEROV, x)
    return _mm_andnot_ps((vfloat)mask, x);
}

static INLINE vint vselizero(vmask mask, vint x) {
    // returns value of x if corresponding mask bits are 1, else returns 0
    // faster than vselc(mask, x, ZEROV)
    return _mm_and_si128(mask, x);
}
static INLINE vint vselinotzero(vmask mask, vint x) {
    // returns value of x if corresponding mask bits are 0, else returns 0
    // faster than vselc(mask, ZEROV, x)
    return _mm_andnot_si128(mask, x);
}

static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) {
    vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1));
    return vori2(vandi2(m2, x), vandnoti2(m2, y));
}

static INLINE vmask vsignbitf(vfloat f) {
    return vandm((vmask)f, (vmask)vcast_vf_f(-0.0f));
}

static INLINE vfloat vmulsignf(vfloat x, vfloat y) {
    return (vfloat)vxorm((vmask)x, vsignbitf(y));
}

static INLINE vfloat vsignf(vfloat f) {
    return (vfloat)vorm((vmask)vcast_vf_f(1.0f), vandm((vmask)vcast_vf_f(-0.0f), (vmask)f));
}

static INLINE vmask vmaskf_isinf(vfloat d) { return vmaskf_eq(vabsf(d), vcast_vf_f(INFINITYf)); }
static INLINE vmask vmaskf_ispinf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(INFINITYf)); }
static INLINE vmask vmaskf_isminf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(-INFINITYf)); }
static INLINE vmask vmaskf_isnan(vfloat d) { return vmaskf_neq(d, d); }
// the following is equivalent to vorm(vmaskf_isnan(a), vmaskf_isnan(b)), but faster
static INLINE vmask vmaskf_isnan(vfloat a, vfloat b) { return (vmask)_mm_cmpunord_ps(a, b); }
static INLINE vfloat visinf2f(vfloat d, vfloat m) { return (vfloat)vandm(vmaskf_isinf(d), vorm(vsignbitf(d), (vmask)m)); }
static INLINE vfloat visinff(vfloat d) { return visinf2f(d, vcast_vf_f(1.0f)); }

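// vilogbp1f returns the exponent of d biased by one (small inputs are
// pre-scaled so subnormals work); vldexpf computes x * 2^q in several steps
// so extreme q values cannot overflow the exponent field.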
static INLINE vint2 vilogbp1f(vfloat d) {
    vmask m = vmaskf_lt(d, vcast_vf_f(5.421010862427522E-20f));
    d = vself(m, vmulf(vcast_vf_f(1.8446744073709552E19f), d), d);
    vint2 q = vandi2(vsrli2(vcast_vi2_vm(vreinterpret_vm_vf(d)), 23), vcast_vi2_i(0xff));
    q = vsubi2(q, vseli2(m, vcast_vi2_i(64 + 0x7e), vcast_vi2_i(0x7e)));
    return q;
}

static INLINE vfloat vldexpf(vfloat x, vint2 q) {
    vfloat u;
    vint2 m = vsrai2(q, 31);
    m = vslli2(vsubi2(vsrai2(vaddi2(m, q), 6), m), 4);
    q = vsubi2(q, vslli2(m, 2));
    u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(m, vcast_vi2_i(0x7f)), 23)));
    x = vmulf(vmulf(vmulf(vmulf(x, u), u), u), u);
    u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(q, vcast_vi2_i(0x7f)), 23)));
    return vmulf(x, u);
}

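// Float variants of the trig functions: same Cody-Waite reduction, now with
// the four PI4_*f parts, and shorter polynomials tuned for float precision.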
static INLINE vfloat xsinf(vfloat d) {
    vint2 q;
    vfloat u, s;

    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)rtengine::RT_1_PI)));

    u = vcast_vf_vi2(q);
    d = vmlaf(u, vcast_vf_f(-PI4_Af*4), d);
    d = vmlaf(u, vcast_vf_f(-PI4_Bf*4), d);
    d = vmlaf(u, vcast_vf_f(-PI4_Cf*4), d);
    d = vmlaf(u, vcast_vf_f(-PI4_Df*4), d);

    s = vmulf(d, d);

    d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vnegf(d), d);

    u = vcast_vf_f(2.6083159809786593541503e-06f);
    u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
    u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
    u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));

    u = vmlaf(s, vmulf(u, d), d);

    return u;
}

static INLINE vfloat xcosf(vfloat d) {
    vint2 q;
    vfloat u, s;

    q = vrint_vi2_vf(vsubf(vmulf(d, vcast_vf_f((float)rtengine::RT_1_PI)), vcast_vf_f(0.5f)));
    q = vaddi2(vaddi2(q, q), vcast_vi2_i(1));

    u = vcast_vf_vi2(q);
    d = vmlaf(u, vcast_vf_f(-PI4_Af*2), d);
    d = vmlaf(u, vcast_vf_f(-PI4_Bf*2), d);
    d = vmlaf(u, vcast_vf_f(-PI4_Cf*2), d);
    d = vmlaf(u, vcast_vf_f(-PI4_Df*2), d);

    s = vmulf(d, d);

    d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), d, vnegf(d));

    u = vcast_vf_f(2.6083159809786593541503e-06f);
    u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
    u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
    u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));

    u = vmlaf(s, vmulf(u, d), d);

    return u;
}

static INLINE vfloat2 xsincosf(vfloat d) {
    vint2 q;
    vmask m;
    vfloat u, s, t, rx, ry;
    vfloat2 r;

    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)rtengine::RT_2_PI)));

    s = d;

    u = vcast_vf_vi2(q);
    s = vmlaf(u, vcast_vf_f(-PI4_Af*2), s);
    s = vmlaf(u, vcast_vf_f(-PI4_Bf*2), s);
    s = vmlaf(u, vcast_vf_f(-PI4_Cf*2), s);
    s = vmlaf(u, vcast_vf_f(-PI4_Df*2), s);

    t = s;

    s = vmulf(s, s);

    u = vcast_vf_f(-0.000195169282960705459117889f);
    u = vmlaf(u, s, vcast_vf_f(0.00833215750753879547119141f));
    u = vmlaf(u, s, vcast_vf_f(-0.166666537523269653320312f));
    u = vmulf(vmulf(u, s), t);

    rx = vaddf(t, u);

    u = vcast_vf_f(-2.71811842367242206819355e-07f);
    u = vmlaf(u, s, vcast_vf_f(2.47990446951007470488548e-05f));
    u = vmlaf(u, s, vcast_vf_f(-0.00138888787478208541870117f));
    u = vmlaf(u, s, vcast_vf_f(0.0416666641831398010253906f));
    u = vmlaf(u, s, vcast_vf_f(-0.5));

    ry = vaddf(vcast_vf_f(1), vmulf(s, u));

    m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
    r.x = vself(m, rx, ry);
    r.y = vself(m, ry, rx);

    m = vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
    r.x = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.x)));

    m = vmaski2_eq(vandi2(vaddi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
    r.y = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y)));

    m = vmaskf_isinf(d);
    r.x = vself(m, vcast_vf_f(rtengine::RT_NAN), r.x);
    r.y = vself(m, vcast_vf_f(rtengine::RT_NAN), r.y);

    return r;
}

static INLINE vfloat xtanf(vfloat d) {
    vint2 q;
    vmask m;
    vfloat u, s, x;

    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)(2 * rtengine::RT_1_PI))));

    x = d;

    u = vcast_vf_vi2(q);
    x = vmlaf(u, vcast_vf_f(-PI4_Af*2), x);
    x = vmlaf(u, vcast_vf_f(-PI4_Bf*2), x);
    x = vmlaf(u, vcast_vf_f(-PI4_Cf*2), x);
    x = vmlaf(u, vcast_vf_f(-PI4_Df*2), x);

    s = vmulf(x, x);

    m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
    x = vself(m, vnegf(x), x);

    u = vcast_vf_f(0.00927245803177356719970703f);
    u = vmlaf(u, s, vcast_vf_f(0.00331984995864331722259521f));
    u = vmlaf(u, s, vcast_vf_f(0.0242998078465461730957031f));
    u = vmlaf(u, s, vcast_vf_f(0.0534495301544666290283203f));
    u = vmlaf(u, s, vcast_vf_f(0.133383005857467651367188f));
    u = vmlaf(u, s, vcast_vf_f(0.333331853151321411132812f));

    u = vmlaf(s, vmulf(u, x), x);

    u = vself(m, vrecf(u), u);

    u = vself(vmaskf_isinf(d), vcast_vf_f(NANf), u);

    return u;
}

static INLINE vfloat xatanf(vfloat s) {
    vfloat t, u;
    vint2 q;

    q = vseli2_lt(s, vcast_vf_f(0.0f), vcast_vi2_i(2), vcast_vi2_i(0));
    s = vabsf(s);

    q = vseli2_lt(vcast_vf_f(1.0f), s, vaddi2(q, vcast_vi2_i(1)), q);
    s = vself(vmaskf_lt(vcast_vf_f(1.0f), s), vdivf(vcast_vf_f(1.0f), s), s);

    t = vmulf(s, s);

    u = vcast_vf_f(0.00282363896258175373077393f);
    u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
    u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
    u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
    u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
    u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
    u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
    u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));

    t = vaddf(s, vmulf(s, vmulf(t, u)));

    t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), t), t);
    t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), vnegf(t), t);

    return t;
}

static INLINE vfloat atan2kf(vfloat y, vfloat x) {
    vfloat s, t, u;
    vint2 q;
    vmask p;

    q = vseli2_lt(x, vcast_vf_f(0.0f), vcast_vi2_i(-2), vcast_vi2_i(0));
    x = vabsf(x);

    q = vseli2_lt(x, y, vaddi2(q, vcast_vi2_i(1)), q);
    p = vmaskf_lt(x, y);
    s = vself(p, vnegf(x), y);
    t = vmaxf(x, y);

    s = vdivf(s, t);
    t = vmulf(s, s);

    u = vcast_vf_f(0.00282363896258175373077393f);
    u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
    u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
    u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
    u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
    u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
    u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
    u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));

    t = vaddf(s, vmulf(s, vmulf(t, u)));
    t = vaddf(t, vmulf(vcast_vf_vi2(q), vcast_vf_f((float)(rtengine::RT_PI/2))));

    return t;
}

static INLINE vfloat xatan2f(vfloat y, vfloat x) {
    vfloat r = atan2kf(vabsf(y), x);

    r = vmulsignf(r, x);
    r = vself(vorm(vmaskf_isinf(x), vmaskf_eq(x, vcast_vf_f(0.0f))), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(rtengine::RT_PI/2)), x))), r);
    r = vself(vmaskf_isinf(y), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(rtengine::RT_PI/4)), x))), r);
    r = vself(vmaskf_eq(y, vcast_vf_f(0.0f)), vselfzero(vmaskf_eq(vsignf(x), vcast_vf_f(-1.0f)), vcast_vf_f((float)rtengine::RT_PI)), r);

    return vself(vmaskf_isnan(x, y), vcast_vf_f(NANf), vmulsignf(r, y));
}

static INLINE vfloat xasinf(vfloat d) {
    vfloat x, y;
    x = vaddf(vcast_vf_f(1.0f), d);
    y = vsubf(vcast_vf_f(1.0f), d);
    x = vmulf(x, y);
    x = vsqrtf(x);
    x = vself(vmaskf_isnan(x), vcast_vf_f(NANf), atan2kf(vabsf(d), x));
    return vmulsignf(x, d);
}

static INLINE vfloat xacosf(vfloat d) {
    vfloat x, y;
    x = vaddf(vcast_vf_f(1.0f), d);
    y = vsubf(vcast_vf_f(1.0f), d);
    x = vmulf(x, y);
    x = vsqrtf(x);
    x = vmulsignf(atan2kf(x, vabsf(d)), d);
    y = (vfloat)vandm(vmaskf_lt(d, vcast_vf_f(0.0f)), (vmask)vcast_vf_f((float)rtengine::RT_PI));
    x = vaddf(x, y);
    return x;
}

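// xlogf comes in several flavours: xlogf1 behaves like xlogf(vmaxf(d, 1.f)),
// xlogf0 returns 0 instead of NaN/-inf for out-of-range inputs, and the
// NoCheck versions skip the special-case selects entirely.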
static INLINE vfloat xlogf(vfloat d) {
    vfloat x, x2, t, m;
    vint2 e;

    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));

    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
    x2 = vmulf(x, x);

    t = vcast_vf_f(0.2371599674224853515625f);
    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
    t = vmlaf(t, x2, vcast_vf_f(2.0f));

    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));

    x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x);
    x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(NANf), x);
    x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(-INFINITYf), x);

    return x;
}

static INLINE vfloat xlogf1(vfloat d) { // does xlogf(vmaxf(d, 1.f)) but faster
    vfloat x, x2, t, m;
    vint2 e;

    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));

    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
    x2 = vmulf(x, x);

    t = vcast_vf_f(0.2371599674224853515625f);
    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
    t = vmlaf(t, x2, vcast_vf_f(2.0f));

    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));

    x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x);
    x = vselfnotzero(vmaskf_le(d, vcast_vf_f(1.f)), x);

    return x;
}

static INLINE vfloat xlogf0(vfloat d) {
    vfloat x, x2, t, m;
    vint2 e;

    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));

    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
    x2 = vmulf(x, x);

    t = vcast_vf_f(0.2371599674224853515625f);
    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
    t = vmlaf(t, x2, vcast_vf_f(2.0f));

    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));

    x = vself(vmaskf_ispinf(d), vcast_vf_f(0), x);
    x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(0), x);
    x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(0), x);

    return x;
}

static INLINE vfloat xlogfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > 0 e.g. when filling a lookup table
    vfloat x, x2, t, m;
    vint2 e;

    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));

    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
    x2 = vmulf(x, x);

    t = vcast_vf_f(0.2371599674224853515625f);
    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
    t = vmlaf(t, x2, vcast_vf_f(2.0f));

    return vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
}

static INLINE vfloat xexpf(vfloat d) {
    vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
    vfloat s, u;

    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf), d);
    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf), s);

    u = vcast_vf_f(0.00136324646882712841033936f);
    u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
    u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
    u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
    u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));

    u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));

    u = vldexpf(u, q);

    // expf underflows to zero below about -104, so return 0 there
    return vselfnotzero(vmaskf_gt(vcast_vf_f(-104.f), d), u);
}

static INLINE vfloat xexpfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > -104.f e.g. when filling a lookup table
    vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
    vfloat s, u;

    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf), d);
    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf), s);

    u = vcast_vf_f(0.00136324646882712841033936f);
    u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
    u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
    u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
    u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));

    u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));

    return vldexpf(u, q);
}

static INLINE vfloat xcbrtf(vfloat d) {
    vfloat x, y, q = vcast_vf_f(1.0), t;
    vint2 e, qu, re;

    e = vilogbp1f(vabsf(d));
    d = vldexpf(d, vsubi2(vcast_vi2_i(0), e));

    t = vaddf(vcast_vf_vi2(e), vcast_vf_f(6144));
    qu = vtruncate_vi2_vf(vdivf(t, vcast_vf_f(3)));
    re = vtruncate_vi2_vf(vsubf(t, vmulf(vcast_vf_vi2(qu), vcast_vf_f(3))));

    q = vself(vmaski2_eq(re, vcast_vi2_i(1)), vcast_vf_f(1.2599210498948731647672106f), q);
    q = vself(vmaski2_eq(re, vcast_vi2_i(2)), vcast_vf_f(1.5874010519681994747517056f), q);
    q = vldexpf(q, vsubi2(qu, vcast_vi2_i(2048)));

    q = vmulsignf(q, d);
    d = vabsf(d);

    x = vcast_vf_f(-0.601564466953277587890625f);
    x = vmlaf(x, d, vcast_vf_f(2.8208892345428466796875f));
    x = vmlaf(x, d, vcast_vf_f(-5.532182216644287109375f));
    x = vmlaf(x, d, vcast_vf_f(5.898262500762939453125f));
    x = vmlaf(x, d, vcast_vf_f(-3.8095417022705078125f));
    x = vmlaf(x, d, vcast_vf_f(2.2241256237030029296875f));

    y = vmulf(vmulf(d, x), x);
    y = vmulf(vsubf(y, vmulf(vmulf(vcast_vf_f(2.0f / 3.0f), y), vmlaf(y, x, vcast_vf_f(-1.0f)))), q);

    return y;
}

static INLINE vfloat vclampf(vfloat value, vfloat low, vfloat high) {
    // clamps value to [low;high], returns low if value is NaN
    return vmaxf(vminf(high, value), low);
}

static INLINE vfloat SQRV(vfloat a) {
    return a * a;
}

static inline void vswap(vmask condition, vfloat &a, vfloat &b) {
    // conditionally swaps the elements of two vfloats
    vfloat temp = vself(condition, a, b); // the values which fit the condition
    a = vself(condition, b, a);           // the values which fit the inverted condition
    b = temp;
}
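
// Example (sketch): per-lane sort so that a[i] <= b[i] in every element:
//   vswap(vmaskf_gt(a, b), a, b);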

static inline float vhadd(vfloat a) {
    // returns a[0] + a[1] + a[2] + a[3]
    a += _mm_movehl_ps(a, a);
    return _mm_cvtss_f32(_mm_add_ss(a, _mm_shuffle_ps(a, a, 1)));
}

static inline float vhmin(vfloat a) {
    // returns min(a[0], a[1], a[2], a[3])
    a = vminf(a, _mm_movehl_ps(a, a));
    return _mm_cvtss_f32(vminf(a, _mm_shuffle_ps(a, a, 1)));
}

static inline float vhmax(vfloat a) {
    // returns max(a[0], a[1], a[2], a[3])
    a = vmaxf(a, _mm_movehl_ps(a, a));
    return _mm_cvtss_f32(vmaxf(a, _mm_shuffle_ps(a, a, 1)));
}

static INLINE vfloat vmul2f(vfloat a) {
    // fastest way to multiply by 2
    return a + a;
}

static INLINE vfloat vintpf(vfloat a, vfloat b, vfloat c) {
    // calculates a * b + (1 - a) * c (interpolates between two values)
    // the following identities hold:
    // vintpf(a, b+x, c+x) = vintpf(a, b, c) + x
    // vintpf(a, b*x, c*x) = vintpf(a, b, c) * x
    return a * (b - c) + c;
}
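
// Example (sketch): vfloat mix = vintpf(vcast_vf_f(0.25f), b, c);  // 0.25f*b + 0.75f*c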

static INLINE vfloat vdup(vfloat a) {
    // returns { a[0],a[0],a[1],a[1] }
    return _mm_unpacklo_ps(a, a);
}

static INLINE vfloat vaddc2vfu(float &a)
{
    // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
    vfloat a1 = _mm_loadu_ps(&a);
    vfloat a2 = _mm_loadu_ps((&a) + 4);
    return _mm_shuffle_ps(a1, a2, _MM_SHUFFLE(2, 0, 2, 0)) + _mm_shuffle_ps(a1, a2, _MM_SHUFFLE(3, 1, 3, 1));
}

static INLINE vfloat vadivapb(vfloat a, vfloat b) {
    return a / (a + b);
}

static INLINE void vconvertrgbrgbrgbrgb2rrrrggggbbbb(const float * src, vfloat &rv, vfloat &gv, vfloat &bv) { // cool function name, isn't it ? :P
    // converts a sequence of 4 float RGB triplets to 3 red, green and blue quadruples
    rv = _mm_setr_ps(src[0], src[3], src[6], src[9]);
    gv = _mm_setr_ps(src[1], src[4], src[7], src[10]);
    bv = _mm_setr_ps(src[2], src[5], src[8], src[11]);
}
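
// Example (sketch), src must point to at least 12 floats (4 packed RGB pixels):
//   vfloat rv, gv, bv;
//   vconvertrgbrgbrgbrgb2rrrrggggbbbb(src, rv, gv, bv);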

#if defined( __SSE4_1__ ) && defined( __x86_64__ )
static INLINE vfloat vceilf(vfloat x) {
    return _mm_round_ps(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
}

#else

static INLINE vfloat vceilf(vfloat x) {
    __m128i zerov = _mm_setzero_si128();
    zerov = _mm_cmpeq_epi32(zerov, zerov);
    const vfloat onev = (vfloat)_mm_slli_epi32(_mm_srli_epi32(zerov, 25), 23); // create the vector 1.0f
    const vfloat xi = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
    return xi + _mm_and_ps(_mm_cmplt_ps(xi, x), onev);
}
#endif

#endif // __SSE2__
#endif // SLEEFSSEAVX