////////////////////////////////////////////////////////////////
//
//  This code was taken from http://shibatch.sourceforge.net/
//  Many thanks to the author of the original version: Naoki Shibata
//
//  This version contains modifications made by Ingo Weyrich
//
////////////////////////////////////////////////////////////////

#ifndef __SSE2__
#error Please specify -msse2.
#endif

#ifdef __GNUC__
#define INLINE __inline
#else
#define INLINE inline
#endif

#include <x86intrin.h>

#include <stdint.h>
#include <math.h> // for INFINITY, used by the vmask_isinf helpers below

typedef __m128d vdouble;
typedef __m128i vint;
typedef __m128i vmask;

typedef __m128 vfloat;
typedef __m128i vint2;

//
#ifdef __GNUC__
#if ((__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4) && (!defined(WIN32) || defined( __x86_64__ ))
#define LVF(x) _mm_load_ps((float*)&x)
#define LVFU(x) _mm_loadu_ps(&x)
#define STVF(x,y) _mm_store_ps(&x,y)
#define STVFU(x,y) _mm_storeu_ps(&x,y)
#define LVI(x) _mm_load_si128((__m128i*)&x)
#else // gcc 4.7.x has a bug when using OpenMP together with aligned memory and -O3; the aligned load/store macros also need to map to their unaligned counterparts for 32-bit WIN32 builds
#define LVF(x) _mm_loadu_ps((float*)&x)
#define LVFU(x) _mm_loadu_ps(&x)
#define STVF(x,y) _mm_storeu_ps(&x,y)
#define STVFU(x,y) _mm_storeu_ps(&x,y)
#define LVI(x) _mm_loadu_si128((__m128i*)&x)
#endif
#else
#define LVF(x) _mm_load_ps((float*)&x)
#define LVFU(x) _mm_loadu_ps(&x)
#define STVF(x,y) _mm_store_ps(&x,y)
#define STVFU(x,y) _mm_storeu_ps(&x,y)
#define LVI(x) _mm_load_si128((__m128i*)&x)
#endif

#if defined(__x86_64__) && defined(__AVX__)
#define PERMUTEPS(a,mask) _mm_permute_ps(a,mask)
#else
#define PERMUTEPS(a,mask) _mm_shuffle_ps(a,a,mask)
#endif

static INLINE vfloat LC2VFU(float &a)
{
    // Load 8 floats from a and combine a[0], a[2], a[4] and a[6] into a vector of 4 floats
    vfloat a1 = _mm_loadu_ps( &a );
    vfloat a2 = _mm_loadu_ps( (&a) + 4 );
    return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 ));
}
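
// Illustrative usage sketch (not part of the original header; the function name and
// scale factor are made up for this example): gather the even-indexed elements
// a[0], a[2], a[4], a[6] and scale them. Assumes at least 8 valid floats starting at a.
static INLINE vfloat scaleEvenElementsExample(float &a, float factor)
{
    return _mm_mul_ps(LC2VFU(a), _mm_set1_ps(factor));
}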


// Store a vector of 4 floats in a[0], a[2], a[4] and a[6]
#if defined(__x86_64__) && defined(__SSE4_1__)
// SSE4.1 => use _mm_blend_ps instead of _mm_set_epi32 and vself
#define STC2VFU(a,v) {\
                         __m128 TST1V = _mm_loadu_ps(&a);\
                         __m128 TST2V = _mm_unpacklo_ps(v,v);\
                         _mm_storeu_ps(&a, _mm_blend_ps(TST1V,TST2V,5));\
                         TST1V = _mm_loadu_ps((&a)+4);\
                         TST2V = _mm_unpackhi_ps(v,v);\
                         _mm_storeu_ps((&a)+4, _mm_blend_ps(TST1V,TST2V,5));\
                     }
#else
#define STC2VFU(a,v) {\
                         __m128 TST1V = _mm_loadu_ps(&a);\
                         __m128 TST2V = _mm_unpacklo_ps(v,v);\
                         vmask cmask = _mm_set_epi32(0xffffffff,0,0xffffffff,0);\
                         _mm_storeu_ps(&a, vself(cmask,TST1V,TST2V));\
                         TST1V = _mm_loadu_ps((&a)+4);\
                         TST2V = _mm_unpackhi_ps(v,v);\
                         _mm_storeu_ps((&a)+4, vself(cmask,TST1V,TST2V));\
                     }
#endif
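
// Illustrative usage sketch (kept as a comment because the non-SSE4.1 path of STC2VFU
// relies on vself, which is defined elsewhere; buffer name and loop bounds are
// assumptions for this example): halve the even-indexed elements of a float buffer
// in place while leaving the odd-indexed ones untouched.
//
//     for (int i = 0; i + 7 < n; i += 8) {
//         vfloat evens = LC2VFU(buf[i]);                         // buf[i], buf[i+2], buf[i+4], buf[i+6]
//         STC2VFU(buf[i], _mm_mul_ps(evens, _mm_set1_ps(0.5f))); // store back to the same even slots
//     }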

#define ZEROV _mm_setzero_ps()
#define F2V(a) _mm_set1_ps((a))

static INLINE vint vrint_vi_vd(vdouble vd)
{
    return _mm_cvtpd_epi32(vd);
}
static INLINE vint vtruncate_vi_vd(vdouble vd)
{
    return _mm_cvttpd_epi32(vd);
}
static INLINE vdouble vcast_vd_vi(vint vi)
{
    return _mm_cvtepi32_pd(vi);
}
static INLINE vdouble vcast_vd_d(double d)
{
    return _mm_set_pd(d, d);
}
static INLINE vint vcast_vi_i(int i)
{
    return _mm_set_epi32(0, 0, i, i);
}

static INLINE vmask vreinterpret_vm_vd(vdouble vd)
{
    return (__m128i)vd;
}
static INLINE vdouble vreinterpret_vd_vm(vint vm)
{
    return (__m128d)vm;
}

static INLINE vmask vreinterpret_vm_vf(vfloat vf)
{
    return (__m128i)vf;
}
static INLINE vfloat vreinterpret_vf_vm(vmask vm)
{
    return (__m128)vm;
}

//

static INLINE vfloat vcast_vf_f(float f)
{
    return _mm_set_ps(f, f, f, f);
}

// Don't use intrinsics here. Newer gcc versions (>= 4.9, maybe also earlier ones) generate better code when plain operators are used instead of intrinsics.
// Example: vaddf(vmulf(a,b),c) will only be contracted to an FMA instruction, when built for chips with that feature, if vaddf and vmulf don't use intrinsics.
static INLINE vfloat vaddf(vfloat x, vfloat y)
{
    return x + y;
}
static INLINE vfloat vsubf(vfloat x, vfloat y)
{
    return x - y;
}
static INLINE vfloat vmulf(vfloat x, vfloat y)
{
    return x * y;
}
static INLINE vfloat vdivf(vfloat x, vfloat y)
{
    return x / y;
}
// Also don't use an intrinsic here: some chips support FMA instructions with 3 or 4 operands.
// 3 operands: a = a*b+c, b = a*b+c, c = a*b+c // the destination has to be one of a,b,c
// 4 operands: d = a*b+c // the destination does not have to be one of a,b,c
// gcc picks whichever form fits best when intrinsics are not used; with intrinsics that is not possible.
static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) {
    return x * y + z;
}
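
// Illustrative sketch (not part of the original header): Horner evaluation of the
// polynomial c2*x^2 + c1*x + c0. Because vmlaf is written with plain operators, the
// compiler is free to contract each step into a single FMA instruction on targets
// that provide one. The function name and coefficients are made up for this example.
static INLINE vfloat evalPoly2Example(vfloat x, float c0, float c1, float c2)
{
    vfloat res = vcast_vf_f(c2);
    res = vmlaf(res, x, vcast_vf_f(c1)); // res = c2*x + c1          (one FMA candidate)
    res = vmlaf(res, x, vcast_vf_f(c0)); // res = (c2*x + c1)*x + c0 (another FMA candidate)
    return res;
}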
static INLINE vfloat vrecf(vfloat x)
{
    return vdivf(vcast_vf_f(1.0f), x);
}
static INLINE vfloat vsqrtf(vfloat x)
{
    return _mm_sqrt_ps(x);
}
static INLINE vfloat vmaxf(vfloat x, vfloat y)
{
    return _mm_max_ps(x, y);
}
static INLINE vfloat vminf(vfloat x, vfloat y)
{
    return _mm_min_ps(x, y);
}

//

static INLINE vdouble vadd(vdouble x, vdouble y)
{
    return _mm_add_pd(x, y);
}
static INLINE vdouble vsub(vdouble x, vdouble y)
{
    return _mm_sub_pd(x, y);
}
static INLINE vdouble vmul(vdouble x, vdouble y)
{
    return _mm_mul_pd(x, y);
}
static INLINE vdouble vdiv(vdouble x, vdouble y)
{
    return _mm_div_pd(x, y);
}
static INLINE vdouble vrec(vdouble x)
{
    return _mm_div_pd(_mm_set_pd(1, 1), x);
}
static INLINE vdouble vsqrt(vdouble x)
{
    return _mm_sqrt_pd(x);
}
static INLINE vdouble vmla(vdouble x, vdouble y, vdouble z)
{
    return vadd(vmul(x, y), z);
}

static INLINE vdouble vmax(vdouble x, vdouble y)
{
    return _mm_max_pd(x, y);
}
static INLINE vdouble vmin(vdouble x, vdouble y)
{
    return _mm_min_pd(x, y);
}

static INLINE vdouble vabs(vdouble d)
{
    return (__m128d)_mm_andnot_pd(_mm_set_pd(-0.0, -0.0), d);
}
static INLINE vdouble vneg(vdouble d)
{
    return (__m128d)_mm_xor_pd(_mm_set_pd(-0.0, -0.0), d);
}

//

static INLINE vint vaddi(vint x, vint y)
{
    return _mm_add_epi32(x, y);
}
static INLINE vint vsubi(vint x, vint y)
{
    return _mm_sub_epi32(x, y);
}

static INLINE vint vandi(vint x, vint y)
{
    return _mm_and_si128(x, y);
}
static INLINE vint vandnoti(vint x, vint y)
{
    return _mm_andnot_si128(x, y);
}
static INLINE vint vori(vint x, vint y)
{
    return _mm_or_si128(x, y);
}
static INLINE vint vxori(vint x, vint y)
{
    return _mm_xor_si128(x, y);
}

static INLINE vint vslli(vint x, int c)
{
    return _mm_slli_epi32(x, c);
}
static INLINE vint vsrli(vint x, int c)
{
    return _mm_srli_epi32(x, c);
}
static INLINE vint vsrai(vint x, int c)
{
    return _mm_srai_epi32(x, c);
}

//

static INLINE vmask vandm(vmask x, vmask y)
{
    return _mm_and_si128(x, y);
}
static INLINE vmask vandnotm(vmask x, vmask y)
{
    return _mm_andnot_si128(x, y);
}
static INLINE vmask vorm(vmask x, vmask y)
{
    return _mm_or_si128(x, y);
}
static INLINE vmask vxorm(vmask x, vmask y)
{
    return _mm_xor_si128(x, y);
}
static INLINE vmask vnotm(vmask x)
{
    return _mm_xor_si128(x, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
}

static INLINE vmask vmask_eq(vdouble x, vdouble y)
{
    return (__m128i)_mm_cmpeq_pd(x, y);
}
static INLINE vmask vmask_neq(vdouble x, vdouble y)
{
    return (__m128i)_mm_cmpneq_pd(x, y);
}
static INLINE vmask vmask_lt(vdouble x, vdouble y)
{
    return (__m128i)_mm_cmplt_pd(x, y);
}
static INLINE vmask vmask_le(vdouble x, vdouble y)
{
    return (__m128i)_mm_cmple_pd(x, y);
}
static INLINE vmask vmask_gt(vdouble x, vdouble y)
{
    return (__m128i)_mm_cmpgt_pd(x, y);
}
static INLINE vmask vmask_ge(vdouble x, vdouble y)
{
    return (__m128i)_mm_cmpge_pd(x, y);
}

static INLINE vmask vmaskf_eq(vfloat x, vfloat y)
{
    return (__m128i)_mm_cmpeq_ps(x, y);
}
static INLINE vmask vmaskf_neq(vfloat x, vfloat y)
{
    return (__m128i)_mm_cmpneq_ps(x, y);
}
static INLINE vmask vmaskf_lt(vfloat x, vfloat y)
{
    return (__m128i)_mm_cmplt_ps(x, y);
}
static INLINE vmask vmaskf_le(vfloat x, vfloat y)
{
    return (__m128i)_mm_cmple_ps(x, y);
}
static INLINE vmask vmaskf_gt(vfloat x, vfloat y)
{
    return (__m128i)_mm_cmpgt_ps(x, y);
}
static INLINE vmask vmaskf_ge(vfloat x, vfloat y)
{
    return (__m128i)_mm_cmpge_ps(x, y);
}


static INLINE vmask vmaski_eq(vint x, vint y)
{
    __m128 s = (__m128)_mm_cmpeq_epi32(x, y);
    return (__m128i)_mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 0, 0));
}

// Bitwise select: where mask is set take x, else y
static INLINE vdouble vsel(vmask mask, vdouble x, vdouble y)
{
    return (__m128d)vorm(vandm(mask, (__m128i)x), vandnotm(mask, (__m128i)y));
}

// Integer select on the two low lanes: where d0 < d1 take x, else y
static INLINE vint vseli_lt(vdouble d0, vdouble d1, vint x, vint y)
{
    vmask mask = (vmask)_mm_cmpeq_ps(_mm_cvtpd_ps((vdouble)vmask_lt(d0, d1)), _mm_set_ps(0, 0, 0, 0));
    return vori(vandnoti(mask, x), vandi(mask, y));
}

//

static INLINE vint2 vcast_vi2_vm(vmask vm)
{
    return (vint2)vm;
}
static INLINE vmask vcast_vm_vi2(vint2 vi)
{
    return (vmask)vi;
}

static INLINE vint2 vrint_vi2_vf(vfloat vf)
{
    return _mm_cvtps_epi32(vf);
}
static INLINE vint2 vtruncate_vi2_vf(vfloat vf)
{
    return _mm_cvttps_epi32(vf);
}
static INLINE vfloat vcast_vf_vi2(vint2 vi)
{
    return _mm_cvtepi32_ps(vcast_vm_vi2(vi));
}
static INLINE vint2 vcast_vi2_i(int i)
{
    return _mm_set_epi32(i, i, i, i);
}

static INLINE vint2 vaddi2(vint2 x, vint2 y)
{
    return vaddi(x, y);
}
static INLINE vint2 vsubi2(vint2 x, vint2 y)
{
    return vsubi(x, y);
}

static INLINE vint2 vandi2(vint2 x, vint2 y)
{
    return vandi(x, y);
}
static INLINE vint2 vandnoti2(vint2 x, vint2 y)
{
    return vandnoti(x, y);
}
static INLINE vint2 vori2(vint2 x, vint2 y)
{
    return vori(x, y);
}
static INLINE vint2 vxori2(vint2 x, vint2 y)
{
    return vxori(x, y);
}

static INLINE vint2 vslli2(vint2 x, int c)
{
    return vslli(x, c);
}
static INLINE vint2 vsrli2(vint2 x, int c)
{
    return vsrli(x, c);
}
static INLINE vint2 vsrai2(vint2 x, int c)
{
    return vsrai(x, c);
}

static INLINE vmask vmaski2_eq(vint2 x, vint2 y)
{
    return _mm_cmpeq_epi32(x, y);
}
static INLINE vint2 vseli2(vmask m, vint2 x, vint2 y)
{
    return vorm(vandm(m, x), vandnotm(m, y));
}

//

static INLINE double vcast_d_vd(vdouble v)
{
    double s[2];
    _mm_storeu_pd(s, v);
    return s[0];
}

static INLINE float vcast_f_vf(vfloat v)
{
    float s[4];
    _mm_storeu_ps(s, v);
    return s[0];
}

// Extract the sign bit of each double
static INLINE vmask vsignbit(vdouble d)
{
    return _mm_and_si128((__m128i)d, _mm_set_epi32(0x80000000, 0x0, 0x80000000, 0x0));
}

// Return +-1.0 carrying the sign of d
static INLINE vdouble vsign(vdouble d)
{
    return (__m128d)_mm_or_si128((__m128i)_mm_set_pd(1, 1), _mm_and_si128((__m128i)d, _mm_set_epi32(0x80000000, 0x0, 0x80000000, 0x0)));
}

// Multiply x by the sign of y (flip the sign of x where y is negative)
static INLINE vdouble vmulsign(vdouble x, vdouble y)
{
    return (__m128d)vxori((__m128i)x, vsignbit(y));
}

static INLINE vmask vmask_isinf(vdouble d)
{
    return (vmask)_mm_cmpeq_pd(vabs(d), _mm_set_pd(INFINITY, INFINITY));
}

static INLINE vmask vmask_ispinf(vdouble d)
{
    return (vmask)_mm_cmpeq_pd(d, _mm_set_pd(INFINITY, INFINITY));
}

static INLINE vmask vmask_isminf(vdouble d)
{
    return (vmask)_mm_cmpeq_pd(d, _mm_set_pd(-INFINITY, -INFINITY));
}

static INLINE vmask vmask_isnan(vdouble d)
{
    return (vmask)_mm_cmpneq_pd(d, d);
}

// Return +-1.0 where d is +-infinity, 0 elsewhere
static INLINE vdouble visinf(vdouble d)
{
    return (__m128d)_mm_and_si128(vmask_isinf(d), _mm_or_si128(vsignbit(d), (__m128i)_mm_set_pd(1, 1)));
}

// Like visinf, but with magnitude m instead of 1.0
static INLINE vdouble visinf2(vdouble d, vdouble m)
{
    return (__m128d)_mm_and_si128(vmask_isinf(d), _mm_or_si128(vsignbit(d), (__m128i)m));
}

//

// Build 2.0^q for the two integers in the low lanes of q by writing the biased
// exponent directly into the exponent field of each double
static INLINE vdouble vpow2i(vint q)
{
    q = _mm_add_epi32(_mm_set_epi32(0x0, 0x0, 0x3ff, 0x3ff), q);
    q = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(1, 3, 0, 3));
    return (__m128d)_mm_slli_epi32(q, 20);
}

// Compute x * 2^q; q is split as q = 4*m + r so that no intermediate power of two
// leaves the exponent range of a double
static INLINE vdouble vldexp(vdouble x, vint q)
{
    vint m = _mm_srai_epi32(q, 31);
    m = _mm_slli_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(m, q), 9), m), 7);
    q = _mm_sub_epi32(q, _mm_slli_epi32(m, 2));
    vdouble y = vpow2i(m);
    return vmul(vmul(vmul(vmul(vmul(x, y), y), y), y), vpow2i(q));
}

// Return the exponent of d plus one (ilogb(d) + 1); values smaller than about 2^-300
// are pre-scaled by 2^300 so their exponent can still be read from the exponent field
static INLINE vint vilogbp1(vdouble d)
{
    vint m = vmask_lt(d, vcast_vd_d(4.9090934652977266E-91));
    d = vsel(m, vmul(vcast_vd_d(2.037035976334486E90), d), d);
    __m128i q = _mm_and_si128((__m128i)d, _mm_set_epi32(((1 << 12) - 1) << 20, 0, ((1 << 12) - 1) << 20, 0));
    q = _mm_srli_epi32(q, 20);
    q = vorm(vandm   (m, _mm_sub_epi32(q, _mm_set_epi32(300 + 0x3fe, 0, 300 + 0x3fe, 0))),
             vandnotm(m, _mm_sub_epi32(q, _mm_set_epi32(      0x3fe, 0,       0x3fe, 0))));
    q = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(0, 0, 3, 1));
    return q;
}
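
// Illustrative sketch (not part of the original header): vilogbp1 and vldexp combine
// to a frexp-style split. For finite, nonzero d this returns a mantissa whose
// magnitude lies in [0.5, 1) and stores the exponent e such that d == mantissa * 2^e;
// vldexp(mantissa, e) reconstructs d. The function name is made up for this example.
static INLINE vdouble vfrexpExample(vdouble d, vint &e)
{
    e = vilogbp1(d);
    return vldexp(d, vsubi(vcast_vi_i(0), e));
}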

// Clear the lowest 27 bits of the mantissa of each double (used to split a value for exact multiplication)
static INLINE vdouble vupper(vdouble d)
{
    return (__m128d)_mm_and_si128((__m128i)d, _mm_set_epi32(0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000));
}

//

// Double-double type: a value represented as the unevaluated sum x + y, where y holds the rounding error of x
typedef struct {
    vdouble x, y;
} vdouble2;

static INLINE vdouble2 dd(vdouble h, vdouble l)
{
    vdouble2 ret = {h, l};
    return ret;
}

// Per-lane select between two double-double values: where mask is set take x, else y
static INLINE vdouble2 vsel2(vmask mask, vdouble2 x, vdouble2 y)
{
    return dd((__m128d)vorm(vandm(mask, (__m128i)x.x), vandnotm(mask, (__m128i)y.x)),
              (__m128d)vorm(vandm(mask, (__m128i)x.y), vandnotm(mask, (__m128i)y.y)));
}

// Absolute value of a double-double: flip the sign of both parts where the high part is negative
static INLINE vdouble2 abs_d(vdouble2 x)
{
    return dd((__m128d)_mm_xor_pd(_mm_and_pd(_mm_set_pd(-0.0, -0.0), x.x), x.x),
              (__m128d)_mm_xor_pd(_mm_and_pd(_mm_set_pd(-0.0, -0.0), x.x), x.y));
}