1 /*
2 * Copyright (C) 2018 Cavium, Inc. All rights reserved.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 */
17
18 #if defined(TARGET_LINUX_ARM64)
19 #include <stdint.h>
20 #include <assert.h>
21 #include <math.h>
22 #include <arm_neon.h>
23
24 /*
25 * https://www.ibm.com/support/knowledgecenter/SSGH2K_13.1.2/com.ibm.xlc131.aix.doc/compiler_ref/vec_intrin_cpp.html
26 */
27
/*
 * Four packed single-precision floats in 16 bytes, emulating the
 * AltiVec/x86 float vector.  All union views alias the same storage;
 * cross-view reads rely on GCC/Clang union type punning.
 */
struct __s128f {
    typedef float vrs4_t __attribute__((vector_size(4 * sizeof(float))));
    typedef double vrd2_t __attribute__((vector_size(2 * sizeof(double))));
    union {
        float xf[4];
        int xi[4];
        unsigned int xui[4];
        double xd[2];
        vrs4_t xfrd;
        vrd2_t xdrd;
        float32x4_t nfrd;
        float64x2_t ndrd;
        long double b;        /* 16 bytes on AArch64 -- covers the payload */
        unsigned char c[16];
    } __attribute__((aligned(16)));

    /* Zero all 16 bytes (b is quad precision on AArch64 -- TODO confirm
       this covers the full payload on other ABIs). */
    __s128f() : b(0) { }

    /* Load four floats. */
    __s128f(const float f[4]) : b(0) {
        xfrd[0] = f[0];
        xfrd[1] = f[1];
        xfrd[2] = f[2];
        xfrd[3] = f[3];
    }

    /* Load two doubles into the double view. */
    __s128f(const double d[2]) : b(0) {
        xdrd[0] = d[0];
        xdrd[1] = d[1];
    }

    /* Convert four signed ints to float lanes. */
    __s128f(const int i[4]) : b(0) {
        xfrd[0] = i[0];
        xfrd[1] = i[1];
        xfrd[2] = i[2];
        xfrd[3] = i[3];
    }

    /* Convert four unsigned ints to float lanes. */
    __s128f(const unsigned int i[4]) : b(0) {
        xfrd[0] = i[0];
        xfrd[1] = i[1];
        xfrd[2] = i[2];
        xfrd[3] = i[3];
    }

    /* Set the four float lanes individually. */
    __s128f(float e0, float e1, float e2, float e3) : b(0) {
        xfrd[0] = e0;
        xfrd[1] = e1;
        xfrd[2] = e2;
        xfrd[3] = e3;
    }

    /* Raw 16-byte initialization through the long double view. */
    __s128f(const long double& v) : b(v) { }

    __s128f(const vrs4_t& v) : xfrd(v) { }

    __s128f(const vrd2_t& v) : xdrd(v) { }

    /* Copy via b, which spans the whole union. */
    __s128f(const __s128f& rhs) : b(rhs.b) { }

    inline __s128f& operator=(const __s128f& rhs) {
        if (this != &rhs)
            xfrd = rhs.xfrd;

        return *this;
    }

    inline __s128f& operator=(long double ld) {
        b = ld;
        return *this;
    }

    inline __s128f& operator=(const vrs4_t& rhs) {
        xfrd = rhs;
        return *this;
    }

    inline __s128f& operator=(const vrd2_t& rhs) {
        xdrd = rhs;
        return *this;
    }

    inline operator long double() const {
        return b;
    }

    inline operator vrs4_t() const {
        return xfrd;
    }

    inline operator vrd2_t() const {
        return xdrd;
    }

    /*
     * True if any lane is nonzero.
     * NOTE(review): the original returned true only when ALL lanes were
     * zero, i.e. inverted truthiness, and disagreed with __s128d's
     * operator bool; fixed to conventional any-nonzero semantics.
     */
    inline operator bool() const {
        return xfrd[0] != 0.0f || xfrd[1] != 0.0f ||
               xfrd[2] != 0.0f || xfrd[3] != 0.0f;
    }

    /* Lane-wise add of a scalar; the zero check skips work (and keeps
       -0.0 lanes untouched) when f is exactly zero. */
    inline __s128f operator+(float f) const {
        __s128f r(*this);

        if (f != 0.0) {
            r.xfrd[0] += f;
            r.xfrd[1] += f;
            r.xfrd[2] += f;
            r.xfrd[3] += f;
        }

        return r;
    }

    /* Adds d to the two DOUBLE lanes (not the float lanes). */
    inline __s128f operator+(double d) const {
        __s128f r(*this);

        if (d != 0.0) {
            r.xdrd[0] += d;
            r.xdrd[1] += d;
        }

        return r;
    }

    inline __s128f operator+(unsigned int i) const {
        __s128f r(*this);

        if (i != 0U) {
            r.xfrd[0] += i;
            r.xfrd[1] += i;
            r.xfrd[2] += i;
            r.xfrd[3] += i;
        }

        return r;
    }

    inline __s128f operator+(int i) const {
        __s128f r(*this);

        if (i != 0) {
            r.xfrd[0] += i;
            r.xfrd[1] += i;
            r.xfrd[2] += i;
            r.xfrd[3] += i;
        }

        return r;
    }

    /* Adds to b, i.e. treats the payload as one quad-precision value. */
    inline __s128f operator+(const long double& ld) const {
        __s128f r(*this);

        if (ld != 0.0)
            r.b += ld;

        return r;
    }

    /* Lane-wise float addition. */
    inline __s128f operator+(const __s128f& s) const {
        __s128f r(*this);

        r.xfrd[0] += s.xfrd[0];
        r.xfrd[1] += s.xfrd[1];
        r.xfrd[2] += s.xfrd[2];
        r.xfrd[3] += s.xfrd[3];

        return r;
    }

    /* Lane-wise float subtraction. */
    inline __s128f operator-(const __s128f& s) const {
        __s128f r(*this);

        r.xfrd[0] -= s.xfrd[0];
        r.xfrd[1] -= s.xfrd[1];
        r.xfrd[2] -= s.xfrd[2];
        r.xfrd[3] -= s.xfrd[3];

        return r;
    }

    /* Lane-wise negation. */
    inline __s128f operator-() const {
        return __s128f(-xfrd[0], -xfrd[1], -xfrd[2], -xfrd[3]);
    }

    /* Lane-wise float multiplication. */
    inline __s128f operator*(const __s128f& rhs) const {
        __s128f r(*this);

        r.xfrd[0] *= rhs.xfrd[0];
        r.xfrd[1] *= rhs.xfrd[1];
        r.xfrd[2] *= rhs.xfrd[2];
        r.xfrd[3] *= rhs.xfrd[3];

        return r;
    }

    /* Lane-wise float division; no divide-by-zero guard (IEEE inf/NaN). */
    inline __s128f operator/(const __s128f& rhs) const {
        __s128f r(*this);

        r.xfrd[0] /= rhs.xfrd[0];
        r.xfrd[1] /= rhs.xfrd[1];
        r.xfrd[2] /= rhs.xfrd[2];
        r.xfrd[3] /= rhs.xfrd[3];

        return r;
    }

    inline __s128f& operator+=(unsigned int i) {
        if (i != 0U) {
            xfrd[0] += i;
            xfrd[1] += i;
            xfrd[2] += i;
            xfrd[3] += i;
        }

        return *this;
    }

    inline __s128f& operator+=(int i) {
        if (i != 0) {
            xfrd[0] += i;
            xfrd[1] += i;
            xfrd[2] += i;
            xfrd[3] += i;
        }

        return *this;
    }

    inline __s128f& operator+=(float f) {
        if (f != 0.0) {
            xfrd[0] += f;
            xfrd[1] += f;
            xfrd[2] += f;
            xfrd[3] += f;
        }

        return *this;
    }

    /* Unlike operator+(double), this narrows d to float and adds to the
       FLOAT lanes -- presumably intentional; verify against callers. */
    inline __s128f& operator+=(double d) {
        if (d != 0.0) {
            xfrd[0] += (float) d;
            xfrd[1] += (float) d;
            xfrd[2] += (float) d;
            xfrd[3] += (float) d;
        }

        return *this;
    }

    inline __s128f& operator+=(long double ld) {
        if (ld != 0.0)
            b += ld;

        return *this;
    }

    inline __s128f& operator+=(const __s128f& rhs) {
        xfrd[0] += rhs.xfrd[0];
        xfrd[1] += rhs.xfrd[1];
        xfrd[2] += rhs.xfrd[2];
        xfrd[3] += rhs.xfrd[3];

        return *this;
    }

    inline __s128f& operator/=(const __s128f& rhs) {
        xfrd[0] /= rhs.xfrd[0];
        xfrd[1] /= rhs.xfrd[1];
        xfrd[2] /= rhs.xfrd[2];
        xfrd[3] /= rhs.xfrd[3];

        return *this;
    }

    inline __s128f& operator*=(const __s128f& rhs) {
        xfrd[0] *= rhs.xfrd[0];
        xfrd[1] *= rhs.xfrd[1];
        xfrd[2] *= rhs.xfrd[2];
        xfrd[3] *= rhs.xfrd[3];

        return *this;
    }

    inline __s128f& operator-=(const __s128f& rhs) {
        xfrd[0] -= rhs.xfrd[0];
        xfrd[1] -= rhs.xfrd[1];
        xfrd[2] -= rhs.xfrd[2];
        xfrd[3] -= rhs.xfrd[3];

        return *this;
    }
} __attribute__((aligned(16)));
320
321
/*
 * Two packed double-precision floats in 16 bytes, emulating the
 * AltiVec/x86 double vector.  All union views alias the same storage.
 */
struct __s128d {
    typedef float vrs4_t __attribute__((vector_size(4 * sizeof(float))));
    typedef double vrd2_t __attribute__((vector_size(2 * sizeof(double))));
    union {
        float xf[4];
        int xi[4];
        unsigned int xui[4];
        unsigned long xul[2];
        double xd[2];
        vrs4_t xfrd;
        vrd2_t xdrd;
        float32x4_t nfrd;
        float64x2_t ndrd;
        long double b;        /* 16 bytes on AArch64 -- covers the payload */
        unsigned char c[16];
    } __attribute__((aligned(16)));


    /* Zero all 16 bytes. */
    __s128d() : b(0) { }

    /* Load two doubles. */
    __s128d(const double d[2]) : b(0) {
        xdrd[0] = d[0];
        xdrd[1] = d[1];
    }

    /* Widen two floats to double lanes. */
    __s128d(const float f[2]) : b(0) {
        xdrd[0] = f[0];
        xdrd[1] = f[1];
    }

    /* Convert two signed ints to double lanes. */
    __s128d(const int i[2]) : b(0) {
        xdrd[0] = i[0];
        xdrd[1] = i[1];
    }

    /* Convert two unsigned ints to double lanes. */
    __s128d(const unsigned int i[2]) : b(0) {
        xdrd[0] = i[0];
        xdrd[1] = i[1];
    }

    __s128d(float f0, float f1) : b(0) {
        xdrd[0] = f0;
        xdrd[1] = f1;
    }

    __s128d(double d0, double d1) : b(0) {
        xdrd[0] = d0;
        xdrd[1] = d1;
    }

    /* Raw 16-byte initialization through the long double view. */
    __s128d(const long double& v) : b(v) { }

    /* Copy via b, which spans the whole union. */
    __s128d(const __s128d& rhs) : b(rhs.b) { }

    __s128d(const vrd2_t& v) : xdrd(v) { }

    __s128d(const vrs4_t& v) : xfrd(v) { }

    inline __s128d& operator=(const __s128d& rhs) {
        if (this != &rhs)
            xdrd = rhs.xdrd;

        return *this;
    }

    inline __s128d& operator=(long double ld) {
        b = ld;
        return *this;
    }

    inline __s128d& operator=(const vrd2_t& rhs) {
        xdrd = rhs;
        return *this;
    }

    inline __s128d& operator=(const vrs4_t& rhs) {
        xfrd = rhs;
        return *this;
    }

    /*
     * True if any lane is nonzero.
     * NOTE(review): the original used &&, which was true only when BOTH
     * lanes were nonzero; changed to || for conventional truthiness,
     * consistent with the other two vector types.
     */
    inline operator bool() const {
        return xdrd[0] != 0.00 || xdrd[1] != 0.00;
    }

    inline operator long double() const {
        return b;
    }

    inline operator vrs4_t() const {
        return xfrd;
    }

    inline operator vrd2_t() const {
        return xdrd;
    }

    /* Lane-wise add of a scalar; the zero check skips work when the
       scalar is exactly zero. */
    inline __s128d operator+(unsigned int i) const {
        __s128d r(*this);

        if (i != 0U) {
            r.xdrd[0] += i;
            r.xdrd[1] += i;
        }

        return r;
    }

    inline __s128d operator+(int i) const {
        __s128d r(*this);

        if (i != 0) {
            r.xdrd[0] += i;
            r.xdrd[1] += i;
        }

        return r;
    }

    inline __s128d operator+(float f) const {
        __s128d r(*this);

        if (f != 0.0) {
            r.xdrd[0] += f;
            r.xdrd[1] += f;
        }

        return r;
    }

    inline __s128d operator+(double d) const {
        __s128d r(*this);

        if (d != 0.0) {
            r.xdrd[0] += d;
            r.xdrd[1] += d;
        }

        return r;
    }

    /* Adds to b, i.e. treats the payload as one quad-precision value. */
    inline __s128d operator+(const long double& ld) const {
        __s128d r(*this);

        if (ld != 0.0)
            r.b += ld;

        return r;
    }

    /* Lane-wise double addition. */
    inline __s128d operator+(const __s128d& s) const {
        __s128d r(*this);

        r.xdrd[0] += s.xdrd[0];
        r.xdrd[1] += s.xdrd[1];

        return r;
    }

    /* Lane-wise double subtraction. */
    inline __s128d operator-(const __s128d& s) const {
        __s128d r(*this);

        r.xdrd[0] -= s.xdrd[0];
        r.xdrd[1] -= s.xdrd[1];

        return r;
    }

    /* Lane-wise negation. */
    inline __s128d operator-() const {
        return __s128d(-xdrd[0], -xdrd[1]);
    }

    /* Lane-wise double multiplication. */
    inline __s128d operator*(const __s128d& rhs) const {
        __s128d r(*this);

        r.xdrd[0] *= rhs.xdrd[0];
        r.xdrd[1] *= rhs.xdrd[1];

        return r;
    }

    /* Lane-wise double division; no divide-by-zero guard (IEEE inf/NaN). */
    inline __s128d operator/(const __s128d& rhs) const {
        __s128d r(*this);

        r.xdrd[0] /= rhs.xdrd[0];
        r.xdrd[1] /= rhs.xdrd[1];

        return r;
    }

    inline __s128d& operator+=(unsigned int i) {
        if (i != 0U) {
            xdrd[0] += i;
            xdrd[1] += i;
        }

        return *this;
    }

    inline __s128d& operator+=(int i) {
        if (i != 0) {
            xdrd[0] += i;
            xdrd[1] += i;
        }

        return *this;
    }

    inline __s128d& operator+=(float f) {
        if (f != 0.0) {
            xdrd[0] += f;
            xdrd[1] += f;
        }

        return *this;
    }

    inline __s128d& operator+=(double d) {
        if (d != 0.0) {
            xdrd[0] += d;
            xdrd[1] += d;
        }

        return *this;
    }

    /* Unlike __s128f, this narrows ld to double and adds lane-wise
       rather than adding to b -- presumably intentional; verify. */
    inline __s128d& operator+=(long double ld) {
        if (ld != 0.0) {
            xdrd[0] += (double) ld;
            xdrd[1] += (double) ld;
        }

        return *this;
    }

    inline __s128d& operator+=(const __s128d& rhs) {
        xdrd[0] += rhs.xdrd[0];
        xdrd[1] += rhs.xdrd[1];

        return *this;
    }

    inline __s128d& operator*=(const __s128d& rhs) {
        xdrd[0] *= rhs.xdrd[0];
        xdrd[1] *= rhs.xdrd[1];

        return *this;
    }

    inline __s128d& operator/=(const __s128d& rhs) {
        xdrd[0] /= rhs.xdrd[0];
        xdrd[1] /= rhs.xdrd[1];

        return *this;
    }

    inline __s128d& operator-=(const __s128d& rhs) {
        xdrd[0] -= rhs.xdrd[0];
        xdrd[1] -= rhs.xdrd[1];

        return *this;
    }
} __attribute__((aligned(16)));
584
/*
 * Packed integers in 16 bytes: four 32-bit lanes or two 64-bit lanes,
 * with a `type` tag recording which signedness/width the value was last
 * written as.  Note that most operators update the lanes without
 * refreshing `type`; only constructors and scalar assignments set it.
 */
struct __s128i {
    typedef enum __s128iType {
        Invalid = 0,
        SignedInt = 1,
        UnsignedInt = 2,
        SignedLong = 3,
        UnsignedLong = 4,
    } s128iType;

    typedef int32_t vis4_t __attribute__((vector_size(4 * sizeof(int32_t))));
    typedef int64_t vid2_t __attribute__((vector_size(2 * sizeof(int64_t))));

    union {
        float xf[4];
        int xi[4];
        unsigned int xui[4];
        long xl[2];
        unsigned long xul[2];
        vis4_t xird;
        vid2_t xlrd;
        int32x4_t nird;
        int64x2_t nlrd;
        double xd[2];
        long double b;        /* 16 bytes on AArch64 -- covers the payload */
        unsigned char c[16];
    } __attribute__((aligned(16)));

    s128iType type;    /* signedness/width of the last scalar write */

    /* Zero all 16 bytes. */
    __s128i() : b(0), type(UnsignedInt) { }

    /* Truncate four floats to signed int lanes. */
    __s128i(const float f[4]) : b(0), type(SignedInt) {
        xird[0] = f[0];
        xird[1] = f[1];
        xird[2] = f[2];
        xird[3] = f[3];
    }

    /* Truncate two doubles to signed long lanes. */
    __s128i(const double d[2]) : b(0), type(SignedLong) {
        xlrd[0] = d[0];
        xlrd[1] = d[1];
    }

    __s128i(const int i[4]) : b(0), type(SignedInt) {
        xird[0] = i[0];
        xird[1] = i[1];
        xird[2] = i[2];
        xird[3] = i[3];
    }

    __s128i(const unsigned int i[4]) : b(0), type(UnsignedInt) {
        xui[0] = i[0];
        xui[1] = i[1];
        xui[2] = i[2];
        xui[3] = i[3];
    }

    __s128i(int i0, int i1, int i2, int i3) : b(0), type(SignedInt) {
        xird[0] = i0;
        xird[1] = i1;
        xird[2] = i2;
        xird[3] = i3;
    }

    __s128i(unsigned int i0, unsigned int i1,
            unsigned int i2, unsigned int i3)
        : b(0), type(UnsignedInt) {
        xui[0] = i0;
        xui[1] = i1;
        xui[2] = i2;
        xui[3] = i3;
    }

    __s128i(long l0, long l1)
        : b(0), type(SignedLong) {
        xlrd[0] = l0;
        xlrd[1] = l1;
    }

    __s128i(unsigned long l0, unsigned long l1)
        : b(0), type(UnsignedLong) {
        xul[0] = l0;
        xul[1] = l1;
    }

    __s128i(const vis4_t& v) : xird(v), type(SignedInt) { }

    __s128i(const vid2_t& v) : xlrd(v), type(SignedLong) { }

    /* Raw 16-byte initialization through the long double view. */
    __s128i(const long double& v) : b(v), type(UnsignedInt) { }

    __s128i(const __s128i& rhs) : b(rhs.b), type(rhs.type) { }

    inline __s128i& operator=(const __s128i& rhs) {
        if (this != &rhs) {
            b = rhs.b;
            type = rhs.type;
        }

        return *this;
    }

    inline __s128i& operator=(const long double& ld) {
        b = ld;
        type = UnsignedInt;
        return *this;
    }

    /* Splat a scalar into all four 32-bit lanes. */
    inline __s128i& operator=(int i) {
        xird[0] = i;
        xird[1] = i;
        xird[2] = i;
        xird[3] = i;
        type = SignedInt;
        return *this;
    }

    inline __s128i& operator=(unsigned int i) {
        xird[0] = i;
        xird[1] = i;
        xird[2] = i;
        xird[3] = i;
        type = UnsignedInt;
        return *this;
    }

    /* Splat a scalar into both 64-bit lanes. */
    inline __s128i& operator=(const long& l) {
        xlrd[0] = l;
        xlrd[1] = l;
        type = SignedLong;
        return *this;
    }

    inline __s128i& operator=(const unsigned long& l) {
        xul[0] = l;
        xul[1] = l;
        /* NOTE(review): original tagged this SignedLong; an unsigned
           long splat should be tagged UnsignedLong. */
        type = UnsignedLong;
        return *this;
    }

    /* Vector assignments intentionally(?) leave `type` untouched --
       TODO confirm callers rely on the preserved tag. */
    inline __s128i& operator=(const vis4_t& rhs) {
        xird = rhs;
        return *this;
    }

    inline __s128i& operator=(const vid2_t& rhs) {
        xlrd = rhs;
        return *this;
    }

    /*
     * True if any lane is nonzero.
     * NOTE(review): the original returned true when ALL lanes were zero
     * (inverted truthiness); fixed to conventional any-nonzero.
     */
    inline operator bool() const {
        return xird[0] != 0 || xird[1] != 0 || xird[2] != 0 || xird[3] != 0;
    }

    inline operator long double() const {
        return b;
    }

    inline operator vis4_t() const {
        return xird;
    }

    inline operator vid2_t() const {
        return xlrd;
    }

#if __GNUC__ < 8
    /* Older GCC treats the NEON types as distinct from the generic
       vector typedefs, so explicit conversions are needed. */
    inline operator int32x4_t() const {
        return nird;
    }

    inline operator int64x2_t() const {
        return nlrd;
    }
#endif

    /* Lane-wise add of a scalar; the zero check skips work when the
       scalar is zero. */
    inline __s128i operator+(unsigned int i) const {
        __s128i r(*this);

        if (i != 0U) {
            r.xird[0] += i;
            r.xird[1] += i;
            r.xird[2] += i;
            r.xird[3] += i;
        }

        return r;
    }

    inline __s128i operator+(int i) const {
        __s128i r(*this);

        if (i != 0) {
            r.xird[0] += i;
            r.xird[1] += i;
            r.xird[2] += i;
            r.xird[3] += i;
        }

        return r;
    }

    inline __s128i operator+(long l) const {
        __s128i r(*this);

        if (l != 0L) {
            r.xlrd[0] += l;
            r.xlrd[1] += l;
        }

        return r;
    }

    inline __s128i operator+(unsigned long l) const {
        __s128i r(*this);

        if (l != 0UL) {
            r.xul[0] += l;
            r.xul[1] += l;
        }

        return r;
    }

    /* Floating scalars are truncated lane-wise through int arithmetic. */
    inline __s128i operator+(float f) const {
        __s128i r(*this);

        if (f != 0.0) {
            r.xird[0] += f;
            r.xird[1] += f;
            r.xird[2] += f;
            r.xird[3] += f;
        }

        return r;
    }

    inline __s128i operator+(double d) const {
        __s128i r(*this);

        if (d != 0.0) {
            r.xird[0] += d;
            r.xird[1] += d;
            r.xird[2] += d;
            r.xird[3] += d;
        }

        return r;
    }

    /* Integer division producing a FLOAT vector; lanes stay zero when
       the two operands' type tags disagree or are 64-bit. */
    inline __s128f operator/(const __s128i& rhs) const {
        __s128f r;

        if (type == rhs.type && type == SignedInt) {
            r.xfrd[0] = (float) xird[0] / rhs.xird[0];
            r.xfrd[1] = (float) xird[1] / rhs.xird[1];
            r.xfrd[2] = (float) xird[2] / rhs.xird[2];
            r.xfrd[3] = (float) xird[3] / rhs.xird[3];
        } else if (type == rhs.type && type == UnsignedInt) {
            r.xfrd[0] = (float) xui[0] / rhs.xui[0];
            r.xfrd[1] = (float) xui[1] / rhs.xui[1];
            r.xfrd[2] = (float) xui[2] / rhs.xui[2];
            r.xfrd[3] = (float) xui[3] / rhs.xui[3];
        }

        return r;
    }

    inline __s128f operator/(int i) const {
        __s128f r;

        r.xfrd[0] = (float) xird[0] / i;
        r.xfrd[1] = (float) xird[1] / i;
        r.xfrd[2] = (float) xird[2] / i;
        r.xfrd[3] = (float) xird[3] / i;

        return r;
    }

    inline __s128f operator/(unsigned int i) const {
        __s128f r;

        r.xfrd[0] = (float) xui[0] / i;
        r.xfrd[1] = (float) xui[1] / i;
        r.xfrd[2] = (float) xui[2] / i;
        r.xfrd[3] = (float) xui[3] / i;

        return r;
    }

    /* Lane-wise 32-bit addition via the GCC vector extension. */
    inline __s128i operator+(const __s128i& s) const {
        __s128i r;
        r = xird + s.xird;
        return r;
    }

    inline __s128i operator-(const __s128i& s) const {
        __s128i r;
        r = xird - s.xird;
        return r;
    }

    /*
     * Lane-wise negation.
     * NOTE(review): the original was misspelled `operator_()` (never a
     * real unary minus) and failed to negate lane 3; both fixed.
     */
    inline __s128i operator-() const {
        return __s128i(-xird[0], -xird[1], -xird[2], -xird[3]);
    }

    /* Adds to b, i.e. treats the payload as one quad-precision value. */
    inline __s128i operator+(const long double& ld) const {
        __s128i r(*this);

        if (ld != 0.0)
            r.b += ld;

        return r;
    }

    inline __s128i& operator+=(unsigned int i) {
        if (i != 0U)
            xird = xird + i;

        return *this;
    }

    inline __s128i& operator+=(int i) {
        if (i != 0)
            xird = xird + i;

        return *this;
    }

    inline __s128i& operator+=(long l) {
        if (l != 0L) {   /* was `!= 0U`: avoid signed/unsigned compare */
            xlrd[0] += l;
            xlrd[1] += l;
        }

        return *this;
    }

    inline __s128i& operator+=(unsigned long l) {
        if (l != 0UL)
            xlrd = xlrd + l;

        return *this;
    }

    inline __s128i& operator+=(float f) {
        if (f != 0.0) {
            xird[0] += f;
            xird[1] += f;
            xird[2] += f;
            xird[3] += f;
        }

        return *this;
    }

    inline __s128i& operator+=(double d) {
        if (d != 0.0) {
            xird[0] += d;
            xird[1] += d;
            xird[2] += d;
            xird[3] += d;
        }

        return *this;
    }

    /* Unlike operator+(long double), this adds lane-wise (truncating)
       rather than to b -- presumably intentional; verify. */
    inline __s128i& operator+=(const long double& ld) {
        if (ld != 0.0) {
            xird[0] += ld;
            xird[1] += ld;
            xird[2] += ld;
            xird[3] += ld;
        }

        return *this;
    }

    inline __s128i& operator+=(const __s128i& rhs) {
        xird[0] += rhs.xird[0];
        xird[1] += rhs.xird[1];
        xird[2] += rhs.xird[2];
        xird[3] += rhs.xird[3];

        return *this;
    }

    inline __s128i& operator-=(const __s128i& rhs) {
        xird[0] -= rhs.xird[0];
        xird[1] -= rhs.xird[1];
        xird[2] -= rhs.xird[2];
        xird[3] -= rhs.xird[3];

        return *this;
    }
} __attribute__((aligned(16)));
981
/* Public vector type names mirroring the x86 intrinsic spellings. */
typedef struct __s128f __m128;   /* four packed floats   */
typedef struct __s128i __m128i;  /* packed integers      */
typedef struct __s128d __m128d;  /* two packed doubles   */
985
/*
 * vec_st: store a vector to memory.  When v is nonzero the vector is
 * first biased lane-wise by v (via the type's operator+), matching the
 * original behavior.
 */
static inline void
__attribute__((always_inline))
vec_st(const __m128& vld, int v, unsigned char* t)
{
    const __m128 tmp = v ? vld + v : vld;

    /* Store all 16 raw bytes. */
    for (int k = 0; k < 16; ++k)
        t[k] = tmp.c[k];
}

static inline void
__attribute__((always_inline))
vec_st(const __m128& vld, int v, unsigned int* t)
{
    const __m128 tmp = v ? vld + v : vld;

    /* Store the four 32-bit lanes through the unsigned view. */
    for (int k = 0; k < 4; ++k)
        t[k] = tmp.xui[k];
}

static inline void
__attribute__((always_inline))
vec_st(const __m128i& vldi, int v, unsigned char* t)
{
    const __m128i tmp = v ? vldi + v : vldi;

    for (int k = 0; k < 16; ++k)
        t[k] = tmp.c[k];
}

static inline void
__attribute__((always_inline))
vec_st(const __m128i& vldi, int v, unsigned int* t)
{
    const __m128i tmp = v ? vldi + v : vldi;

    for (int k = 0; k < 4; ++k)
        t[k] = tmp.xui[k];
}

static inline void
__attribute__((always_inline))
vec_st(const __m128d& vldd, int v, unsigned char* t)
{
    const __m128d tmp = v ? vldd + v : vldd;

    for (int k = 0; k < 16; ++k)
        t[k] = tmp.c[k];
}

static inline void
__attribute__((always_inline))
vec_st(const __m128d& vldd, int v, unsigned int* t)
{
    const __m128d tmp = v ? vldd + v : vldd;

    for (int k = 0; k < 4; ++k)
        t[k] = tmp.xui[k];
}
1093
/*
 * vec_extract: return 32-bit lane i of a vector as unsigned.  Every
 * overload reads through the int view (xi), so for the float/double
 * vectors this yields the lane's raw bit pattern, not a converted
 * value -- presumably intentional; verify against callers.
 */
static inline unsigned int
__attribute__((always_inline))
vec_extract(const __m128& vlf, unsigned int i)
{
    return (unsigned int) vlf.xi[i];
}

static inline unsigned int
__attribute__((always_inline))
vec_extract(const __m128i& vli, unsigned int i)
{
    return (unsigned int) vli.xi[i];
}

static inline unsigned int
__attribute__((always_inline))
vec_extract(const __m128d& vld, unsigned int i)
{
    return (unsigned int) vld.xi[i];
}
1114
/*
 * vec_insert: return a copy of the vector with lane i replaced by v.
 * The input vector is not modified.  No bounds check on i.
 */
static inline __m128i
__attribute__((always_inline))
vec_insert(unsigned int v, const __m128i& vld, unsigned int i)
{
    __m128i out(vld);
    out.xird[i] = v;
    return out;
}

static inline __m128i
__attribute__((always_inline))
vec_insert(int v, const __m128i& vld, unsigned int i)
{
    __m128i out(vld);
    out.xird[i] = v;
    return out;
}

static inline __m128
__attribute__((always_inline))
vec_insert(float v, const __m128& vld, unsigned int i)
{
    __m128 out(vld);
    out.xfrd[i] = v;
    return out;
}

static inline __m128d
__attribute__((always_inline))
vec_insert(double v, const __m128d& vld, unsigned int i)
{
    __m128d out(vld);
    out.xdrd[i] = v;
    return out;
}
1150
/*
 * vec_splats: replicate a scalar into every lane of the matching
 * vector type.
 */
static inline __m128i
__attribute__((always_inline))
vec_splats(unsigned int i)
{
    return __m128i(i, i, i, i);
}

static inline __m128i
__attribute__((always_inline))
vec_splats(int i)
{
    return __m128i(i, i, i, i);
}

/*
 * NOTE(review): the two 64-bit overloads below do NOT splat the value
 * into both lanes as the name suggests -- they place the high 32 bits
 * in lane 0 and the low 32 bits in lane 1.  Behavior preserved from
 * the original (whose intermediate xi[2]/xi[3] stores were dead, being
 * fully overwritten by the xl stores); confirm against callers.
 */
static inline __m128i
__attribute__((always_inline))
vec_splats(long int i)
{
    __m128i r;
    r.xl[0] = (long int) ((i & 0xFFFFFFFF00000000LL) >> 32);
    r.xl[1] = (long int) (i & 0xFFFFFFFFLL);
    return r;
}

static inline __m128i
__attribute__((always_inline))
vec_splats(unsigned long i)
{
    __m128i r;
    r.xl[0] = (unsigned long) ((i & 0xFFFFFFFF00000000ULL) >> 32);
    r.xl[1] = (unsigned long) (i & 0xFFFFFFFFULL);
    return r;
}

static inline __m128
__attribute__((always_inline))
vec_splats(float f)
{
    return __m128(f, f, f, f);
}

static inline __m128d
__attribute__((always_inline))
vec_splats(double d)
{
    return __m128d(d, d);
}
1202
/* vec_sqrt: lane-wise square root. */
static inline __m128
__attribute__((always_inline))
vec_sqrt(const __m128& a)
{
    __m128 r;

    for (int k = 0; k < 4; ++k)
        r.xfrd[k] = sqrtf(a.xfrd[k]);

    return r;
}

static inline __m128d
__attribute__((always_inline))
vec_sqrt(const __m128d& a)
{
    __m128d r;

    for (int k = 0; k < 2; ++k)
        r.xdrd[k] = sqrt(a.xdrd[k]);

    return r;
}
1229
/*
 * vec_ld: "load" a vector.
 * NOTE(review): unlike a real load, these overloads add v to *vld IN
 * PLACE and return the updated value -- the source vector is mutated.
 * Behavior preserved; confirm callers expect this.
 */
static inline __m128i
__attribute__((always_inline))
vec_ld(unsigned int v, __m128i* vld)
{
    return *vld += v;
}

static inline __m128i
__attribute__((always_inline))
vec_ld(int v, __m128i* vld)
{
    return *vld += v;
}

static inline __m128i
__attribute__((always_inline))
vec_ld(unsigned long v, __m128i* vld)
{
    return *vld += v;
}

static inline __m128i
__attribute__((always_inline))
vec_ld(long v, __m128i* vld)
{
    return *vld += v;
}

static inline __m128
__attribute__((always_inline))
vec_ld(unsigned int v, __m128* vld)
{
    return *vld += v;
}

static inline __m128
__attribute__((always_inline))
vec_ld(int v, __m128* vld)
{
    return *vld += v;
}

/*
 * Load four floats from memory, bias each lane by v, and return the
 * two-double reinterpretation of the result.
 */
static inline __m128::vrd2_t
__attribute__((always_inline))
vec_ld(int v, float vld[4])
{
    __m128 tmp(vld);
    tmp += v;
    return tmp.operator vrd2_t();
}

static inline __m128::vrd2_t
__attribute__((always_inline))
vec_ld(unsigned int v, float vld[4])
{
    __m128 tmp(vld);
    tmp += v;
    return tmp.operator vrd2_t();
}

static inline __m128d
__attribute__((always_inline))
vec_ld(unsigned int v, __m128d* vld)
{
    return *vld += v;
}

static inline __m128d
__attribute__((always_inline))
vec_ld(int v, __m128d* vld)
{
    return *vld += v;
}
1311
/* vec_any_ne: 1 if any corresponding lane pair differs, else 0. */
static inline int
__attribute__((always_inline))
vec_any_ne(const __m128i& a, const __m128i& b)
{
    return (a.xird[0] != b.xird[0] || a.xird[1] != b.xird[1] ||
            a.xird[2] != b.xird[2] || a.xird[3] != b.xird[3]) ? 1 : 0;
}

static inline int
__attribute__((always_inline))
vec_any_ne(const __m128& a, const __m128& b)
{
    return (a.xfrd[0] != b.xfrd[0] || a.xfrd[1] != b.xfrd[1] ||
            a.xfrd[2] != b.xfrd[2] || a.xfrd[3] != b.xfrd[3]) ? 1 : 0;
}

static inline int
__attribute__((always_inline))
vec_any_ne(const __m128d& a, const __m128d& b)
{
    return (a.xdrd[0] != b.xdrd[0] || a.xdrd[1] != b.xdrd[1]) ? 1 : 0;
}
1347
/* vec_add: lane-wise addition via the GCC vector extension. */
static inline __m128
__attribute__((always_inline))
vec_add(const __m128& a, const __m128& b)
{
    __m128 sum;
    sum.xfrd = a.xfrd + b.xfrd;
    return sum;
}

static inline __m128i
__attribute__((always_inline))
vec_add(const __m128i& a, const __m128i& b)
{
    __m128i sum;
    sum.xird = a.xird + b.xird;
    return sum;
}

static inline __m128d
__attribute__((always_inline))
vec_add(const __m128d& a, const __m128d& b)
{
    __m128d sum;
    sum.xdrd = a.xdrd + b.xdrd;
    return sum;
}
1374
/* vec_sub: lane-wise subtraction (a - b) via the GCC vector extension. */
static inline __m128
__attribute__((always_inline))
vec_sub(const __m128& a, const __m128& b)
{
    __m128 diff;
    diff.xfrd = a.xfrd - b.xfrd;
    return diff;
}

static inline __m128i
__attribute__((always_inline))
vec_sub(const __m128i& a, const __m128i& b)
{
    __m128i diff;
    diff.xird = a.xird - b.xird;
    return diff;
}

static inline __m128d
__attribute__((always_inline))
vec_sub(const __m128d& a, const __m128d& b)
{
    __m128d diff;
    diff.xdrd = a.xdrd - b.xdrd;
    return diff;
}
1401
/* vec_madd: lane-wise multiply-add, a * b + c (separate mul and add,
   not a fused operation). */
static inline __m128
__attribute__((always_inline))
vec_madd(const __m128& a, const __m128& b, const __m128& c)
{
    __m128 acc;
    acc.xfrd = a.xfrd * b.xfrd + c.xfrd;
    return acc;
}

static inline __m128d
__attribute__((always_inline))
vec_madd(const __m128d& a, const __m128d& b, const __m128d& c)
{
    __m128d acc;
    acc.xdrd = a.xdrd * b.xdrd + c.xdrd;
    return acc;
}
1419
/* vec_msub: lane-wise multiply-subtract, a * b - c. */
static inline __m128
__attribute__((always_inline))
vec_msub(const __m128& a, const __m128& b, const __m128& c)
{
    __m128 acc;
    acc.xfrd = a.xfrd * b.xfrd - c.xfrd;
    return acc;
}

static inline __m128d
__attribute__((always_inline))
vec_msub(const __m128d& a, const __m128d& b, const __m128d& c)
{
    __m128d acc;
    acc.xdrd = a.xdrd * b.xdrd - c.xdrd;
    return acc;
}

static inline __m128i
__attribute__((always_inline))
vec_msub(const __m128i& a, const __m128i& b, const __m128i& c)
{
    __m128i acc;
    acc.xird = a.xird * b.xird - c.xird;
    return acc;
}
1446
/* vec_mul: lane-wise multiply, expressed as a multiply-add with a
   zero addend. */
static inline __m128
__attribute__((always_inline))
vec_mul(const __m128& a, const __m128& b)
{
    return vec_madd(a, b, vec_splats(0.0f));
}

static inline __m128d
__attribute__((always_inline))
vec_mul(const __m128d& a, const __m128d& b)
{
    return vec_madd(a, b, vec_splats(0.0));
}
1460
1461 static inline __m128
1462 __attribute__((always_inline))
vec_cmpge(const __m128 & a,const __m128 & b)1463 vec_cmpge(const __m128& a, const __m128& b)
1464 {
1465 __m128 r;
1466
1467 if (a.xfrd[0] >= b.xfrd[0])
1468 r.xi[0] = (unsigned) 0xFFFFFFFF;
1469
1470 if (a.xfrd[1] >= b.xfrd[1])
1471 r.xi[1] = (unsigned) 0xFFFFFFFF;
1472
1473 if (a.xfrd[2] >= b.xfrd[2])
1474 r.xi[2] = (unsigned) 0xFFFFFFFF;
1475
1476 if (a.xfrd[3] >= b.xfrd[3])
1477 r.xi[3] = (unsigned) 0xFFFFFFFF;
1478
1479 return r;
1480 }
1481
1482 static inline __m128i
1483 __attribute__((always_inline))
vec_cmpge(const __m128i & a,const __m128i & b)1484 vec_cmpge(const __m128i& a, const __m128i& b)
1485 {
1486 __m128i r;
1487
1488 if (a.xird[0] >= b.xird[0])
1489 r.xird[0] = (unsigned) 0xFFFFFFFF;
1490
1491 if (a.xird[1] >= b.xird[1])
1492 r.xird[1] = (unsigned) 0xFFFFFFFF;
1493
1494 if (a.xird[2] >= b.xird[2])
1495 r.xird[2] = (unsigned) 0xFFFFFFFF;
1496
1497 if (a.xird[3] >= b.xird[3])
1498 r.xird[3] = (unsigned) 0xFFFFFFFF;
1499
1500 return r;
1501 }
1502
1503 static inline __m128d
1504 __attribute__((always_inline))
vec_cmpge(const __m128d & a,const __m128d & b)1505 vec_cmpge(const __m128d& a, const __m128d& b)
1506 {
1507 __m128d r;
1508
1509 if (a.xdrd[0] >= b.xdrd[0]) {
1510 r.xui[0] = (unsigned) 0xFFFFFFFF;
1511 r.xui[1] = (unsigned) 0xFFFFFFFF;
1512 }
1513
1514 if (a.xdrd[1] >= b.xdrd[1]) {
1515 r.xui[2] = (unsigned) 0xFFFFFFFF;
1516 r.xui[3] = (unsigned) 0xFFFFFFFF;
1517 }
1518
1519 return r;
1520 }
1521
1522 static inline __m128
1523 __attribute__((always_inline))
vec_cmpeq(const __m128 & a,const __m128 & b)1524 vec_cmpeq(const __m128& a, const __m128& b)
1525 {
1526 __m128 r;
1527
1528 if (a.xfrd[0] == b.xfrd[0])
1529 r.xui[0] = (unsigned) 0xFFFFFFFF;
1530
1531 if (a.xfrd[1] == b.xfrd[1])
1532 r.xui[1] = (unsigned) 0xFFFFFFFF;
1533
1534 if (a.xfrd[2] == b.xfrd[2])
1535 r.xui[2] = (unsigned) 0xFFFFFFFF;
1536
1537 if (a.xfrd[3] == b.xfrd[3])
1538 r.xui[3] = (unsigned) 0xFFFFFFFF;
1539
1540 return r;
1541 }
1542
1543 static inline __m128d
1544 __attribute__((always_inline))
vec_cmpeq(const __m128d & a,const __m128d & b)1545 vec_cmpeq(const __m128d& a, const __m128d& b)
1546 {
1547 __m128d r;
1548
1549 if (a.xdrd[0] == b.xdrd[0]) {
1550 r.xui[0] = (unsigned) 0xFFFFFFFF;
1551 r.xui[1] = (unsigned) 0xFFFFFFFF;
1552 }
1553
1554 if (a.xdrd[1] == b.xdrd[1]) {
1555 r.xui[2] = (unsigned) 0xFFFFFFFF;
1556 r.xui[3] = (unsigned) 0xFFFFFFFF;
1557 }
1558
1559 return r;
1560 }
1561
1562 static inline __m128i
1563 __attribute__((always_inline))
vec_cmpeq(const __m128i & a,const __m128i & b)1564 vec_cmpeq(const __m128i& a, const __m128i& b)
1565 {
1566 __m128i r;
1567
1568 if (a.xird[0] == b.xird[0])
1569 r.xird[0] = (unsigned) 0xFFFFFFFF;
1570
1571 if (a.xird[1] == b.xird[1])
1572 r.xird[1] = (unsigned) 0xFFFFFFFF;
1573
1574 if (a.xird[2] == b.xird[2])
1575 r.xird[2] = (unsigned) 0xFFFFFFFF;
1576
1577 if (a.xird[3] == b.xird[3])
1578 r.xird[3] = (unsigned) 0xFFFFFFFF;
1579
1580 return r;
1581 }
1582
1583 static inline __m128
1584 __attribute__((always_inline))
vec_cmple(const __m128 & a,const __m128 & b)1585 vec_cmple(const __m128& a, const __m128& b)
1586 {
1587 __m128 r;
1588
1589 if (a.xfrd[0] <= b.xfrd[0])
1590 r.xui[0] = (unsigned) 0xFFFFFFFF;
1591
1592 if (a.xfrd[1] <= b.xfrd[1])
1593 r.xui[0] = (unsigned) 0xFFFFFFFF;
1594
1595 if (a.xfrd[2] <= b.xfrd[2])
1596 r.xui[2] = (unsigned) 0xFFFFFFFF;
1597
1598 if (a.xfrd[3] <= b.xfrd[3])
1599 r.xui[3] = (unsigned) 0xFFFFFFFF;
1600
1601 return r;
1602 }
1603
1604 static inline __m128d
1605 __attribute__((always_inline))
vec_cmple(const __m128d & a,const __m128d & b)1606 vec_cmple(const __m128d& a, const __m128d& b)
1607 {
1608 __m128d r;
1609
1610 if (a.xdrd[0] <= b.xdrd[0]) {
1611 r.xui[0] = (unsigned) 0xFFFFFFFF;
1612 r.xui[1] = (unsigned) 0xFFFFFFFF;
1613 }
1614
1615 if (a.xdrd[1] <= b.xdrd[1]) {
1616 r.xui[2] = (unsigned) 0xFFFFFFFF;
1617 r.xui[3] = (unsigned) 0xFFFFFFFF;
1618 }
1619
1620 return r;
1621 }
1622
1623 static inline __m128i
1624 __attribute__((always_inline))
vec_cmple(const __m128i & a,const __m128i & b)1625 vec_cmple(const __m128i& a, const __m128i& b)
1626 {
1627 __m128i r;
1628
1629 if (a.xird[0] <= b.xird[0])
1630 r.xui[0] = (unsigned) 0xFFFFFFFF;
1631
1632 if (a.xird[1] <= b.xird[1])
1633 r.xui[1] = (unsigned) 0xFFFFFFFF;
1634
1635 if (a.xird[2] <= b.xird[2])
1636 r.xui[2] = (unsigned) 0xFFFFFFFF;
1637
1638 if (a.xird[3] <= b.xird[3])
1639 r.xui[3] = (unsigned) 0xFFFFFFFF;
1640
1641 return r;
1642 }
1643
1644 static inline __m128
1645 __attribute__((always_inline))
vec_cmpgt(const __m128 & a,const __m128 & b)1646 vec_cmpgt(const __m128& a, const __m128& b)
1647 {
1648 __m128 r;
1649
1650 if (a.xfrd[0] > b.xfrd[0])
1651 r.xui[0] = (unsigned) 0xFFFFFFFF;
1652
1653 if (a.xfrd[1] > b.xfrd[1])
1654 r.xui[1] = (unsigned) 0xFFFFFFFF;
1655
1656 if (a.xfrd[2] > b.xfrd[2])
1657 r.xui[2] = (unsigned) 0xFFFFFFFF;
1658
1659 if (a.xfrd[3] > b.xfrd[3])
1660 r.xui[3] = (unsigned) 0xFFFFFFFF;
1661
1662 return r;
1663 }
1664
1665 static inline __m128d
1666 __attribute__((always_inline))
vec_cmpgt(const __m128d & a,const __m128d & b)1667 vec_cmpgt(const __m128d& a, const __m128d& b)
1668 {
1669 __m128d r;
1670
1671 if (a.xdrd[0] > b.xdrd[0]) {
1672 r.xui[0] = (unsigned) 0xFFFFFFFF;
1673 r.xui[1] = (unsigned) 0xFFFFFFFF;
1674 }
1675
1676 if (a.xdrd[1] > b.xdrd[1]) {
1677 r.xui[2] = (unsigned) 0xFFFFFFFF;
1678 r.xui[3] = (unsigned) 0xFFFFFFFF;
1679 }
1680
1681 return r;
1682 }
1683
1684 static inline __m128i
1685 __attribute__((always_inline))
vec_cmpgt(const __m128i & a,const __m128i & b)1686 vec_cmpgt(const __m128i& a, const __m128i& b)
1687 {
1688 __m128i r;
1689
1690 if (a.xird[0] > b.xird[0])
1691 r.xui[0] = (unsigned) 0xFFFFFFFF;
1692
1693 if (a.xird[1] > b.xird[1])
1694 r.xui[1] = (unsigned) 0xFFFFFFFF;
1695
1696 if (a.xird[2] > b.xird[2])
1697 r.xui[2] = (unsigned) 0xFFFFFFFF;
1698
1699 if (a.xird[3] > b.xird[3])
1700 r.xui[3] = (unsigned) 0xFFFFFFFF;
1701
1702 return r;
1703 }
1704
1705 static inline __m128
1706 __attribute__((always_inline))
vec_and(const __m128 & a,const __m128 & b)1707 vec_and(const __m128& a, const __m128& b)
1708 {
1709 __m128 r(a);
1710
1711 r.c[0] &= b.c[0];
1712 r.c[1] &= b.c[1];
1713 r.c[2] &= b.c[2];
1714 r.c[3] &= b.c[3];
1715 r.c[4] &= b.c[4];
1716 r.c[5] &= b.c[5];
1717 r.c[6] &= b.c[6];
1718 r.c[7] &= b.c[7];
1719 r.c[8] &= b.c[8];
1720 r.c[9] &= b.c[9];
1721 r.c[10] &= b.c[10];
1722 r.c[11] &= b.c[11];
1723 r.c[12] &= b.c[12];
1724 r.c[13] &= b.c[13];
1725 r.c[14] &= b.c[14];
1726 r.c[15] &= b.c[15];
1727
1728 return r;
1729 }
1730
1731 static inline __m128d
1732 __attribute__((always_inline))
vec_and(const __m128d & a,const __m128d & b)1733 vec_and(const __m128d& a, const __m128d& b)
1734 {
1735 __m128d r(a);
1736
1737 r.c[0] &= b.c[0];
1738 r.c[1] &= b.c[1];
1739 r.c[2] &= b.c[2];
1740 r.c[3] &= b.c[3];
1741 r.c[4] &= b.c[4];
1742 r.c[5] &= b.c[5];
1743 r.c[6] &= b.c[6];
1744 r.c[7] &= b.c[7];
1745 r.c[8] &= b.c[8];
1746 r.c[9] &= b.c[9];
1747 r.c[10] &= b.c[10];
1748 r.c[11] &= b.c[11];
1749 r.c[12] &= b.c[12];
1750 r.c[13] &= b.c[13];
1751 r.c[14] &= b.c[14];
1752 r.c[15] &= b.c[15];
1753
1754 return r;
1755 }
1756
1757 static inline __m128i
1758 __attribute__((always_inline))
vec_and(const __m128i & a,const __m128i & b)1759 vec_and(const __m128i& a, const __m128i& b)
1760 {
1761 __m128i r(a);
1762
1763 r.xird[0] &= b.xird[0];
1764 r.xird[1] &= b.xird[1];
1765 r.xird[2] &= b.xird[2];
1766 r.xird[3] &= b.xird[3];
1767
1768 return r;
1769 }
1770
1771 static inline __m128
1772 __attribute__((always_inline))
vec_xor(const __m128 & a,const __m128 & b)1773 vec_xor(const __m128& a, const __m128& b)
1774 {
1775 __m128 r(a);
1776
1777 r.c[0] ^= b.c[0];
1778 r.c[1] ^= b.c[1];
1779 r.c[2] ^= b.c[2];
1780 r.c[3] ^= b.c[3];
1781 r.c[4] ^= b.c[4];
1782 r.c[5] ^= b.c[5];
1783 r.c[6] ^= b.c[6];
1784 r.c[7] ^= b.c[7];
1785 r.c[8] ^= b.c[8];
1786 r.c[9] ^= b.c[9];
1787 r.c[10] ^= b.c[10];
1788 r.c[11] ^= b.c[11];
1789 r.c[12] ^= b.c[12];
1790 r.c[13] ^= b.c[13];
1791 r.c[14] ^= b.c[14];
1792 r.c[15] ^= b.c[15];
1793
1794 return r;
1795 }
1796
1797 static inline __m128d
1798 __attribute__((always_inline))
vec_xor(const __m128d & a,const __m128d & b)1799 vec_xor(const __m128d& a, const __m128d& b)
1800 {
1801 __m128d r(a);
1802
1803 r.c[0] ^= b.c[0];
1804 r.c[1] ^= b.c[1];
1805 r.c[2] ^= b.c[2];
1806 r.c[3] ^= b.c[3];
1807 r.c[4] ^= b.c[4];
1808 r.c[5] ^= b.c[5];
1809 r.c[6] ^= b.c[6];
1810 r.c[7] ^= b.c[7];
1811 r.c[8] ^= b.c[8];
1812 r.c[9] ^= b.c[9];
1813 r.c[10] ^= b.c[10];
1814 r.c[11] ^= b.c[11];
1815 r.c[12] ^= b.c[12];
1816 r.c[13] ^= b.c[13];
1817 r.c[14] ^= b.c[14];
1818 r.c[15] ^= b.c[15];
1819
1820 return r;
1821 }
1822
1823 static inline __m128i
1824 __attribute__((always_inline))
vec_xor(const __m128i & a,const __m128i & b)1825 vec_xor(const __m128i& a, const __m128i& b)
1826 {
1827 __m128i r(a);
1828
1829 r.xird[0] ^= b.xird[0];
1830 r.xird[1] ^= b.xird[1];
1831 r.xird[2] ^= b.xird[2];
1832 r.xird[3] ^= b.xird[3];
1833
1834 return r;
1835 }
1836
1837 static inline __m128
1838 __attribute__((always_inline))
vec_or(const __m128 & a,const __m128 & b)1839 vec_or(const __m128& a, const __m128& b)
1840 {
1841 __m128 r(a);
1842
1843 r.c[0] |= b.c[0];
1844 r.c[1] |= b.c[1];
1845 r.c[2] |= b.c[2];
1846 r.c[3] |= b.c[3];
1847 r.c[4] |= b.c[4];
1848 r.c[5] |= b.c[5];
1849 r.c[6] |= b.c[6];
1850 r.c[7] |= b.c[7];
1851 r.c[8] |= b.c[8];
1852 r.c[9] |= b.c[9];
1853 r.c[10] |= b.c[10];
1854 r.c[11] |= b.c[11];
1855 r.c[12] |= b.c[12];
1856 r.c[13] |= b.c[13];
1857 r.c[14] |= b.c[14];
1858 r.c[15] |= b.c[15];
1859
1860 return r;
1861 }
1862
1863 static inline __m128d
1864 __attribute__((always_inline))
vec_or(const __m128d & a,const __m128d & b)1865 vec_or(const __m128d& a, const __m128d& b)
1866 {
1867 __m128d r(a);
1868
1869 r.c[0] |= b.c[0];
1870 r.c[1] |= b.c[1];
1871 r.c[2] |= b.c[2];
1872 r.c[3] |= b.c[3];
1873 r.c[4] |= b.c[4];
1874 r.c[5] |= b.c[5];
1875 r.c[6] |= b.c[6];
1876 r.c[7] |= b.c[7];
1877 r.c[8] |= b.c[8];
1878 r.c[9] |= b.c[9];
1879 r.c[10] |= b.c[10];
1880 r.c[11] |= b.c[11];
1881 r.c[12] |= b.c[12];
1882 r.c[13] |= b.c[13];
1883 r.c[14] |= b.c[14];
1884 r.c[15] |= b.c[15];
1885
1886 return r;
1887 }
1888
1889 static inline __m128i
1890 __attribute__((always_inline))
vec_or(const __m128i & a,const __m128i & b)1891 vec_or(const __m128i& a, const __m128i& b)
1892 {
1893 __m128i r(a);
1894
1895 r.xird[0] |= b.xird[0];
1896 r.xird[1] |= b.xird[1];
1897 r.xird[2] |= b.xird[2];
1898 r.xird[3] |= b.xird[3];
1899
1900 return r;
1901 }
1902
1903 static inline
1904 __attribute__((always_inline))
vec_compl(const __m128 & a)1905 __m128 vec_compl(const __m128& a)
1906 {
1907 __m128 r(a);
1908
1909 r.xi[0] = ~r.xi[0];
1910 r.xi[1] = ~r.xi[1];
1911 r.xi[2] = ~r.xi[2];
1912 r.xi[3] = ~r.xi[3];
1913
1914 return r;
1915 }
1916
1917 static inline __m128d
1918 __attribute__((always_inline))
vec_compl(const __m128d & a)1919 vec_compl(const __m128d& a)
1920 {
1921 __m128d r(a);
1922
1923 r.xi[0] = ~r.xi[0];
1924 r.xi[1] = ~r.xi[1];
1925 r.xi[2] = ~r.xi[2];
1926 r.xi[3] = ~r.xi[3];
1927
1928 return r;
1929 }
1930
1931 static inline
1932 __attribute__((always_inline))
vec_compl(const __m128i & a)1933 __m128i vec_compl(const __m128i& a)
1934 {
1935 __m128i r(a);
1936
1937 r.xi[0] = ~r.xi[0];
1938 r.xi[1] = ~r.xi[1];
1939 r.xi[2] = ~r.xi[2];
1940 r.xi[3] = ~r.xi[3];
1941
1942 return r;
1943 }
1944
1945 static inline __m128
1946 __attribute__((always_inline))
vec_andc(const __m128 & a,const __m128 & b)1947 vec_andc(const __m128& a, const __m128& b)
1948 {
1949 return vec_and(a, vec_compl(b));
1950 }
1951
1952 static inline __m128d
1953 __attribute__((always_inline))
vec_andc(const __m128d & a,const __m128d & b)1954 vec_andc(const __m128d& a, const __m128d& b)
1955 {
1956 return vec_and(a, vec_compl(b));
1957 }
1958
1959 static inline __m128i
1960 __attribute__((always_inline))
vec_andc(const __m128i & a,const __m128i & b)1961 vec_andc(const __m128i& a, const __m128i& b)
1962 {
1963 return vec_and(a, vec_compl(b));
1964 }
1965
1966 static inline __m128i
1967 __attribute__((always_inline))
vec_sl(const __m128i & a,const __m128i & b)1968 vec_sl(const __m128i& a, const __m128i& b)
1969 {
1970 __m128i r;
1971
1972 r.xird[0] = (a.xird[0] << b.xird[0]) % 32;
1973 r.xird[1] = (a.xird[1] << b.xird[1]) % 32;
1974 r.xird[2] = (a.xird[2] << b.xird[2]) % 32;
1975 r.xird[3] = (a.xird[3] << b.xird[3]) % 32;
1976
1977 return r;
1978 }
1979
1980 static inline __m128i
1981 __attribute__((always_inline))
vec_sr(const __m128i & a,const __m128i & b)1982 vec_sr(const __m128i& a, const __m128i& b)
1983 {
1984 __m128i r;
1985
1986 r.xird[0] = (a.xird[0] >> b.xird[0]) % 32;
1987 r.xird[1] = (a.xird[1] >> b.xird[1]) % 32;
1988 r.xird[2] = (a.xird[2] >> b.xird[2]) % 32;
1989 r.xird[3] = (a.xird[3] >> b.xird[3]) % 32;
1990
1991 return r;
1992 }
1993
/*
 * Convert fixed-point integers to float, scaled by 2^-i: each lane is
 * divided by 2^i (rounding toward zero) and converted to float.
 * Dispatches on the element-type tag stored in the __m128i.
 *
 * The expression (x + ((x >> 31) & ((1 << i) + ~0))) >> i is
 * round-toward-zero signed division by 2^i: for negative x the bias
 * 2^i - 1 is added before the arithmetic shift.
 *
 * NOTE(review): (1 << i) is undefined for i == 31 even though the
 * assert allows it -- confirm callers never pass 31.
 * NOTE(review): the 64-bit branches shift by 31, not 63, so the
 * sign-bias is taken from the wrong bit for longs -- verify.
 * NOTE(review): in the 64-bit branches lanes 1 and 3 of r are left
 * uninitialized -- confirm callers only read lanes 0 and 2.
 */
static inline __m128
__attribute__((always_inline))
vec_ctf(const __m128i& a, unsigned i)
{
  __m128 r;
  assert(i < 32U && "Invalid exponent!");

  if (a.type == __m128i::SignedInt) {
    r.xfrd[0] = (float) ((a.xi[0] + ((a.xi[0] >> 31) & ((1 << i) + ~0))) >> i);
    r.xfrd[1] = (float) ((a.xi[1] + ((a.xi[1] >> 31) & ((1 << i) + ~0))) >> i);
    r.xfrd[2] = (float) ((a.xi[2] + ((a.xi[2] >> 31) & ((1 << i) + ~0))) >> i);
    r.xfrd[3] = (float) ((a.xi[3] + ((a.xi[3] >> 31) & ((1 << i) + ~0))) >> i);
  } else if (a.type == __m128i::UnsignedInt) {
    r.xfrd[0] =
        (float) ((a.xui[0] + ((a.xui[0] >> 31) & ((1 << i) + ~0))) >> i);
    r.xfrd[1] =
        (float) ((a.xui[1] + ((a.xui[1] >> 31) & ((1 << i) + ~0))) >> i);
    r.xfrd[2] =
        (float) ((a.xui[2] + ((a.xui[2] >> 31) & ((1 << i) + ~0))) >> i);
    r.xfrd[3] =
        (float) ((a.xui[3] + ((a.xui[3] >> 31) & ((1 << i) + ~0))) >> i);
  } else if (a.type == __m128i::SignedLong) {
    /* 64-bit lanes: results land in float lanes 0 and 2. */
    r.xfrd[0] =
        (float) ((a.xl[0] + ((a.xl[0] >> 31) & ((1 << i) + ~0))) >> i);
    r.xfrd[2] =
        (float) ((a.xl[1] + ((a.xl[1] >> 31) & ((1 << i) + ~0))) >> i);
  } else if (a.type == __m128i::UnsignedLong) {
    r.xfrd[0] =
        (float) ((a.xul[0] + ((a.xul[0] >> 31) & ((1 << i) + ~0))) >> i);
    r.xfrd[2] =
        (float) ((a.xul[1] + ((a.xul[1] >> 31) & ((1 << i) + ~0))) >> i);
  }

  return r;
}
2029
2030 static inline __m128i
2031 __attribute__((always_inline))
vec_fixed(const __m128 & a)2032 vec_fixed(const __m128& a)
2033 {
2034 __m128i r;
2035
2036 r.xird[0] = (int) a.xfrd[0];
2037 r.xird[1] = (int) a.xfrd[1];
2038 r.xird[2] = (int) a.xfrd[2];
2039 r.xird[3] = (int) a.xfrd[3];
2040
2041 return r;
2042 }
2043
2044 static inline __m128i
2045 __attribute__((always_inline))
vec_fixed(const __m128d & a)2046 vec_fixed(const __m128d& a)
2047 {
2048 __m128i r;
2049
2050 r.xird[0] = (int) a.xdrd[0];
2051 r.xird[2] = (int) a.xdrd[1];
2052
2053 return r;
2054 }
2055
2056 static inline __m128i
2057 __attribute__((always_inline))
vec_cts(const __m128 & a,int i)2058 vec_cts(const __m128& a, int i)
2059 {
2060 __m128i r;
2061
2062 assert((i >= 0 && i < 32) && "Invalid exponent!");
2063
2064 // FIXME: Expand to bitwise.
2065 r.xird[0] = (int) ldexpf(a.xfrd[0], (int) i);
2066 r.xird[1] = (int) ldexpf(a.xfrd[1], (int) i);
2067 r.xird[2] = (int) ldexpf(a.xfrd[2], (int) i);
2068 r.xird[3] = (int) ldexpf(a.xfrd[3], (int) i);
2069
2070 return r;
2071 }
2072
2073 static inline __m128i
2074 __attribute__((always_inline))
vec_cts(const __m128d & a,int i)2075 vec_cts(const __m128d& a, int i)
2076 {
2077 __m128i r;
2078
2079 assert((i >= 0) && (i < 32) && "Invalid exponent!");
2080
2081 // FIXME: Expand to bitwise.
2082 // The values of xi[1] and xi[3] are undefined (zero).
2083 r.xird[0] = (int) ldexp(a.xdrd[0], (int) i);
2084 r.xird[2] = (int) ldexp(a.xdrd[1], (int) i);
2085
2086 return r;
2087 }
2088
2089 static inline __m128
2090 __attribute__((always_inline))
vec_cmplt(const __m128 & a,const __m128 & b)2091 vec_cmplt(const __m128& a, const __m128& b)
2092 {
2093 __m128 r;
2094
2095 if (a.xfrd[0] < b.xfrd[0])
2096 r.xi[0] = (unsigned) 0xFFFFFFFF;
2097
2098 if (a.xfrd[1] < b.xfrd[1])
2099 r.xi[1] = (unsigned) 0xFFFFFFFF;
2100
2101 if (a.xfrd[2] < b.xfrd[2])
2102 r.xi[2] = (unsigned) 0xFFFFFFFF;
2103
2104 if (a.xfrd[3] < b.xfrd[3])
2105 r.xi[3] = (unsigned) 0xFFFFFFFF;
2106
2107 return r;
2108 }
2109
2110 static inline __m128d
2111 __attribute__((always_inline))
vec_cmplt(const __m128d & a,const __m128d & b)2112 vec_cmplt(const __m128d& a, const __m128d& b)
2113 {
2114 __m128d r;
2115
2116 if (a.xdrd[0] < b.xdrd[0]) {
2117 r.xi[0] = (unsigned) 0xFFFFFFFF;
2118 r.xi[1] = (unsigned) 0xFFFFFFFF;
2119 }
2120
2121 if (a.xdrd[1] < b.xdrd[1]) {
2122 r.xi[2] = (unsigned) 0xFFFFFFFF;
2123 r.xi[3] = (unsigned) 0xFFFFFFFF;
2124 }
2125
2126 return r;
2127 }
2128
2129 static inline __m128
2130 __attribute__((always_inline))
vec_trunc(const __m128 & a)2131 vec_trunc(const __m128& a)
2132 {
2133 __m128 r;
2134
2135 r.xfrd[0] = (float) ((int) a.xfrd[0]);
2136 r.xfrd[1] = (float) ((int) a.xfrd[1]);
2137 r.xfrd[2] = (float) ((int) a.xfrd[2]);
2138 r.xfrd[3] = (float) ((int) a.xfrd[3]);
2139
2140 return r;
2141 }
2142
2143 static inline __m128d
2144 __attribute__((always_inline))
vec_trunc(const __m128d & a)2145 vec_trunc(const __m128d& a)
2146 {
2147 __m128d r;
2148
2149 r.xdrd[0] = (double) ((long) a.xdrd[0]);
2150 r.xdrd[1] = (double) ((long) a.xdrd[1]);
2151
2152 return r;
2153 }
2154
/*
 * floor() per lane via the add-bias-and-truncate trick: adding 2^24-1
 * makes the operand non-negative, so the int cast's truncation toward
 * zero acts as truncation toward -infinity of the original value.
 *
 * NOTE(review): this is only valid for lanes in roughly
 * (-16777215, 16777215), and float's 24-bit mantissa means the addition
 * itself can round away fractional bits near the top of that range --
 * confirm callers keep inputs well inside it.
 */
static inline __m128
__attribute__((always_inline))
vec_floor(const __m128& a)
{
  __m128 r;

  r.xfrd[0] = (int) (a.xfrd[0] + 16777215.0f) - (int) 16777215;
  r.xfrd[1] = (int) (a.xfrd[1] + 16777215.0f) - (int) 16777215;
  r.xfrd[2] = (int) (a.xfrd[2] + 16777215.0f) - (int) 16777215;
  r.xfrd[3] = (int) (a.xfrd[3] + 16777215.0f) - (int) 16777215;

  return r;
}
2168
/*
 * floor() per double lane via the add-bias-and-truncate trick (see the
 * float overload above): the bias makes the operand non-negative so the
 * long cast truncates toward -infinity of the original value.
 *
 * NOTE(review): only valid for lanes greater than -2147418111 and small
 * enough that the addition stays exact in double -- confirm callers
 * keep inputs in range.
 */
static inline __m128d
__attribute__((always_inline))
vec_floor(const __m128d& a)
{
  __m128d r;

  r.xdrd[0] = (long) (a.xdrd[0] + 2147418111.00) - (long) 2147418111;
  r.xdrd[1] = (long) (a.xdrd[1] + 2147418111.00) - (long) 2147418111;

  return r;
}
2180
2181 static inline __m128
2182 __attribute__((always_inline))
vec_div(const __m128 & a,const __m128 & b)2183 vec_div(const __m128& a, const __m128& b)
2184 {
2185 __m128 r = a / b;
2186 return r;
2187 }
2188
2189 static inline __m128d
2190 __attribute__((always_inline))
vec_div(const __m128d & a,const __m128d & b)2191 vec_div(const __m128d& a, const __m128d& b)
2192 {
2193 __m128d r = a / b;
2194 return r;
2195 }
2196
/*
 * Lane-wise division of integer vectors.
 *
 * NOTE(review): unlike the float/double overloads this one returns
 * __m128 (a float vector) for integer operands, relying on a
 * conversion from the division result -- verify this return type is
 * intentional and not a typo for __m128i.
 */
static inline __m128
__attribute__((always_inline))
vec_div(const __m128i& a, const __m128i& b)
{
  __m128 r = a / b;
  return r;
}
2204
2205 static inline __m128
2206 __attribute__((always_inline))
vec_max(const __m128 & a,const __m128 & b)2207 vec_max(const __m128& a, const __m128& b)
2208 {
2209 __m128 r;
2210
2211 r.xfrd[0] = a.xfrd[0] > b.xfrd[0] ? a.xfrd[0] : b.xfrd[0];
2212 r.xfrd[1] = a.xfrd[1] > b.xfrd[1] ? a.xfrd[1] : b.xfrd[1];
2213 r.xfrd[2] = a.xfrd[2] > b.xfrd[2] ? a.xfrd[2] : b.xfrd[2];
2214 r.xfrd[3] = a.xfrd[3] > b.xfrd[3] ? a.xfrd[3] : b.xfrd[3];
2215
2216 return r;
2217 }
2218
2219 static inline __m128d
2220 __attribute__((always_inline))
vec_max(const __m128d & a,const __m128d & b)2221 vec_max(const __m128d& a, const __m128d& b)
2222 {
2223 __m128d r;
2224
2225 r.xdrd[0] = a.xdrd[0] > b.xdrd[0] ? a.xdrd[0] : b.xdrd[0];
2226 r.xdrd[1] = a.xdrd[1] > b.xdrd[1] ? a.xdrd[1] : b.xdrd[1];
2227
2228 return r;
2229 }
2230
/*
 * Lane-wise integer maximum, dispatching on the element-type tag that
 * both operands carry (signed/unsigned, 32/64-bit lanes).
 *
 * NOTE(review): if the operands' tags differ, or carry a value outside
 * the four handled cases, r is returned uninitialized -- confirm
 * callers always pass matching, tagged operands.  r.type is also never
 * set on the result; verify downstream code does not dispatch on it.
 */
static inline __m128i
__attribute__((always_inline))
vec_max(const __m128i& a, const __m128i& b)
{
  __m128i r;

  if ((a.type == b.type) && (a.type == __s128i::SignedInt)) {
    r.xird[0] = a.xird[0] > b.xird[0] ? a.xird[0] : b.xird[0];
    r.xird[1] = a.xird[1] > b.xird[1] ? a.xird[1] : b.xird[1];
    r.xird[2] = a.xird[2] > b.xird[2] ? a.xird[2] : b.xird[2];
    r.xird[3] = a.xird[3] > b.xird[3] ? a.xird[3] : b.xird[3];
  } else if ((a.type == b.type) && (a.type == __s128i::UnsignedInt)) {
    r.xui[0] = a.xui[0] > b.xui[0] ? a.xui[0] : b.xui[0];
    r.xui[1] = a.xui[1] > b.xui[1] ? a.xui[1] : b.xui[1];
    r.xui[2] = a.xui[2] > b.xui[2] ? a.xui[2] : b.xui[2];
    r.xui[3] = a.xui[3] > b.xui[3] ? a.xui[3] : b.xui[3];
  } else if ((a.type == b.type) && (a.type == __s128i::SignedLong)) {
    r.xlrd[0] = a.xlrd[0] > b.xlrd[0] ? a.xlrd[0] : b.xlrd[0];
    r.xlrd[1] = a.xlrd[1] > b.xlrd[1] ? a.xlrd[1] : b.xlrd[1];
  } else if ((a.type == b.type) && (a.type == __s128i::UnsignedLong)) {
    r.xul[0] = a.xul[0] > b.xul[0] ? a.xul[0] : b.xul[0];
    r.xul[1] = a.xul[1] > b.xul[1] ? a.xul[1] : b.xul[1];
  }

  return r;
}
2257
2258 static inline __m128
2259 __attribute__((always_inline))
vec_min(const __m128 & a,const __m128 & b)2260 vec_min(const __m128& a, const __m128& b)
2261 {
2262 __m128 r;
2263
2264 r.xfrd[0] = a.xfrd[0] < b.xfrd[0] ? a.xfrd[0] : b.xfrd[0];
2265 r.xfrd[1] = a.xfrd[1] < b.xfrd[1] ? a.xfrd[1] : b.xfrd[1];
2266 r.xfrd[2] = a.xfrd[2] < b.xfrd[2] ? a.xfrd[2] : b.xfrd[2];
2267 r.xfrd[3] = a.xfrd[3] < b.xfrd[3] ? a.xfrd[3] : b.xfrd[3];
2268
2269 return r;
2270 }
2271
2272 static inline __m128d
2273 __attribute__((always_inline))
vec_min(const __m128d & a,const __m128d & b)2274 vec_min(const __m128d& a, const __m128d& b)
2275 {
2276 __m128d r;
2277
2278 r.xdrd[0] = a.xdrd[0] < b.xdrd[0] ? a.xdrd[0] : b.xdrd[0];
2279 r.xdrd[1] = a.xdrd[1] < b.xdrd[1] ? a.xdrd[1] : b.xdrd[1];
2280
2281 return r;
2282 }
2283
/*
 * Lane-wise integer minimum, dispatching on the element-type tag that
 * both operands carry (signed/unsigned, 32/64-bit lanes).
 *
 * NOTE(review): if the operands' tags differ, or carry a value outside
 * the four handled cases, r is returned uninitialized -- confirm
 * callers always pass matching, tagged operands.  r.type is also never
 * set on the result; verify downstream code does not dispatch on it.
 */
static inline __m128i
__attribute__((always_inline))
vec_min(const __m128i& a, const __m128i& b)
{
  __m128i r;

  if ((a.type == b.type) && (a.type == __s128i::SignedInt)) {
    r.xird[0] = a.xird[0] < b.xird[0] ? a.xird[0] : b.xird[0];
    r.xird[1] = a.xird[1] < b.xird[1] ? a.xird[1] : b.xird[1];
    r.xird[2] = a.xird[2] < b.xird[2] ? a.xird[2] : b.xird[2];
    r.xird[3] = a.xird[3] < b.xird[3] ? a.xird[3] : b.xird[3];
  } else if ((a.type == b.type) && (a.type == __s128i::UnsignedInt)) {
    r.xui[0] = a.xui[0] < b.xui[0] ? a.xui[0] : b.xui[0];
    r.xui[1] = a.xui[1] < b.xui[1] ? a.xui[1] : b.xui[1];
    r.xui[2] = a.xui[2] < b.xui[2] ? a.xui[2] : b.xui[2];
    r.xui[3] = a.xui[3] < b.xui[3] ? a.xui[3] : b.xui[3];
  } else if ((a.type == b.type) && (a.type == __s128i::SignedLong)) {
    r.xlrd[0] = a.xlrd[0] < b.xlrd[0] ? a.xlrd[0] : b.xlrd[0];
    r.xlrd[1] = a.xlrd[1] < b.xlrd[1] ? a.xlrd[1] : b.xlrd[1];
  } else if ((a.type == b.type) && (a.type == __s128i::UnsignedLong)) {
    r.xul[0] = a.xul[0] < b.xul[0] ? a.xul[0] : b.xul[0];
    r.xul[1] = a.xul[1] < b.xul[1] ? a.xul[1] : b.xul[1];
  }

  return r;
}
2310
2311 static inline __m128i
2312 __attribute__((always_inline))
vec_cmplt(const __m128i & a,const __m128i & b)2313 vec_cmplt(const __m128i& a, const __m128i& b)
2314 {
2315 __m128i r;
2316
2317 if (a.xird[0] < b.xird[0])
2318 r.xird[0] = (unsigned) 0xFFFFFFFF;
2319
2320 if (a.xird[1] < b.xird[1])
2321 r.xird[1] = (unsigned) 0xFFFFFFFF;
2322
2323 if (a.xird[2] < b.xird[2])
2324 r.xird[2] = (unsigned) 0xFFFFFFFF;
2325
2326 if (a.xird[3] < b.xird[3])
2327 r.xird[3] = (unsigned) 0xFFFFFFFF;
2328
2329 return r;
2330 }
2331
2332 static inline __m128
2333 __attribute__((always_inline))
vec_sel(const __m128 & a,const __m128 & b,const __m128 & c)2334 vec_sel(const __m128& a, const __m128& b, const __m128& c)
2335 {
2336 __m128 r;
2337
2338 for (unsigned i = 0; i < 4; ++i) {
2339 for (unsigned j = 0; j < 32; ++j) {
2340 if (c.xi[i] & (1 << j)) {
2341 if (b.xi[i] & (1 << j))
2342 r.xi[i] |= (b.xi[i] & (1 << j));
2343 else
2344 r.xi[i] &= ~(b.xi[i] & (1 << j));
2345 } else {
2346 if (a.xi[i] & (1 << j))
2347 r.xi[i] |= (a.xi[i] & (1 << j));
2348 else
2349 r.xi[i] &= ~(a.xi[i] & (1 << j));
2350 }
2351 }
2352 }
2353
2354 return r;
2355 }
2356
2357 static inline __m128d
2358 __attribute__((always_inline))
vec_sel(const __m128d & a,const __m128d & b,const __m128d & c)2359 vec_sel(const __m128d& a, const __m128d& b, const __m128d& c)
2360 {
2361 __m128d r;
2362
2363 for (unsigned i = 0; i < 4; ++i) {
2364 for (unsigned j = 0; j < 32; ++j) {
2365 if (c.xi[i] & (1 << j)) {
2366 if (b.xi[i] & (1 << j))
2367 r.xi[i] |= (b.xi[i] & (1 << j));
2368 else
2369 r.xi[i] &= ~(b.xi[i] & (1 << j));
2370 } else {
2371 if (a.xi[i] & (1 << j))
2372 r.xi[i] |= (a.xi[i] & (1 << j));
2373 else
2374 r.xi[i] &= ~(a.xi[i] & (1 << j));
2375 }
2376 }
2377 }
2378
2379 return r;
2380 }
2381
2382 static inline __m128i
2383 __attribute__((always_inline))
vec_sel(const __m128i & a,const __m128i & b,const __m128i & c)2384 vec_sel(const __m128i& a, const __m128i& b, const __m128i& c)
2385 {
2386 __m128i r;
2387
2388 for (unsigned i = 0; i < 4; ++i) {
2389 for (unsigned j = 0; j < 32; ++j) {
2390 if (c.xi[i] & (1 << j)) {
2391 if (b.xi[i] & (1 << j))
2392 r.xi[i] |= (b.xi[i] & (1 << j));
2393 else
2394 r.xi[i] &= ~(b.xi[i] & (1 << j));
2395 } else {
2396 if (a.xi[i] & (1 << j))
2397 r.xi[i] |= (a.xi[i] & (1 << j));
2398 else
2399 r.xi[i] &= ~(a.xi[i] & (1 << j));
2400 }
2401 }
2402 }
2403
2404 return r;
2405 }
2406
2407 static inline void
2408 __attribute__((always_inline))
vec_sti(const __m128i & vi,int v,unsigned char * t)2409 vec_sti(const __m128i& vi, int v, unsigned char* t)
2410 {
2411 __m128i vix = v ? vi + v : vi;
2412 unsigned char* x = vix.c;
2413
2414 t[0] = x[0];
2415 t[1] = x[1];
2416 t[2] = x[2];
2417 t[3] = x[3];
2418 t[4] = x[4];
2419 t[5] = x[5];
2420 t[6] = x[6];
2421 t[7] = x[7];
2422 t[8] = x[8];
2423 t[9] = x[9];
2424 t[10] = x[10];
2425 t[11] = x[11];
2426 t[12] = x[12];
2427 t[13] = x[13];
2428 t[14] = x[14];
2429 t[15] = x[15];
2430 }
2431
2432 static inline void
2433 __attribute__((always_inline))
vec_stf(const __m128 & vf,int v,unsigned char * t)2434 vec_stf(const __m128& vf, int v, unsigned char* t)
2435 {
2436 __m128 vfx = v ? vf + v : vf;
2437 unsigned char* x = vfx.c;
2438
2439 t[0] = x[0];
2440 t[1] = x[1];
2441 t[2] = x[2];
2442 t[3] = x[3];
2443 t[4] = x[4];
2444 t[5] = x[5];
2445 t[6] = x[6];
2446 t[7] = x[7];
2447 t[8] = x[8];
2448 t[9] = x[9];
2449 t[10] = x[10];
2450 t[11] = x[11];
2451 t[12] = x[12];
2452 t[13] = x[13];
2453 t[14] = x[14];
2454 t[15] = x[15];
2455 }
2456
2457 static inline void
2458 __attribute__((always_inline))
vec_std(const __m128d & vd,int v,unsigned char * t)2459 vec_std(const __m128d& vd, int v, unsigned char* t)
2460 {
2461 __m128d vdx = v ? vd + v : vd;
2462 unsigned char* x = vdx.c;
2463
2464 t[0] = x[0];
2465 t[1] = x[1];
2466 t[2] = x[2];
2467 t[3] = x[3];
2468 t[4] = x[4];
2469 t[5] = x[5];
2470 t[6] = x[6];
2471 t[7] = x[7];
2472 t[8] = x[8];
2473 t[9] = x[9];
2474 t[10] = x[10];
2475 t[11] = x[11];
2476 t[12] = x[12];
2477 t[13] = x[13];
2478 t[14] = x[14];
2479 t[15] = x[15];
2480 }
2481
2482 /*
2483 * No corresponding Altivec intrinsic to generate a scalar mask
2484 * from corresponding vector elements.
2485 */
/*
 * Gather the top bit of each of the 16 bytes of a into a 16-bit mask.
 *
 * NOTE(review): with this loop, byte 0 (as stored by vec_st) lands in
 * the MOST significant bit of the result -- the reverse of x86
 * _mm_movemask_epi8, where byte 0 maps to bit 0.  Verify the intended
 * lane order against vec_st's byte layout on this target.
 */
static inline unsigned int
__attribute__((always_inline))
_mm_movemask_epi8(const __m128i& a)
{
  unsigned char t[16] __attribute__((aligned(16)));
  unsigned int r;
  int i;

  vec_st(a, 0, t);
  r = 0;
  for (i = 0; i < 16; i++) {
    r = (r << 1) | (t[i] >> 7);   /* shift in each byte's sign bit */
  }

  return r;
}
2502
/*
 * Gather the top bit of each of the four 32-bit lanes of a into a
 * 4-bit mask.
 *
 * NOTE(review): with this loop, lane 0 (as stored by vec_st) lands in
 * bit 3 of the result, the reverse of the lane->bit mapping used by
 * _mm_movemask_ps below -- verify the intended bit order.
 */
static inline unsigned int
__attribute__((always_inline))
_mm_movemask_epi32(const __m128i& a)
{
  unsigned int t[4] __attribute__((aligned(16)));
  unsigned int r;
  int i;

  vec_st(a, 0, t);
  r = 0;
  for (i = 0; i < 4; i++) {
    r = (r << 1) | (t[i] >> 31);   /* shift in each lane's sign bit */
  }

  return r;
}
2519
2520 static inline unsigned int
2521 __attribute__((always_inline))
_mm_movemask_ps(const __m128 & a)2522 _mm_movemask_ps(const __m128& a)
2523 {
2524 return ((a.xui[3] >> 31) << 3) | ((a.xui[2] >> 31) << 2) |
2525 ((a.xui[1] >> 31) << 1) | ((a.xui[0] >> 31) << 0);
2526 }
2527
2528 static inline unsigned int
2529 __attribute__((always_inline))
_mm_movemask_pd(const __m128d & a)2530 _mm_movemask_pd(const __m128d& a)
2531 {
2532 return ((a.xul[1] >> 63) << 1) | ((a.xul[0] >> 63) << 0);
2533 }
2534
2535 static inline __m128i
2536 __attribute__((always_inline))
_mm_blend_epi32(const __m128i & a,const __m128i & b,int imm8)2537 _mm_blend_epi32(const __m128i& a, const __m128i& b, int imm8)
2538 {
2539 unsigned int t[4] __attribute__((aligned(16)));
2540 int i;
2541
2542 vec_st(a, 0, t);
2543 for (i = 0; i < 3; i++) {
2544 if (imm8 & 0x1)
2545 t[i] = vec_extract(b, i);
2546 imm8 >>= 1;
2547 }
2548
2549 // FIXME: Check the cast below.
2550 return vec_ld(0, (__m128i*) t);
2551 }
2552
/*
 * Set float lanes in "reverse" (memory) order: lane 0 = e3 ... lane 3
 * = e0, mirroring the x86 _mm_setr_ps argument convention.
 */
static inline __m128
__attribute__((always_inline))
_mm_setr_ps(float e3, float e2, float e1, float e0)
{
  __m128 e = { e3, e2, e1, e0 };
  return e;
}
2560
/*
 * Set double lanes in "reverse" (memory) order: lane 0 = e1, lane 1 =
 * e0, mirroring the x86 _mm_setr_pd argument convention.
 */
static inline __m128d
__attribute__((always_inline))
_mm_setr_pd(double e1, double e0)
{
  __m128d e = { e1, e0 };
  return e;
}
2568
2569 static inline __m128d
2570 __attribute__((always_inline))
_mm_shuffle_pd(const __m128d & a,const __m128d & b,int imm8)2571 _mm_shuffle_pd(const __m128d& a, const __m128d& b, int imm8)
2572 {
2573 double r[2];
2574 r[0] = imm8 & 0x1 ? vec_extract(a, 1) : vec_extract(a, 0);
2575 r[1] = imm8 & 0x2 ? vec_extract(b, 1) : vec_extract(b, 0);
2576
2577 return vec_ld(0, (__m128d *)r);
2578 }
2579
2580 /*
2581 * Quick way to determine whether any element in a vector mask
2582 * register is set.
2583 *
2584 * No corresponding Altivec intrinsic.
2585 */
static inline unsigned int
__attribute__((always_inline))
_vec_any_nz(const __m128i& a)
{
  /* Nonzero iff any element of a differs from zero. */
  return vec_any_ne(a, (__typeof__(a)) vec_splats(0));
}
2592
/*
 * Convert two 32-bit ints to doubles.  The source ints are taken from
 * lanes 0 and 2, matching the even-lane storage convention used by
 * vec_cts(const __m128d&, int) above.
 *
 * NOTE(review): r is passed to vec_insert before ever being written,
 * so its initial contents are indeterminate; confirm vec_insert fully
 * overwrites the selected lane and callers only read lanes 0 and 1.
 */
static inline __m128d
__attribute__((always_inline))
_mm_cvtepi32_pd(const __m128i& a)
{
  __m128d r;

  r = vec_insert(1.0 * vec_extract(a, 0), r, 0);
  r = vec_insert(1.0 * vec_extract(a, 2), r, 1);

  return r;
}
2604
2605 static inline __m128d
2606 __attribute__((always_inline))
_mm_min_sd(const __m128d & a,const __m128d & b)2607 _mm_min_sd(const __m128d& a, const __m128d& b)
2608 {
2609 double aa = vec_extract(a, 0);
2610 double bb = vec_extract(b, 0);
2611 aa = aa < bb ? aa : bb;
2612 return vec_insert(aa, a, 0);
2613 }
2614
2615 static inline __m128d
2616 __attribute__((always_inline))
_mm_max_sd(const __m128d & a,const __m128d & b)2617 _mm_max_sd(const __m128d& a, const __m128d& b)
2618 {
2619 double aa = vec_extract(a, 0);
2620 double bb = vec_extract(b, 0);
2621 aa = aa > bb ? aa : bb;
2622 return vec_insert(aa, a, 0);
2623 }
2624
2625
2626 /*
2627 * Logical
2628 */
2629
#define _mm_andnot_ps(_v,_w) vec_andc(_w,_v) // different order of arguments
#define _mm_andnot_pd(_v,_w) vec_andc(_w,_v) // different order of arguments
2632 #define _mm_and_ps(_v,_w) vec_and(_v,_w)
2633 #define _mm_and_pd(_v,_w) vec_and(_v,_w)
2634 #define _mm_and_si128(_v,_w) vec_and(_v,_w)
2635 #define _mm_andnot_si128(_v,_w) vec_andc(_w,_v) // different order of arguments
2636 #define _mm_or_ps(_v,_w) vec_or(_v,_w)
2637 #define _mm_or_pd(_v,_w) vec_or(_v,_w)
2638 #define _mm_or_si128(_v,_w) vec_or(_v,_w)
2639 #define _mm_xor_ps(_v,_w) vec_xor(_v,_w)
2640 #define _mm_xor_pd(_v,_w) vec_xor(_v,_w)
2641 #define _mm_xor_si128(_v,_w) vec_xor(_v,_w)
2642
2643 /*
2644 * Broadcast
2645 */
2646
2647 #define _mm_set1_epi32(_v) (__m128i)vec_splats((int)_v)
2648 #define _mm_set1_epi64x(_v) (__m128i)vec_splats((long int)_v)
2649 #define _mm_set1_ps(_v) (__m128)vec_splats((float)_v)
2650 #define _mm_set1_pd(_v) (__m128d)vec_splats((double)_v)
2651 //#define _mm_setr_ps(_e,_f) (__m128d)vec_insert(_e, (__m128d)vec_splats(_f), 0)
2652 //#define _mm_setr_pd(_e,_f) (__m128d)vec_insert(_e, (__m128d)vec_splats(_f), 0)
2653 #define _mm_setzero_ps() (__m128)vec_splats((float)0.0)
2654 #define _mm_setzero_pd() (__m128d)vec_splats((double)0.0)
2655
2656 #define _mm_cvtps_epi32(_v) vec_cts(_v,0)
2657 // Need inline version #define _mm_cvtepi32_pd(_v) vec_ctd(_v,0)
2658 #define _mm_cvtepi32_ps(_v) vec_ctf(_v,0)
2659 #define _mm_cvtss_f32(_v) (float)vec_extract(_v,0)
2660 #define _mm_cvtsd_f64(_v) (double)vec_extract(_v,0)
2661 //#define _mm_cvtpd_ps(_v) (__m128)vec_cvf(_v) // Does not work
2662 #define _mm_cvtpd_ps(_v) vec_insert((float)vec_extract(_v,1), (vec_insert((float)vec_extract(_v,0), (__m128)vec_splats((float)0.0), 0)), 1)
2663 #define _mm_cvtss_sd(_v,_w) vec_insert((double) vec_extract(_w, 0), _v, 0)
2664 #define _mm_extract_ps(_v,_i) vec_extract(_v,_i)
2665
2666 /*
2667 * Floating point
2668 */
2669
2670 #define _mm_add_ps(_v,_w) vec_add(_v,_w)
2671 #define _mm_add_pd(_v,_w) vec_add(_v,_w)
2672 #define _mm_add_epi64(_v,_w) vec_add(_v,_w)
2673 #define _mm_mul_ps(_v,_w) vec_mul(_v,_w)
2674 #define _mm_mul_pd(_v,_w) vec_mul(_v,_w)
2675 #define _mm_sub_ps(_v,_w) vec_sub(_v,_w)
2676 #define _mm_sub_pd(_v,_w) vec_sub(_v,_w)
2677 #define _mm_sub_epi32(_v,_w) vec_sub(_v,_w)
2678 #define _mm_sub_epi64(_v,_w) vec_sub(_v,_w)
2679 #define _mm_div_ps(_v,_w) vec_div(_v,_w)
2680 #define _mm_div_pd(_v,_w) vec_div(_v,_w)
2681 #define _mm_sqrt_ps(_v) vec_sqrt(_v)
2682 #define _mm_sqrt_pd(_v) vec_sqrt(_v)
2683
2684 #define _mm_add_ss(_s,_t) (_s+_t)
2685 #define _mm_add_sd(_s,_t) (_s+_t)
2686 #define _mm_mul_ss(_s,_t) (_s*_t)
2687 #define _mm_mul_sd(_s,_t) (_s*_t)
2688 #define _mm_sub_ss(_s,_t) (_s-_t)
2689 #define _mm_sub_sd(_s,_t) (_s-_t)
2690 #define _mm_div_ss(_s,_t) (_s/_t)
2691 #define _mm_div_sd(_s,_t) (_s/_t)
2692
2693 #define _mm_floor_ps(_v) vec_floor(_v)
2694 #define _mm_floor_pd(_v) vec_floor(_v)
2695
2696 /*
2697 * FMA instructions.
2698 *
2699 * _mm_fnmadd_p{s,d} not the same as Altivec intrinsic vec_nmadd(a,b,c).
2700 * Altivec returns: -(a*b+c).
2701 * We want: (-(a*b)+c)
2702 */
2703
2704 #define _mm_fmadd_ps(_v,_w,_x) vec_madd(_v,_w,_x)
2705 #define _mm_fmadd_pd(_v,_w,_x) vec_madd(_v,_w,_x)
2706 #define _mm_fmsub_ps(_v,_w,_x) vec_msub(_v,_w,_x)
2707 #define _mm_fmsub_pd(_v,_w,_x) vec_msub(_v,_w,_x)
2708 #define _mm_fnmadd_ps(_v,_w,_x) vec_madd((-(_v)),_w,_x)
2709 #define _mm_fnmadd_pd(_v,_w,_x) vec_madd((-(_v)),_w,_x)
2710 #define _mm_min_epi32(_v,_w) vec_min(_v,_w)
2711 #define _mm_max_epi32(_v,_w) vec_max(_v,_w)
2712 #define _mm_max_epu32(_v,_w) vec_max(_v,_w)
2713 //#define _mm_min_sd(_v,_w)
2714
2715 #define _mm_fmadd_ss(_v,_w,_x) vec_madd(_v,_w,_x)//fmaf(_v,_w,_x) //((_v*_w)+_x)
2716 #define _mm_fmadd_sd(_v,_w,_x) vec_madd(_v,_w,_x)//fmaf(_v,_w,_x) //((_v*_w)+_x)
2717 #define _mm_fmsub_ss(_v,_w,_x) vec_msub(_v,_w,_x)//fmsf(_v,_w,_x) //((_v*_w)-_x)
2718 #define _mm_fmsub_sd(_v,_w,_x) vec_msub(_v,_w,_x)//fmsf(_v,_w,_x) //((_v*_w)-_x)
2719
2720 /*
2721 * Integer.
2722 */
2723
2724 #define _mm_add_epi32(_v,_w) vec_add(_v,_w)
2725 #define _mm_sub_epi32(_v,_w) vec_sub(_v,_w)
2726
2727 /*
2728 * Merge.
2729 */
2730
2731 #define _mm_blendv_ps(_v,_w,_m) vec_sel(_v,_w,_m)
2732 #define _mm_blendv_pd(_v,_w,_m) vec_sel(_v,_w,_m)
2733
2734 /*
2735 * Miscelaneous:
2736 * Vector op constant
2737 * Casting
2738 */
2739
2740 #define _mm_castps_si128(_v) (__m128i)(_v)
2741 #define _mm_castpd_si128(_v) (__m128i)(_v)
2742 #define _mm_slli_epi32(_v,_c) vec_sl(_v,vec_splats((unsigned int)_c))
2743 #define _mm_slli_epi64(_v,_c) (__m128i)vec_sl(_v,vec_splats((unsigned long)_c))
2744 #define _mm_sllv_epi64(_v,_w) vec_sl((__m128i)_v,_w)
2745 #define _mm_srli_epi32(_v,_c) vec_sr(_v,vec_splats((unsigned int)_c))
2746 #define _mm_srli_epi64(_v,_c) vec_sr(_v,vec_splats((unsigned long)_c))
2747
2748 /*
2749 * Comparision.
2750 *
2751 * The following 4 macros stole shamelessly from:
2752 * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
2753 */
2754
2755 #define _CAT(_a,_b,...) _a##_b
2756 #define _EMPTY()
2757 #define _DEFER(id) id _EMPTY()
2758 #define _EXPAND1(...) __VA_ARGS__
2759 #define _EXPAND(...) _EXPAND1(_EXPAND1(__VA_ARGS__))
2760
2761 #define __CMP_EQ_OQ(_v,_w) (__typeof__(_v))vec_cmpeq(_v,_w)
2762 #define __CMP_EQ_OS(_v,_w) (__typeof__(_v))vec_cmpeq(_v,_w)
2763 #define __CMP_LE_OQ(_v,_w) (__typeof__(_v))vec_cmple(_v,_w)
2764 #define __CMP_LT_OS(_v,_w) (__typeof__(_v))vec_cmplt(_v,_w)
2765 #define __CMP_LT_OQ(_v,_w) (__typeof__(_v))vec_cmplt(_v,_w)
2766 #define __CMP_GE_OS(_v,_w) (__typeof__(_v))vec_cmpge(_v,_w)
2767 #define __CMP_GT_OS(_v,_w) (__typeof__(_v))vec_cmpgt(_v,_w)
2768 #define __CMP_GT_OQ(_v,_w) (__typeof__(_v))vec_cmpgt(_v,_w)
2769 //#define __CMP_NEQ_UQ(_v,_w) (typeof(_v))vec_andc((__m128i)vec_splats(0xffffffff),(__m128i)vec_cmpeq(_v, _w))
2770 #define __CMP_NEQ_UQ(_v,_w) \
2771 (__typeof__(_v))vec_andc((__m128i)vec_splats(-1),(__m128i)vec_cmpeq(_v, _w))
2772 #define __CMP_NLT_UQ(_v,_w) \
2773 (__typeof__(_v))vec_andc((__m128i)vec_splats(-1),(__m128i)vec_cmplt(_v, _w))
2774 #define __CMP_NGE_UQ(_v,_w) \
2775 (__typeof__(_v))vec_andc((__m128i)vec_splats(-1),(__m128i)vec_cmpge(_v, _w))
2776
2777 #define _mm_cmpeq_epi32(_v,_w) (__m128i)vec_cmpeq(_v,_w)
2778 #define _mm_cmpeq_epi64(_v,_w) (__m128i)vec_cmpeq(_v,_w)
2779 #define _mm_cmpgt_epi32(_v,_w) (__m128i)vec_cmpgt(_v,_w)
2780 #define _mm_cmpgt_epi64(_v,_w) (__m128i)vec_cmpgt(_v,_w)
2781 #define _mm_cmple_ps(_v,_w) (__m128i)vec_cmple(_v,_w)
2782 #define _mm_cmplt_ps(_v,_w) (__m128i)vec_cmplt(_v,_w)
2783 #define _mm_cmpeq_ps(_v,_w) (__m128i)vec_cmpeq(_v,_w)
2784 #define _mm_cmp_ps(_v,_w,_c) _EXPAND(_DEFER(_CAT(_,_c))(_v,_w))
2785 #define _mm_cmp_pd(_v,_w,_c) _EXPAND(_DEFER(_CAT(_,_c))(_v,_w))
2786 #define _mm_cmp_ss(_v,_w,_c) _EXPAND(_DEFER(_CAT(_,_c))(_v,_w))
2787 #define _mm_cmp_sd(_v,_w,_c) _EXPAND(_DEFER(_CAT(_,_c))(_v,_w))
2788
2789 /*
2790 * More macros that have to have secondary expansion.
2791 */
2792
2793 #define __MM_FROUND_TO_ZERO(_v) vec_trunc(_v)
2794 // - does seem to exist with GCC 5.4 #define __MM_FROUND_TO_ZERO(_v) vec_roundz(_v)
2795 #define _mm_round_ps(_v,_m) _EXPAND(_DEFER(_CAT(_,_m))(_v))
2796 #define _mm_round_pd(_v,_m) _EXPAND(_DEFER(_CAT(_,_m))(_v))
2797 #endif
2798
2799
#ifdef DEBUG
#include <stdio.h>
#include <string.h>
2802 static inline void
2803 __attribute__((always_inline))
_dumpfvec(__m128 a,char * t)2804 _dumpfvec(__m128 a, char *t)
2805 {
2806 int i;
2807 printf("%s:", t);
2808 for (i = 0 ; i < 4 ; i++) {
2809 printf(" %#x", *(unsigned int *)&a[i]);
2810 }
2811 printf("\n");
2812 }
2813 static inline void
2814 __attribute__((always_inline))
_dumpdvec(__m128d a,char * t)2815 _dumpdvec(__m128d a, char *t)
2816 {
2817 int i;
2818 printf("%s:", t);
2819 for (i = 0 ; i < 2 ; i++) {
2820 printf(" %#lx", *(unsigned long int *)&a[i]);
2821 }
2822 printf("\n");
2823 }
2824
2825 #endif
2826