xref: /reactos/sdk/include/crt/dvec.h (revision 6f6b8317)
1 /**
2  * This file has no copyright assigned and is placed in the Public Domain.
3  * This file is part of the w64 mingw-runtime package.
4  * No warranty is given; refer to the file DISCLAIMER within this package.
5  */
6 #ifndef _DVEC_H_INCLUDED
7 #define _DVEC_H_INCLUDED
8 #ifndef RC_INVOKED
9 
10 #if !defined __cplusplus
11 #error This file is only supported in C++ compilations!
12 #endif
13 
14 #include <emmintrin.h>
15 #include <assert.h>
16 #include <fvec.h>
17 #include <corecrt.h>
18 
19 #pragma pack(push,_CRT_PACKING)
20 
21 #if defined(_ENABLE_VEC_DEBUG)
22 #include <iostream>
23 #endif
24 
25 #pragma pack(push,16)
26 
27 #define EXPLICIT explicit
28 
29 class I8vec16;
30 class Is8vec16;
31 class Iu8vec16;
32 class I16vec8;
33 class Is16vec8;
34 class Iu16vec8;
35 class I32vec4;
36 class Is32vec4;
37 class Iu32vec4;
38 class I64vec2;
39 class I128vec1;
40 
/*
 * Element accessor macros: reinterpret a 128-bit vector variable as an
 * array of packed integers and yield an lvalue for one element
 * (16 bytes, 8 words, 4 dwords or 2 qwords).
 *
 * The arguments are parenthesized instead of the historical MSVC-only
 * "&##vector" spelling: pasting "##" onto "&" does not form a valid
 * preprocessing token and is ill-formed ISO C++ (rejected by GCC/Clang),
 * while the parenthesized form expands identically everywhere.
 */
#define _MM_16UB(element,vector) (*((unsigned char*)&(vector) + (element)))
#define _MM_16B(element,vector) (*((signed char*)&(vector) + (element)))

#define _MM_8UW(element,vector) (*((unsigned short*)&(vector) + (element)))
#define _MM_8W(element,vector) (*((short*)&(vector) + (element)))

#define _MM_4UDW(element,vector) (*((unsigned int*)&(vector) + (element)))
#define _MM_4DW(element,vector) (*((int*)&(vector) + (element)))

#define _MM_2QW(element,vector) (*((__int64*)&(vector) + (element)))
/* Returns a 128-bit mask with every bit set; shared by the cmpneq()
 * helpers below, which synthesize "not equal" as andnot(eq, all-ones). */
inline const __m128i get_mask128()
{
  static const __m128i mask128 = _mm_set1_epi32(-1);
  return mask128;
}
57 
/**
 * M128 - thin wrapper around the raw __m128i SSE2 register type.
 *
 * Common base of all the Ivec classes below: stores the 128-bit value,
 * converts implicitly back to __m128i so the intrinsics accept it, and
 * provides the bitwise compound assignments every element width shares.
 */
class M128
{
protected:
  __m128i vec;   /* the wrapped 128-bit value */

public:
  M128() { }
  M128(__m128i mm) { vec = mm; }

  operator __m128i() const { return vec; }

  /* Bitwise compound assignment; the operand converts via __m128i. */
  M128& operator&=(const M128 &a) { vec = _mm_and_si128(vec, a); return *this; }
  M128& operator|=(const M128 &a) { vec = _mm_or_si128(vec, a); return *this; }
  M128& operator^=(const M128 &a) { vec = _mm_xor_si128(vec, a); return *this; }
};
74 
/* Non-member bitwise AND / OR / XOR on 128-bit vectors. */
inline M128 operator&(const M128 &x, const M128 &y) { return _mm_and_si128(x, y); }
inline M128 operator|(const M128 &x, const M128 &y) { return _mm_or_si128(x, y); }
inline M128 operator^(const M128 &x, const M128 &y) { return _mm_xor_si128(x, y); }
/* andnot(x,y) computes (~x) & y, matching the _mm_andnot_si128 intrinsic. */
inline M128 andnot(const M128 &x, const M128 &y) { return _mm_andnot_si128(x, y); }
79 
80 class I128vec1 : public M128
81 {
82 public:
I128vec1()83   I128vec1() { }
I128vec1(__m128i mm)84   I128vec1(__m128i mm) : M128(mm) { }
85 
86   I128vec1& operator= (const M128 &a) { return *this = (I128vec1) a; }
87   I128vec1& operator&=(const M128 &a) { return *this = (I128vec1) _mm_and_si128(vec,a); }
88   I128vec1& operator|=(const M128 &a) { return *this = (I128vec1) _mm_or_si128(vec,a); }
89   I128vec1& operator^=(const M128 &a) { return *this = (I128vec1) _mm_xor_si128(vec,a); }
90 
91 };
92 
93 class I64vec2 : public M128
94 {
95 public:
I64vec2()96   I64vec2() { }
I64vec2(__m128i mm)97   I64vec2(__m128i mm) : M128(mm) { }
98 
I64vec2(__m64 q1,__m64 q0)99   I64vec2(__m64 q1,__m64 q0)
100   {
101     _MM_2QW(0,vec) = *(__int64*)&q0;
102     _MM_2QW(1,vec) = *(__int64*)&q1;
103   }
104 
105   I64vec2& operator= (const M128 &a) { return *this = (I64vec2) a; }
106 
107   I64vec2& operator&=(const M128 &a) { return *this = (I64vec2) _mm_and_si128(vec,a); }
108   I64vec2& operator|=(const M128 &a) { return *this = (I64vec2) _mm_or_si128(vec,a); }
109   I64vec2& operator^=(const M128 &a) { return *this = (I64vec2) _mm_xor_si128(vec,a); }
110 
111   I64vec2& operator +=(const I64vec2 &a) { return *this = (I64vec2) _mm_add_epi64(vec,a); }
112   I64vec2& operator -=(const I64vec2 &a) { return *this = (I64vec2) _mm_sub_epi64(vec,a); }
113 
114   I64vec2 operator<<(const I64vec2 &a) { return _mm_sll_epi64(vec,a); }
115   I64vec2 operator<<(int count) { return _mm_slli_epi64(vec,count); }
116   I64vec2& operator<<=(const I64vec2 &a) { return *this = (I64vec2) _mm_sll_epi64(vec,a); }
117   I64vec2& operator<<=(int count) { return *this = (I64vec2) _mm_slli_epi64(vec,count); }
118   I64vec2 operator>>(const I64vec2 &a) { return _mm_srl_epi64(vec,a); }
119   I64vec2 operator>>(int count) { return _mm_srli_epi64(vec,count); }
120   I64vec2& operator>>=(const I64vec2 &a) { return *this = (I64vec2) _mm_srl_epi64(vec,a); }
121   I64vec2& operator>>=(int count) { return *this = (I64vec2) _mm_srli_epi64(vec,count); }
122 
123   const __int64& operator[](int i)const
124   {
125     assert(static_cast<unsigned int>(i) < 2);
126     return _MM_2QW(i,vec);
127   }
128 
129   __int64& operator[](int i)
130   {
131     assert(static_cast<unsigned int>(i) < 2);
132     return _MM_2QW(i,vec);
133   }
134 
135 };
136 
/* Interleave the low / high quadwords of x and y. */
inline I64vec2 unpack_low(const I64vec2 &x, const I64vec2 &y) { return _mm_unpacklo_epi64(x, y); }
inline I64vec2 unpack_high(const I64vec2 &x, const I64vec2 &y) { return _mm_unpackhi_epi64(x, y); }
139 
140 class I32vec4 : public M128
141 {
142 public:
I32vec4()143   I32vec4() { }
I32vec4(__m128i mm)144   I32vec4(__m128i mm) : M128(mm) { }
145 
146   I32vec4& operator= (const M128 &a) { return *this = (I32vec4) a; }
147 
148   I32vec4& operator&=(const M128 &a) { return *this = (I32vec4) _mm_and_si128(vec,a); }
149   I32vec4& operator|=(const M128 &a) { return *this = (I32vec4) _mm_or_si128(vec,a); }
150   I32vec4& operator^=(const M128 &a) { return *this = (I32vec4) _mm_xor_si128(vec,a); }
151 
152   I32vec4& operator +=(const I32vec4 &a) { return *this = (I32vec4)_mm_add_epi32(vec,a); }
153   I32vec4& operator -=(const I32vec4 &a) { return *this = (I32vec4)_mm_sub_epi32(vec,a); }
154 
155   I32vec4 operator<<(const I32vec4 &a) { return _mm_sll_epi32(vec,a); }
156   I32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
157   I32vec4& operator<<=(const I32vec4 &a) { return *this = (I32vec4)_mm_sll_epi32(vec,a); }
158   I32vec4& operator<<=(int count) { return *this = (I32vec4)_mm_slli_epi32(vec,count); }
159 
160 };
161 
inline I32vec4 cmpeq(const I32vec4 &x, const I32vec4 &y) { return _mm_cmpeq_epi32(x, y); }
/* No not-equal compare in SSE2: invert the equality mask with andnot. */
inline I32vec4 cmpneq(const I32vec4 &x, const I32vec4 &y) { return _mm_andnot_si128(_mm_cmpeq_epi32(x, y), get_mask128()); }

/* Interleave the low / high dwords of x and y. */
inline I32vec4 unpack_low(const I32vec4 &x, const I32vec4 &y) { return _mm_unpacklo_epi32(x, y); }
inline I32vec4 unpack_high(const I32vec4 &x, const I32vec4 &y) { return _mm_unpackhi_epi32(x, y); }
167 
168 class Is32vec4 : public I32vec4
169 {
170 public:
Is32vec4()171   Is32vec4() { }
Is32vec4(__m128i mm)172   Is32vec4(__m128i mm) : I32vec4(mm) { }
Is32vec4(int i3,int i2,int i1,int i0)173   Is32vec4(int i3,int i2,int i1,int i0)
174   {
175     _MM_4DW(0,vec) = i0;
176     _MM_4DW(1,vec) = i1;
177     _MM_4DW(2,vec) = i2;
178     _MM_4DW(3,vec) = i3;
179   }
180 
181   Is32vec4& operator= (const M128 &a) { return *this = (Is32vec4) a; }
182 
183   Is32vec4& operator&=(const M128 &a) { return *this = (Is32vec4) _mm_and_si128(vec,a); }
184   Is32vec4& operator|=(const M128 &a) { return *this = (Is32vec4) _mm_or_si128(vec,a); }
185   Is32vec4& operator^=(const M128 &a) { return *this = (Is32vec4) _mm_xor_si128(vec,a); }
186 
187   Is32vec4& operator +=(const I32vec4 &a) { return *this = (Is32vec4)_mm_add_epi32(vec,a); }
188   Is32vec4& operator -=(const I32vec4 &a) { return *this = (Is32vec4)_mm_sub_epi32(vec,a); }
189 
190   Is32vec4 operator<<(const M128 &a) { return _mm_sll_epi32(vec,a); }
191   Is32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
192   Is32vec4& operator<<=(const M128 &a) { return *this = (Is32vec4)_mm_sll_epi32(vec,a); }
193   Is32vec4& operator<<=(int count) { return *this = (Is32vec4)_mm_slli_epi32(vec,count); }
194 
195   Is32vec4 operator>>(const M128 &a) { return _mm_sra_epi32(vec,a); }
196   Is32vec4 operator>>(int count) { return _mm_srai_epi32(vec,count); }
197   Is32vec4& operator>>=(const M128 &a) { return *this = (Is32vec4) _mm_sra_epi32(vec,a); }
198   Is32vec4& operator>>=(int count) { return *this = (Is32vec4) _mm_srai_epi32(vec,count); }
199 
200 #if defined(_ENABLE_VEC_DEBUG)
201 
202   friend std::ostream& operator<< (std::ostream &os,const Is32vec4 &a)
203   {
204     os << "[3]:" << _MM_4DW(3,a)
205       << " [2]:" << _MM_4DW(2,a)
206       << " [1]:" << _MM_4DW(1,a)
207       << " [0]:" << _MM_4DW(0,a);
208     return os;
209   }
210 #endif
211 
212   const int& operator[](int i)const
213   {
214     assert(static_cast<unsigned int>(i) < 4);
215     return _MM_4DW(i,vec);
216   }
217 
218   int& operator[](int i)
219   {
220     assert(static_cast<unsigned int>(i) < 4);
221     return _MM_4DW(i,vec);
222   }
223 };
224 
inline Is32vec4 cmpeq(const Is32vec4 &x, const Is32vec4 &y) { return _mm_cmpeq_epi32(x, y); }
inline Is32vec4 cmpneq(const Is32vec4 &x, const Is32vec4 &y) { return _mm_andnot_si128(_mm_cmpeq_epi32(x, y), get_mask128()); }
inline Is32vec4 cmpgt(const Is32vec4 &x, const Is32vec4 &y) { return _mm_cmpgt_epi32(x, y); }
/* cmplt(x,y) == cmpgt(y,x); SSE2 only provides greater-than at 32 bits. */
inline Is32vec4 cmplt(const Is32vec4 &x, const Is32vec4 &y) { return _mm_cmpgt_epi32(y, x); }

inline Is32vec4 unpack_low(const Is32vec4 &x, const Is32vec4 &y) { return _mm_unpacklo_epi32(x, y); }
inline Is32vec4 unpack_high(const Is32vec4 &x, const Is32vec4 &y) { return _mm_unpackhi_epi32(x, y); }
232 
233 class Iu32vec4 : public I32vec4
234 {
235 public:
Iu32vec4()236   Iu32vec4() { }
Iu32vec4(__m128i mm)237   Iu32vec4(__m128i mm) : I32vec4(mm) { }
Iu32vec4(unsigned int ui3,unsigned int ui2,unsigned int ui1,unsigned int ui0)238   Iu32vec4(unsigned int ui3,unsigned int ui2,unsigned int ui1,unsigned int ui0)
239   {
240     _MM_4UDW(0,vec) = ui0;
241     _MM_4UDW(1,vec) = ui1;
242     _MM_4UDW(2,vec) = ui2;
243     _MM_4UDW(3,vec) = ui3;
244   }
245 
246   Iu32vec4& operator= (const M128 &a) { return *this = (Iu32vec4) a; }
247 
248   Iu32vec4& operator&=(const M128 &a) { return *this = (Iu32vec4) _mm_and_si128(vec,a); }
249   Iu32vec4& operator|=(const M128 &a) { return *this = (Iu32vec4) _mm_or_si128(vec,a); }
250   Iu32vec4& operator^=(const M128 &a) { return *this = (Iu32vec4) _mm_xor_si128(vec,a); }
251 
252   Iu32vec4& operator +=(const I32vec4 &a) { return *this = (Iu32vec4)_mm_add_epi32(vec,a); }
253   Iu32vec4& operator -=(const I32vec4 &a) { return *this = (Iu32vec4)_mm_sub_epi32(vec,a); }
254 
255   Iu32vec4 operator<<(const M128 &a) { return _mm_sll_epi32(vec,a); }
256   Iu32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
257   Iu32vec4& operator<<=(const M128 &a) { return *this = (Iu32vec4)_mm_sll_epi32(vec,a); }
258   Iu32vec4& operator<<=(int count) { return *this = (Iu32vec4)_mm_slli_epi32(vec,count); }
259   Iu32vec4 operator>>(const M128 &a) { return _mm_srl_epi32(vec,a); }
260   Iu32vec4 operator>>(int count) { return _mm_srli_epi32(vec,count); }
261   Iu32vec4& operator>>=(const M128 &a) { return *this = (Iu32vec4) _mm_srl_epi32(vec,a); }
262   Iu32vec4& operator>>=(int count) { return *this = (Iu32vec4) _mm_srli_epi32(vec,count); }
263 
264 #if defined(_ENABLE_VEC_DEBUG)
265 
266   friend std::ostream& operator<< (std::ostream &os,const Iu32vec4 &a)
267   {
268     os << "[3]:" << _MM_4UDW(3,a)
269       << " [2]:" << _MM_4UDW(2,a)
270       << " [1]:" << _MM_4UDW(1,a)
271       << " [0]:" << _MM_4UDW(0,a);
272     return os;
273   }
274 #endif
275 
276   const unsigned int& operator[](int i)const
277   {
278     assert(static_cast<unsigned int>(i) < 4);
279     return _MM_4UDW(i,vec);
280   }
281 
282   unsigned int& operator[](int i)
283   {
284     assert(static_cast<unsigned int>(i) < 4);
285     return _MM_4UDW(i,vec);
286   }
287 };
288 
289 inline I64vec2 operator*(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_mul_epu32(a,b); }
cmpeq(const Iu32vec4 & a,const Iu32vec4 & b)290 inline Iu32vec4 cmpeq(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
cmpneq(const Iu32vec4 & a,const Iu32vec4 & b)291 inline Iu32vec4 cmpneq(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b),get_mask128()); }
292 
unpack_low(const Iu32vec4 & a,const Iu32vec4 & b)293 inline Iu32vec4 unpack_low(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
unpack_high(const Iu32vec4 & a,const Iu32vec4 & b)294 inline Iu32vec4 unpack_high(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
295 
296 class I16vec8 : public M128
297 {
298 public:
I16vec8()299   I16vec8() { }
I16vec8(__m128i mm)300   I16vec8(__m128i mm) : M128(mm) { }
301 
302   I16vec8& operator= (const M128 &a) { return *this = (I16vec8) a; }
303 
304   I16vec8& operator&=(const M128 &a) { return *this = (I16vec8) _mm_and_si128(vec,a); }
305   I16vec8& operator|=(const M128 &a) { return *this = (I16vec8) _mm_or_si128(vec,a); }
306   I16vec8& operator^=(const M128 &a) { return *this = (I16vec8) _mm_xor_si128(vec,a); }
307 
308   I16vec8& operator +=(const I16vec8 &a) { return *this = (I16vec8) _mm_add_epi16(vec,a); }
309   I16vec8& operator -=(const I16vec8 &a) { return *this = (I16vec8) _mm_sub_epi16(vec,a); }
310   I16vec8& operator *=(const I16vec8 &a) { return *this = (I16vec8) _mm_mullo_epi16(vec,a); }
311 
312   I16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
313   I16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
314   I16vec8& operator<<=(const M128 &a) { return *this = (I16vec8)_mm_sll_epi16(vec,a); }
315   I16vec8& operator<<=(int count) { return *this = (I16vec8)_mm_slli_epi16(vec,count); }
316 
317 };
318 
/* Low 16 bits of each lane-wise product. */
inline I16vec8 operator*(const I16vec8 &x, const I16vec8 &y) { return _mm_mullo_epi16(x, y); }

inline I16vec8 cmpeq(const I16vec8 &x, const I16vec8 &y) { return _mm_cmpeq_epi16(x, y); }
inline I16vec8 cmpneq(const I16vec8 &x, const I16vec8 &y) { return _mm_andnot_si128(_mm_cmpeq_epi16(x, y), get_mask128()); }

inline I16vec8 unpack_low(const I16vec8 &x, const I16vec8 &y) { return _mm_unpacklo_epi16(x, y); }
inline I16vec8 unpack_high(const I16vec8 &x, const I16vec8 &y) { return _mm_unpackhi_epi16(x, y); }
326 
327 class Is16vec8 : public I16vec8
328 {
329 public:
Is16vec8()330   Is16vec8() { }
Is16vec8(__m128i mm)331   Is16vec8(__m128i mm) : I16vec8(mm) { }
Is16vec8(signed short s7,signed short s6,signed short s5,signed short s4,signed short s3,signed short s2,signed short s1,signed short s0)332   Is16vec8(signed short s7,signed short s6,signed short s5,signed short s4,signed short s3,signed short s2,signed short s1,signed short s0)
333   {
334     _MM_8W(0,vec) = s0;
335     _MM_8W(1,vec) = s1;
336     _MM_8W(2,vec) = s2;
337     _MM_8W(3,vec) = s3;
338     _MM_8W(4,vec) = s4;
339     _MM_8W(5,vec) = s5;
340     _MM_8W(6,vec) = s6;
341     _MM_8W(7,vec) = s7;
342   }
343 
344   Is16vec8& operator= (const M128 &a) { return *this = (Is16vec8) a; }
345 
346   Is16vec8& operator&=(const M128 &a) { return *this = (Is16vec8) _mm_and_si128(vec,a); }
347   Is16vec8& operator|=(const M128 &a) { return *this = (Is16vec8) _mm_or_si128(vec,a); }
348   Is16vec8& operator^=(const M128 &a) { return *this = (Is16vec8) _mm_xor_si128(vec,a); }
349 
350   Is16vec8& operator +=(const I16vec8 &a) { return *this = (Is16vec8) _mm_add_epi16(vec,a); }
351   Is16vec8& operator -=(const I16vec8 &a) { return *this = (Is16vec8) _mm_sub_epi16(vec,a); }
352   Is16vec8& operator *=(const I16vec8 &a) { return *this = (Is16vec8) _mm_mullo_epi16(vec,a); }
353 
354   Is16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
355   Is16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
356   Is16vec8& operator<<=(const M128 &a) { return *this = (Is16vec8)_mm_sll_epi16(vec,a); }
357   Is16vec8& operator<<=(int count) { return *this = (Is16vec8)_mm_slli_epi16(vec,count); }
358 
359   Is16vec8 operator>>(const M128 &a) { return _mm_sra_epi16(vec,a); }
360   Is16vec8 operator>>(int count) { return _mm_srai_epi16(vec,count); }
361   Is16vec8& operator>>=(const M128 &a) { return *this = (Is16vec8)_mm_sra_epi16(vec,a); }
362   Is16vec8& operator>>=(int count) { return *this = (Is16vec8)_mm_srai_epi16(vec,count); }
363 
364 #if defined(_ENABLE_VEC_DEBUG)
365 
366   friend std::ostream& operator<< (std::ostream &os,const Is16vec8 &a)
367   {
368     os << "[7]:" << _MM_8W(7,a)
369       << " [6]:" << _MM_8W(6,a)
370       << " [5]:" << _MM_8W(5,a)
371       << " [4]:" << _MM_8W(4,a)
372       << " [3]:" << _MM_8W(3,a)
373       << " [2]:" << _MM_8W(2,a)
374       << " [1]:" << _MM_8W(1,a)
375       << " [0]:" << _MM_8W(0,a);
376     return os;
377   }
378 #endif
379 
380   const signed short& operator[](int i)const
381   {
382     assert(static_cast<unsigned int>(i) < 8);
383     return _MM_8W(i,vec);
384   }
385 
386   signed short& operator[](int i)
387   {
388     assert(static_cast<unsigned int>(i) < 8);
389     return _MM_8W(i,vec);
390   }
391 };
392 
393 inline Is16vec8 operator*(const Is16vec8 &a,const Is16vec8 &b) { return _mm_mullo_epi16(a,b); }
394 
cmpeq(const Is16vec8 & a,const Is16vec8 & b)395 inline Is16vec8 cmpeq(const Is16vec8 &a,const Is16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
cmpneq(const Is16vec8 & a,const Is16vec8 & b)396 inline Is16vec8 cmpneq(const Is16vec8 &a,const Is16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b),get_mask128()); }
cmpgt(const Is16vec8 & a,const Is16vec8 & b)397 inline Is16vec8 cmpgt(const Is16vec8 &a,const Is16vec8 &b) { return _mm_cmpgt_epi16(a,b); }
cmplt(const Is16vec8 & a,const Is16vec8 & b)398 inline Is16vec8 cmplt(const Is16vec8 &a,const Is16vec8 &b) { return _mm_cmpgt_epi16(b,a); }
399 
unpack_low(const Is16vec8 & a,const Is16vec8 & b)400 inline Is16vec8 unpack_low(const Is16vec8 &a,const Is16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
unpack_high(const Is16vec8 & a,const Is16vec8 & b)401 inline Is16vec8 unpack_high(const Is16vec8 &a,const Is16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
402 
mul_high(const Is16vec8 & a,const Is16vec8 & b)403 inline Is16vec8 mul_high(const Is16vec8 &a,const Is16vec8 &b) { return _mm_mulhi_epi16(a,b); }
mul_add(const Is16vec8 & a,const Is16vec8 & b)404 inline Is32vec4 mul_add(const Is16vec8 &a,const Is16vec8 &b) { return _mm_madd_epi16(a,b);}
405 
sat_add(const Is16vec8 & a,const Is16vec8 & b)406 inline Is16vec8 sat_add(const Is16vec8 &a,const Is16vec8 &b) { return _mm_adds_epi16(a,b); }
sat_sub(const Is16vec8 & a,const Is16vec8 & b)407 inline Is16vec8 sat_sub(const Is16vec8 &a,const Is16vec8 &b) { return _mm_subs_epi16(a,b); }
408 
simd_max(const Is16vec8 & a,const Is16vec8 & b)409 inline Is16vec8 simd_max(const Is16vec8 &a,const Is16vec8 &b) { return _mm_max_epi16(a,b); }
simd_min(const Is16vec8 & a,const Is16vec8 & b)410 inline Is16vec8 simd_min(const Is16vec8 &a,const Is16vec8 &b) { return _mm_min_epi16(a,b); }
411 
412 class Iu16vec8 : public I16vec8
413 {
414 public:
Iu16vec8()415   Iu16vec8() { }
Iu16vec8(__m128i mm)416   Iu16vec8(__m128i mm) : I16vec8(mm) { }
Iu16vec8(unsigned short s7,unsigned short s6,unsigned short s5,unsigned short s4,unsigned short s3,unsigned short s2,unsigned short s1,unsigned short s0)417   Iu16vec8(unsigned short s7,unsigned short s6,unsigned short s5,unsigned short s4,unsigned short s3,unsigned short s2,unsigned short s1,unsigned short s0)
418   {
419     _MM_8UW(0,vec) = s0;
420     _MM_8UW(1,vec) = s1;
421     _MM_8UW(2,vec) = s2;
422     _MM_8UW(3,vec) = s3;
423     _MM_8UW(4,vec) = s4;
424     _MM_8UW(5,vec) = s5;
425     _MM_8UW(6,vec) = s6;
426     _MM_8UW(7,vec) = s7;
427   }
428 
429   Iu16vec8& operator= (const M128 &a) { return *this = (Iu16vec8) a; }
430 
431   Iu16vec8& operator&=(const M128 &a) { return *this = (Iu16vec8) _mm_and_si128(vec,a); }
432   Iu16vec8& operator|=(const M128 &a) { return *this = (Iu16vec8) _mm_or_si128(vec,a); }
433   Iu16vec8& operator^=(const M128 &a) { return *this = (Iu16vec8) _mm_xor_si128(vec,a); }
434 
435   Iu16vec8& operator +=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_add_epi16(vec,a); }
436   Iu16vec8& operator -=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_sub_epi16(vec,a); }
437   Iu16vec8& operator *=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_mullo_epi16(vec,a); }
438 
439   Iu16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
440   Iu16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
441   Iu16vec8& operator<<=(const M128 &a) { return *this = (Iu16vec8)_mm_sll_epi16(vec,a); }
442   Iu16vec8& operator<<=(int count) { return *this = (Iu16vec8)_mm_slli_epi16(vec,count); }
443   Iu16vec8 operator>>(const M128 &a) { return _mm_srl_epi16(vec,a); }
444   Iu16vec8 operator>>(int count) { return _mm_srli_epi16(vec,count); }
445   Iu16vec8& operator>>=(const M128 &a) { return *this = (Iu16vec8) _mm_srl_epi16(vec,a); }
446   Iu16vec8& operator>>=(int count) { return *this = (Iu16vec8) _mm_srli_epi16(vec,count); }
447 
448 #if defined(_ENABLE_VEC_DEBUG)
449 
450   friend std::ostream& operator << (std::ostream &os,const Iu16vec8 &a)
451   {
452     os << "[7]:" << unsigned short(_MM_8UW(7,a))
453       << " [6]:" << unsigned short(_MM_8UW(6,a))
454       << " [5]:" << unsigned short(_MM_8UW(5,a))
455       << " [4]:" << unsigned short(_MM_8UW(4,a))
456       << " [3]:" << unsigned short(_MM_8UW(3,a))
457       << " [2]:" << unsigned short(_MM_8UW(2,a))
458       << " [1]:" << unsigned short(_MM_8UW(1,a))
459       << " [0]:" << unsigned short(_MM_8UW(0,a));
460     return os;
461   }
462 #endif
463 
464   const unsigned short& operator[](int i)const
465   {
466     assert(static_cast<unsigned int>(i) < 8);
467     return _MM_8UW(i,vec);
468   }
469 
470   unsigned short& operator[](int i)
471   {
472     assert(static_cast<unsigned int>(i) < 8);
473     return _MM_8UW(i,vec);
474   }
475 };
476 
477 inline Iu16vec8 operator*(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_mullo_epi16(a,b); }
478 
cmpeq(const Iu16vec8 & a,const Iu16vec8 & b)479 inline Iu16vec8 cmpeq(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
cmpneq(const Iu16vec8 & a,const Iu16vec8 & b)480 inline Iu16vec8 cmpneq(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b),get_mask128()); }
481 
unpack_low(const Iu16vec8 & a,const Iu16vec8 & b)482 inline Iu16vec8 unpack_low(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
unpack_high(const Iu16vec8 & a,const Iu16vec8 & b)483 inline Iu16vec8 unpack_high(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
484 
sat_add(const Iu16vec8 & a,const Iu16vec8 & b)485 inline Iu16vec8 sat_add(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_adds_epu16(a,b); }
sat_sub(const Iu16vec8 & a,const Iu16vec8 & b)486 inline Iu16vec8 sat_sub(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_subs_epu16(a,b); }
487 
simd_avg(const Iu16vec8 & a,const Iu16vec8 & b)488 inline Iu16vec8 simd_avg(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_avg_epu16(a,b); }
mul_high(const Iu16vec8 & a,const Iu16vec8 & b)489 inline I16vec8 mul_high(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_mulhi_epu16(a,b); }
490 
491 class I8vec16 : public M128
492 {
493 public:
I8vec16()494   I8vec16() { }
I8vec16(__m128i mm)495   I8vec16(__m128i mm) : M128(mm) { }
496 
497   I8vec16& operator= (const M128 &a) { return *this = (I8vec16) a; }
498 
499   I8vec16& operator&=(const M128 &a) { return *this = (I8vec16) _mm_and_si128(vec,a); }
500   I8vec16& operator|=(const M128 &a) { return *this = (I8vec16) _mm_or_si128(vec,a); }
501   I8vec16& operator^=(const M128 &a) { return *this = (I8vec16) _mm_xor_si128(vec,a); }
502 
503   I8vec16& operator +=(const I8vec16 &a) { return *this = (I8vec16) _mm_add_epi8(vec,a); }
504   I8vec16& operator -=(const I8vec16 &a) { return *this = (I8vec16) _mm_sub_epi8(vec,a); }
505 
506 };
507 
inline I8vec16 cmpeq(const I8vec16 &x, const I8vec16 &y) { return _mm_cmpeq_epi8(x, y); }
inline I8vec16 cmpneq(const I8vec16 &x, const I8vec16 &y) { return _mm_andnot_si128(_mm_cmpeq_epi8(x, y), get_mask128()); }

inline I8vec16 unpack_low(const I8vec16 &x, const I8vec16 &y) { return _mm_unpacklo_epi8(x, y); }
inline I8vec16 unpack_high(const I8vec16 &x, const I8vec16 &y) { return _mm_unpackhi_epi8(x, y); }
513 
514 class Is8vec16 : public I8vec16
515 {
516 public:
Is8vec16()517   Is8vec16() { }
Is8vec16(__m128i mm)518   Is8vec16(__m128i mm) : I8vec16(mm) { }
519 
520   Is8vec16& operator= (const M128 &a) { return *this = (Is8vec16) a; }
521 
522   Is8vec16& operator&=(const M128 &a) { return *this = (Is8vec16) _mm_and_si128(vec,a); }
523   Is8vec16& operator|=(const M128 &a) { return *this = (Is8vec16) _mm_or_si128(vec,a); }
524   Is8vec16& operator^=(const M128 &a) { return *this = (Is8vec16) _mm_xor_si128(vec,a); }
525 
526   Is8vec16& operator +=(const I8vec16 &a) { return *this = (Is8vec16) _mm_add_epi8(vec,a); }
527   Is8vec16& operator -=(const I8vec16 &a) { return *this = (Is8vec16) _mm_sub_epi8(vec,a); }
528 
529 #if defined(_ENABLE_VEC_DEBUG)
530 
531   friend std::ostream& operator << (std::ostream &os,const Is8vec16 &a)
532   {
533     os << "[15]:" << short(_MM_16B(15,a))
534       << " [14]:" << short(_MM_16B(14,a))
535       << " [13]:" << short(_MM_16B(13,a))
536       << " [12]:" << short(_MM_16B(12,a))
537       << " [11]:" << short(_MM_16B(11,a))
538       << " [10]:" << short(_MM_16B(10,a))
539       << " [9]:" << short(_MM_16B(9,a))
540       << " [8]:" << short(_MM_16B(8,a))
541       << " [7]:" << short(_MM_16B(7,a))
542       << " [6]:" << short(_MM_16B(6,a))
543       << " [5]:" << short(_MM_16B(5,a))
544       << " [4]:" << short(_MM_16B(4,a))
545       << " [3]:" << short(_MM_16B(3,a))
546       << " [2]:" << short(_MM_16B(2,a))
547       << " [1]:" << short(_MM_16B(1,a))
548       << " [0]:" << short(_MM_16B(0,a));
549     return os;
550   }
551 #endif
552 
553   const signed char& operator[](int i)const
554   {
555     assert(static_cast<unsigned int>(i) < 16);
556     return _MM_16B(i,vec);
557   }
558 
559   signed char& operator[](int i)
560   {
561     assert(static_cast<unsigned int>(i) < 16);
562     return _MM_16B(i,vec);
563   }
564 
565 };
566 
inline Is8vec16 cmpeq(const Is8vec16 &x, const Is8vec16 &y) { return _mm_cmpeq_epi8(x, y); }
inline Is8vec16 cmpneq(const Is8vec16 &x, const Is8vec16 &y) { return _mm_andnot_si128(_mm_cmpeq_epi8(x, y), get_mask128()); }
inline Is8vec16 cmpgt(const Is8vec16 &x, const Is8vec16 &y) { return _mm_cmpgt_epi8(x, y); }
inline Is8vec16 cmplt(const Is8vec16 &x, const Is8vec16 &y) { return _mm_cmplt_epi8(x, y); }

inline Is8vec16 unpack_low(const Is8vec16 &x, const Is8vec16 &y) { return _mm_unpacklo_epi8(x, y); }
inline Is8vec16 unpack_high(const Is8vec16 &x, const Is8vec16 &y) { return _mm_unpackhi_epi8(x, y); }

/* Saturating signed add / subtract. */
inline Is8vec16 sat_add(const Is8vec16 &x, const Is8vec16 &y) { return _mm_adds_epi8(x, y); }
inline Is8vec16 sat_sub(const Is8vec16 &x, const Is8vec16 &y) { return _mm_subs_epi8(x, y); }
577 
578 class Iu8vec16 : public I8vec16
579 {
580 public:
Iu8vec16()581   Iu8vec16() { }
Iu8vec16(__m128i mm)582   Iu8vec16(__m128i mm) : I8vec16(mm) { }
583 
584   Iu8vec16& operator= (const M128 &a) { return *this = (Iu8vec16) a; }
585 
586   Iu8vec16& operator&=(const M128 &a) { return *this = (Iu8vec16) _mm_and_si128(vec,a); }
587   Iu8vec16& operator|=(const M128 &a) { return *this = (Iu8vec16) _mm_or_si128(vec,a); }
588   Iu8vec16& operator^=(const M128 &a) { return *this = (Iu8vec16) _mm_xor_si128(vec,a); }
589 
590   Iu8vec16& operator +=(const I8vec16 &a) { return *this = (Iu8vec16) _mm_add_epi8(vec,a); }
591   Iu8vec16& operator -=(const I8vec16 &a) { return *this = (Iu8vec16) _mm_sub_epi8(vec,a); }
592 
593 #if defined(_ENABLE_VEC_DEBUG)
594 
595   friend std::ostream& operator << (std::ostream &os,const Iu8vec16 &a)
596   {
597     os << "[15]:" << unsigned short(_MM_16UB(15,a))
598       << " [14]:" << unsigned short(_MM_16UB(14,a))
599       << " [13]:" << unsigned short(_MM_16UB(13,a))
600       << " [12]:" << unsigned short(_MM_16UB(12,a))
601       << " [11]:" << unsigned short(_MM_16UB(11,a))
602       << " [10]:" << unsigned short(_MM_16UB(10,a))
603       << " [9]:" << unsigned short(_MM_16UB(9,a))
604       << " [8]:" << unsigned short(_MM_16UB(8,a))
605       << " [7]:" << unsigned short(_MM_16UB(7,a))
606       << " [6]:" << unsigned short(_MM_16UB(6,a))
607       << " [5]:" << unsigned short(_MM_16UB(5,a))
608       << " [4]:" << unsigned short(_MM_16UB(4,a))
609       << " [3]:" << unsigned short(_MM_16UB(3,a))
610       << " [2]:" << unsigned short(_MM_16UB(2,a))
611       << " [1]:" << unsigned short(_MM_16UB(1,a))
612       << " [0]:" << unsigned short(_MM_16UB(0,a));
613     return os;
614   }
615 #endif
616 
617   const unsigned char& operator[](int i)const
618   {
619     assert(static_cast<unsigned int>(i) < 16);
620     return _MM_16UB(i,vec);
621   }
622 
623   unsigned char& operator[](int i)
624   {
625     assert(static_cast<unsigned int>(i) < 16);
626     return _MM_16UB(i,vec);
627   }
628 
629 };
630 
cmpeq(const Iu8vec16 & a,const Iu8vec16 & b)631 inline Iu8vec16 cmpeq(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
cmpneq(const Iu8vec16 & a,const Iu8vec16 & b)632 inline Iu8vec16 cmpneq(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b),get_mask128()); }
633 
unpack_low(const Iu8vec16 & a,const Iu8vec16 & b)634 inline Iu8vec16 unpack_low(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
unpack_high(const Iu8vec16 & a,const Iu8vec16 & b)635 inline Iu8vec16 unpack_high(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_unpackhi_epi8(a,b); }
636 
sat_add(const Iu8vec16 & a,const Iu8vec16 & b)637 inline Iu8vec16 sat_add(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_adds_epu8(a,b); }
sat_sub(const Iu8vec16 & a,const Iu8vec16 & b)638 inline Iu8vec16 sat_sub(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_subs_epu8(a,b); }
639 
sum_abs(const Iu8vec16 & a,const Iu8vec16 & b)640 inline I64vec2 sum_abs(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_sad_epu8(a,b); }
641 
simd_avg(const Iu8vec16 & a,const Iu8vec16 & b)642 inline Iu8vec16 simd_avg(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_avg_epu8(a,b); }
simd_max(const Iu8vec16 & a,const Iu8vec16 & b)643 inline Iu8vec16 simd_max(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_max_epu8(a,b); }
simd_min(const Iu8vec16 & a,const Iu8vec16 & b)644 inline Iu8vec16 simd_min(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_min_epu8(a,b); }
645 
pack_sat(const Is32vec4 & a,const Is32vec4 & b)646 inline Is16vec8 pack_sat(const Is32vec4 &a,const Is32vec4 &b) { return _mm_packs_epi32(a,b); }
pack_sat(const Is16vec8 & a,const Is16vec8 & b)647 inline Is8vec16 pack_sat(const Is16vec8 &a,const Is16vec8 &b) { return _mm_packs_epi16(a,b); }
packu_sat(const Is16vec8 & a,const Is16vec8 & b)648 inline Iu8vec16 packu_sat(const Is16vec8 &a,const Is16vec8 &b) { return _mm_packus_epi16(a,b);}
649 
650 #define IVEC128_LOGICALS(vect,element) inline I##vect##vec##element operator& (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_and_si128(a,b); } inline I##vect##vec##element operator| (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_or_si128(a,b); } inline I##vect##vec##element operator^ (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_xor_si128(a,b); } inline I##vect##vec##element andnot (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_andnot_si128(a,b); }
651 
652 IVEC128_LOGICALS(8,16)
653 IVEC128_LOGICALS(u8,16)
654 IVEC128_LOGICALS(s8,16)
655 IVEC128_LOGICALS(16,8)
656 IVEC128_LOGICALS(u16,8)
657 IVEC128_LOGICALS(s16,8)
658 IVEC128_LOGICALS(32,4)
659 IVEC128_LOGICALS(u32,4)
660 IVEC128_LOGICALS(s32,4)
661 IVEC128_LOGICALS(64,2)
662 IVEC128_LOGICALS(128,1)
663 #undef IVEC128_LOGICALS
664 
665 #define IVEC128_ADD_SUB(vect,element,opsize) inline I##vect##vec##element operator+ (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_add_##opsize(a,b); } inline I##vect##vec##element operator- (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_sub_##opsize(a,b); }
666 
667 IVEC128_ADD_SUB(8,16,epi8)
668 IVEC128_ADD_SUB(u8,16,epi8)
669 IVEC128_ADD_SUB(s8,16,epi8)
670 IVEC128_ADD_SUB(16,8,epi16)
671 IVEC128_ADD_SUB(u16,8,epi16)
672 IVEC128_ADD_SUB(s16,8,epi16)
673 IVEC128_ADD_SUB(32,4,epi32)
674 IVEC128_ADD_SUB(u32,4,epi32)
675 IVEC128_ADD_SUB(s32,4,epi32)
676 IVEC128_ADD_SUB(64,2,epi64)
677 #undef IVEC128_ADD_SUB
678 
/* IVEC128_SELECT(vect12, vect34, element, selop, arg1, arg2)
   Generates select_<selop>(a, b, c, d): an element-wise conditional move that
   returns c in every lane where cmp<selop>(a, b) holds and d elsewhere.
   The comparison yields an all-ones/all-zeros mask per lane, which is then
   used to blend c and d via (mask & c) | (~mask & d).
   vect12 is the flavour of the compared operands a/b; vect34 is the flavour
   of the selected values c/d and of the result.  arg1/arg2 are always
   instantiated as c,d in the expansions below. */
#define IVEC128_SELECT(vect12,vect34,element,selop,arg1,arg2) inline I##vect34##vec##element select_##selop (const I##vect12##vec##element &a,const I##vect12##vec##element &b,const I##vect34##vec##element &c,const I##vect34##vec##element &d) { I##vect12##vec##element mask = cmp##selop(a,b); return(I##vect34##vec##element ((mask & arg1) | I##vect12##vec##element ((_mm_andnot_si128(mask,arg2))))); }
/* Equality / inequality selects: available for plain, signed and unsigned
   8-bit flavours. */
IVEC128_SELECT(8,s8,16,eq,c,d)
IVEC128_SELECT(8,u8,16,eq,c,d)
IVEC128_SELECT(8,8,16,eq,c,d)
IVEC128_SELECT(8,s8,16,neq,c,d)
IVEC128_SELECT(8,u8,16,neq,c,d)
IVEC128_SELECT(8,8,16,neq,c,d)

/* Same for 16-bit lanes. */
IVEC128_SELECT(16,s16,8,eq,c,d)
IVEC128_SELECT(16,u16,8,eq,c,d)
IVEC128_SELECT(16,16,8,eq,c,d)
IVEC128_SELECT(16,s16,8,neq,c,d)
IVEC128_SELECT(16,u16,8,neq,c,d)
IVEC128_SELECT(16,16,8,neq,c,d)

/* Same for 32-bit lanes. */
IVEC128_SELECT(32,s32,4,eq,c,d)
IVEC128_SELECT(32,u32,4,eq,c,d)
IVEC128_SELECT(32,32,4,eq,c,d)
IVEC128_SELECT(32,s32,4,neq,c,d)
IVEC128_SELECT(32,u32,4,neq,c,d)
IVEC128_SELECT(32,32,4,neq,c,d)

/* Ordering selects (gt/lt): only the signed flavours are instantiated here,
   so the comparison is always a signed one. */
IVEC128_SELECT(s8,s8,16,gt,c,d)
IVEC128_SELECT(s8,u8,16,gt,c,d)
IVEC128_SELECT(s8,8,16,gt,c,d)
IVEC128_SELECT(s8,s8,16,lt,c,d)
IVEC128_SELECT(s8,u8,16,lt,c,d)
IVEC128_SELECT(s8,8,16,lt,c,d)

IVEC128_SELECT(s16,s16,8,gt,c,d)
IVEC128_SELECT(s16,u16,8,gt,c,d)
IVEC128_SELECT(s16,16,8,gt,c,d)
IVEC128_SELECT(s16,s16,8,lt,c,d)
IVEC128_SELECT(s16,u16,8,lt,c,d)
IVEC128_SELECT(s16,16,8,lt,c,d)

#undef IVEC128_SELECT
716 
/* F64vec2: two packed double-precision values in one SSE2 __m128d register.
   Element [0] is the low half of the register, element [1] the high half;
   note that the two-argument constructor takes them in (high, low) order,
   mirroring _mm_set_pd. */
class F64vec2
{
protected:
  __m128d vec;   /* the wrapped SSE2 register */
public:

  /* Default constructor leaves the contents uninitialized. */
  F64vec2() {}

  F64vec2(__m128d m) { vec = m;}

  /* d1 becomes element [1], d0 becomes element [0]. */
  F64vec2(double d1,double d0) { vec= _mm_set_pd(d1,d0); }

  /* Broadcast a single scalar into both elements. */
  explicit F64vec2(double d) { vec = _mm_set1_pd(d); }

  operator __m128d() const { return vec; }

  /* Bitwise logicals on the raw register representation. */
  friend F64vec2 operator &(const F64vec2 &a,const F64vec2 &b) { return _mm_and_pd(a,b); }
  friend F64vec2 operator |(const F64vec2 &a,const F64vec2 &b) { return _mm_or_pd(a,b); }
  friend F64vec2 operator ^(const F64vec2 &a,const F64vec2 &b) { return _mm_xor_pd(a,b); }

  /* Element-wise arithmetic. */
  friend F64vec2 operator +(const F64vec2 &a,const F64vec2 &b) { return _mm_add_pd(a,b); }
  friend F64vec2 operator -(const F64vec2 &a,const F64vec2 &b) { return _mm_sub_pd(a,b); }
  friend F64vec2 operator *(const F64vec2 &a,const F64vec2 &b) { return _mm_mul_pd(a,b); }
  friend F64vec2 operator /(const F64vec2 &a,const F64vec2 &b) { return _mm_div_pd(a,b); }

  /* Compound assignment.  The right-hand side is only read, so it is taken
     by const reference — this also lets temporaries bind, e.g. v += a + b,
     which the previous non-const signatures rejected. */
  F64vec2& operator +=(const F64vec2 &a) { return *this = _mm_add_pd(vec,a); }
  F64vec2& operator -=(const F64vec2 &a) { return *this = _mm_sub_pd(vec,a); }
  F64vec2& operator *=(const F64vec2 &a) { return *this = _mm_mul_pd(vec,a); }
  F64vec2& operator /=(const F64vec2 &a) { return *this = _mm_div_pd(vec,a); }
  F64vec2& operator &=(const F64vec2 &a) { return *this = _mm_and_pd(vec,a); }
  F64vec2& operator |=(const F64vec2 &a) { return *this = _mm_or_pd(vec,a); }
  F64vec2& operator ^=(const F64vec2 &a) { return *this = _mm_xor_pd(vec,a); }

  /* Horizontal add: returns a[0] + a[1].  The argument is read-only, so it
     is taken by const reference (also allows temporaries). */
  friend double add_horizontal(const F64vec2 &a)
  {
    F64vec2 ftemp = _mm_add_sd(a,_mm_shuffle_pd(a,a,1));
    return ftemp[0];
  }

  /* Bitwise (~a) & b. */
  friend F64vec2 andnot(const F64vec2 &a,const F64vec2 &b) { return _mm_andnot_pd(a,b); }

  /* Element-wise square root. */
  friend F64vec2 sqrt(const F64vec2 &a) { return _mm_sqrt_pd(a); }

  /* Element-wise comparisons: each result element is all-ones where the
     predicate holds and all-zeros otherwise. */
#define F64vec2_COMP(op) friend F64vec2 cmp##op (const F64vec2 &a,const F64vec2 &b) { return _mm_cmp##op##_pd(a,b); }
  F64vec2_COMP(eq)
    F64vec2_COMP(lt)
    F64vec2_COMP(le)
    F64vec2_COMP(gt)
    F64vec2_COMP(ge)
    F64vec2_COMP(ngt)
    F64vec2_COMP(nge)
    F64vec2_COMP(neq)
    F64vec2_COMP(nlt)
    F64vec2_COMP(nle)
#undef F64vec2_COMP

  /* Element-wise minimum / maximum. */
  friend F64vec2 simd_min(const F64vec2 &a,const F64vec2 &b) { return _mm_min_pd(a,b); }
  friend F64vec2 simd_max(const F64vec2 &a,const F64vec2 &b) { return _mm_max_pd(a,b); }

  /* Scalar comparisons of element [0] only; return 0 or 1. */
#define F64vec2_COMI(op) friend int comi##op (const F64vec2 &a,const F64vec2 &b) { return _mm_comi##op##_sd(a,b); }
  F64vec2_COMI(eq)
    F64vec2_COMI(lt)
    F64vec2_COMI(le)
    F64vec2_COMI(gt)
    F64vec2_COMI(ge)
    F64vec2_COMI(neq)
#undef F64vec2_COMI

  /* Unordered variants of the scalar comparisons (quiet with respect to
     QNaN operands). */
#define F64vec2_UCOMI(op) friend int ucomi##op (const F64vec2 &a,const F64vec2 &b) { return _mm_ucomi##op##_sd(a,b); }
    F64vec2_UCOMI(eq)
    F64vec2_UCOMI(lt)
    F64vec2_UCOMI(le)
    F64vec2_UCOMI(gt)
    F64vec2_UCOMI(ge)
    F64vec2_UCOMI(neq)
#undef F64vec2_UCOMI

#if defined(_ENABLE_VEC_DEBUG)

  /* Debug dump: prints element [1] first, then element [0]. */
  friend std::ostream & operator<<(std::ostream & os,const F64vec2 &a) {
    double *dp = (double*)&a;
    os << " [1]:" << *(dp+1)
      << " [0]:" << *dp;
    return os;
  }
#endif

  /* Element access; i must be 0 or 1 (checked only when assert is active). */
  const double &operator[](int i) const {
    assert((0 <= i) && (i <= 1));
    double *dp = (double*)&vec;
    return *(dp+i);
  }

  double &operator[](int i) {
    assert((0 <= i) && (i <= 1));
    double *dp = (double*)&vec;
    return *(dp+i);
  }
};
816 
unpack_low(const F64vec2 & a,const F64vec2 & b)817 inline F64vec2 unpack_low(const F64vec2 &a,const F64vec2 &b) { return _mm_unpacklo_pd(a,b); }
unpack_high(const F64vec2 & a,const F64vec2 & b)818 inline F64vec2 unpack_high(const F64vec2 &a,const F64vec2 &b) { return _mm_unpackhi_pd(a,b); }
move_mask(const F64vec2 & a)819 inline int move_mask(const F64vec2 &a) { return _mm_movemask_pd(a); }
loadu(F64vec2 & a,double * p)820 inline void loadu(F64vec2 &a,double *p) { a = _mm_loadu_pd(p); }
storeu(double * p,const F64vec2 & a)821 inline void storeu(double *p,const F64vec2 &a) { _mm_storeu_pd(p,a); }
store_nta(double * p,F64vec2 & a)822 inline void store_nta(double *p,F64vec2 &a) { _mm_stream_pd(p,a); }
823 
824 #define F64vec2_SELECT(op) inline F64vec2 select_##op (const F64vec2 &a,const F64vec2 &b,const F64vec2 &c,const F64vec2 &d) { F64vec2 mask = _mm_cmp##op##_pd(a,b); return((mask & c) | F64vec2((_mm_andnot_pd(mask,d)))); }
825 F64vec2_SELECT(eq)
F64vec2_SELECT(lt)826 F64vec2_SELECT(lt)
827 F64vec2_SELECT(le)
828 F64vec2_SELECT(gt)
829 F64vec2_SELECT(ge)
830 F64vec2_SELECT(neq)
831 F64vec2_SELECT(nlt)
832 F64vec2_SELECT(nle)
833 #undef F64vec2_SELECT
834 
835 inline int F64vec2ToInt(const F64vec2 &a) { return _mm_cvttsd_si32(a); }
F32vec4ToF64vec2(const F32vec4 & a)836 inline F64vec2 F32vec4ToF64vec2(const F32vec4 &a) { return _mm_cvtps_pd(a); }
F64vec2ToF32vec4(const F64vec2 & a)837 inline F32vec4 F64vec2ToF32vec4(const F64vec2 &a) { return _mm_cvtpd_ps(a); }
IntToF64vec2(const F64vec2 & a,int b)838 inline F64vec2 IntToF64vec2(const F64vec2 &a,int b) { return _mm_cvtsi32_sd(a,b); }
839 
840 #pragma pack(pop)
841 #pragma pack(pop)
842 #endif
843 #endif
844