1 /**
2  * This file has no copyright assigned and is placed in the Public Domain.
3  * This file is part of the mingw-w64 runtime package.
4  * No warranty is given; refer to the file DISCLAIMER.PD within this package.
5  */
6 #ifndef _DVEC_H_INCLUDED
7 #define _DVEC_H_INCLUDED
8 #ifndef RC_INVOKED
9 
10 #if !defined __cplusplus
11 #error This file is only supported in C++ compilations!
12 #endif
13 
14 #include <intrin.h>
15 #include <assert.h>
16 #include <fvec.h>
17 #include <crtdefs.h>
18 
19 #pragma pack(push,_CRT_PACKING)
20 
21 #if defined(_ENABLE_VEC_DEBUG)
22 #include <iostream>
23 #endif
24 
25 #ifdef __SSE__
26 
27 #pragma pack(push,16)
28 
29 #define EXPLICIT explicit
30 
31 class I8vec16;
32 class Is8vec16;
33 class Iu8vec16;
34 class I16vec8;
35 class Is16vec8;
36 class Iu16vec8;
37 class I32vec4;
38 class Is32vec4;
39 class Iu32vec4;
40 class I64vec2;
41 class I128vec1;
42 
43 #define _MM_16UB(element,vector) (*((unsigned char*)&(vector) + (element)))
44 #define _MM_16B(element,vector) (*((signed char*)&(vector) + (element)))
45 
46 #define _MM_8UW(element,vector) (*((unsigned short*)&(vector) + (element)))
47 #define _MM_8W(element,vector) (*((short*)&(vector) + (element)))
48 
49 #define _MM_4UDW(element,vector) (*((unsigned int*)&(vector) + (element)))
50 #define _MM_4DW(element,vector) (*((int*)&(vector) + (element)))
51 
52 #define _MM_2QW(element,vector) (*((__int64*)&(vector) + (element)))
53 
/**
 * Returns an __m128i with all 128 bits set; used as the all-ones mask
 * by the cmpneq() helpers throughout this header.
 *
 * Fix: the previous implementation built the constant with
 * _mm_set1_epi64(M64((__int64)0xffffffffffffffffll)), which goes
 * through the MMX __m64 type.  MMX intrinsics are unavailable on
 * x64 MSVC targets and needlessly touch the x87/MMX state.
 * _mm_set1_epi32(-1) yields the identical all-ones bit pattern using
 * SSE2 only.
 */
inline const __m128i get_mask128()
{
  static const __m128i mask128 = _mm_set1_epi32(-1);
  return mask128;
}
59 
/**
 * M128 -- thin wrapper around the raw __m128i type.  Base class of all
 * the 128-bit integer vector classes in this header: it holds the
 * register value and supplies the bitwise compound assignments that do
 * not depend on element width.
 */
class M128
{
protected:
  __m128i vec;   // underlying 128-bit SSE2 value

public:
  M128() { }
  M128(__m128i mm) : vec(mm) { }

  // Implicit conversion back to the raw intrinsic type.
  operator __m128i() const { return vec; }

  // Bitwise AND / OR / XOR with any other 128-bit vector.
  M128& operator&=(const M128 &a) { vec = _mm_and_si128(vec, a); return *this; }
  M128& operator|=(const M128 &a) { vec = _mm_or_si128(vec, a); return *this; }
  M128& operator^=(const M128 &a) { vec = _mm_xor_si128(vec, a); return *this; }
};
76 
/* Binary bitwise operations on M128 values; each forwards straight to
   the corresponding SSE2 intrinsic. */
inline M128 operator&(const M128 &x, const M128 &y)
{
  return _mm_and_si128(x, y);
}
inline M128 operator|(const M128 &x, const M128 &y)
{
  return _mm_or_si128(x, y);
}
inline M128 operator^(const M128 &x, const M128 &y)
{
  return _mm_xor_si128(x, y);
}
/* andnot(a,b) computes (~a) & b, matching PANDN semantics. */
inline M128 andnot(const M128 &x, const M128 &y)
{
  return _mm_andnot_si128(x, y);
}
81 
82 class I128vec1 : public M128
83 {
84 public:
I128vec1()85   I128vec1() { }
I128vec1(__m128i mm)86   I128vec1(__m128i mm) : M128(mm) { }
87 
88   I128vec1& operator= (const M128 &a) { return *this = (I128vec1) a; }
89   I128vec1& operator&=(const M128 &a) { return *this = (I128vec1) _mm_and_si128(vec,a); }
90   I128vec1& operator|=(const M128 &a) { return *this = (I128vec1) _mm_or_si128(vec,a); }
91   I128vec1& operator^=(const M128 &a) { return *this = (I128vec1) _mm_xor_si128(vec,a); }
92 
93 };
94 
/**
 * I64vec2 -- two 64-bit integer elements ("quadwords") in one __m128i.
 * Signedness is unspecified; only logical shifts are provided (SSE2
 * has no 64-bit arithmetic shift instruction).
 */
class I64vec2 : public M128
{
public:
  I64vec2() { }
  I64vec2(__m128i mm) : M128(mm) { }

  /* Build from two MMX __m64 quadwords; q0 becomes the low element.
     The __m64 bits are reinterpreted via pointer punning. */
  __MINGW_EXTENSION I64vec2(__m64 q1,__m64 q0)
  {
    _MM_2QW(0,vec) = *(__int64*)&q0;
    _MM_2QW(1,vec) = *(__int64*)&q1;
  }

  I64vec2& operator= (const M128 &a) { return *this = (I64vec2) a; }

  /* Bitwise logic. */
  I64vec2& operator&=(const M128 &a) { return *this = (I64vec2) _mm_and_si128(vec,a); }
  I64vec2& operator|=(const M128 &a) { return *this = (I64vec2) _mm_or_si128(vec,a); }
  I64vec2& operator^=(const M128 &a) { return *this = (I64vec2) _mm_xor_si128(vec,a); }

  /* Element-wise wrap-around 64-bit add / subtract. */
  I64vec2& operator +=(const I64vec2 &a) { return *this = (I64vec2) _mm_add_epi64(vec,a); }
  I64vec2& operator -=(const I64vec2 &a) { return *this = (I64vec2) _mm_sub_epi64(vec,a); }

  /* Logical shifts; the vector forms shift both elements by the count
     held in the low quadword of the argument. */
  I64vec2 operator<<(const I64vec2 &a) { return _mm_sll_epi64(vec,a); }
  I64vec2 operator<<(int count) { return _mm_slli_epi64(vec,count); }
  I64vec2& operator<<=(const I64vec2 &a) { return *this = (I64vec2) _mm_sll_epi64(vec,a); }
  I64vec2& operator<<=(int count) { return *this = (I64vec2) _mm_slli_epi64(vec,count); }
  I64vec2 operator>>(const I64vec2 &a) { return _mm_srl_epi64(vec,a); }
  I64vec2 operator>>(int count) { return _mm_srli_epi64(vec,count); }
  I64vec2& operator>>=(const I64vec2 &a) { return *this = (I64vec2) _mm_srl_epi64(vec,a); }
  I64vec2& operator>>=(int count) { return *this = (I64vec2) _mm_srli_epi64(vec,count); }

  /* Assert-checked lane access; index 0 is the low quadword. */
  __MINGW_EXTENSION const __int64& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 2);
    return _MM_2QW(i,vec);
  }

  __MINGW_EXTENSION __int64& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 2);
    return _MM_2QW(i,vec);
  }

};
138 
/* Interleave the low / high quadwords of the two vectors. */
inline I64vec2 unpack_low(const I64vec2 &x, const I64vec2 &y)
{
  return _mm_unpacklo_epi64(x, y);
}
inline I64vec2 unpack_high(const I64vec2 &x, const I64vec2 &y)
{
  return _mm_unpackhi_epi64(x, y);
}
141 
142 class I32vec4 : public M128
143 {
144 public:
I32vec4()145   I32vec4() { }
I32vec4(__m128i mm)146   I32vec4(__m128i mm) : M128(mm) { }
147 
148   I32vec4& operator= (const M128 &a) { return *this = (I32vec4) a; }
149 
150   I32vec4& operator&=(const M128 &a) { return *this = (I32vec4) _mm_and_si128(vec,a); }
151   I32vec4& operator|=(const M128 &a) { return *this = (I32vec4) _mm_or_si128(vec,a); }
152   I32vec4& operator^=(const M128 &a) { return *this = (I32vec4) _mm_xor_si128(vec,a); }
153 
154   I32vec4& operator +=(const I32vec4 &a) { return *this = (I32vec4)_mm_add_epi32(vec,a); }
155   I32vec4& operator -=(const I32vec4 &a) { return *this = (I32vec4)_mm_sub_epi32(vec,a); }
156 
157   I32vec4 operator<<(const I32vec4 &a) { return _mm_sll_epi32(vec,a); }
158   I32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
159   I32vec4& operator<<=(const I32vec4 &a) { return *this = (I32vec4)_mm_sll_epi32(vec,a); }
160   I32vec4& operator<<=(int count) { return *this = (I32vec4)_mm_slli_epi32(vec,count); }
161 
162 };
163 
/* Element-wise 32-bit compares: cmpeq yields all-ones lanes where the
   elements match; cmpneq is the complement, formed by ANDNOT-ing the
   equality result with the all-ones mask. */
inline I32vec4 cmpeq(const I32vec4 &x, const I32vec4 &y)
{
  return _mm_cmpeq_epi32(x, y);
}
inline I32vec4 cmpneq(const I32vec4 &x, const I32vec4 &y)
{
  return _mm_andnot_si128(_mm_cmpeq_epi32(x, y), get_mask128());
}

/* Interleave the low / high doubleword halves of the two vectors. */
inline I32vec4 unpack_low(const I32vec4 &x, const I32vec4 &y)
{
  return _mm_unpacklo_epi32(x, y);
}
inline I32vec4 unpack_high(const I32vec4 &x, const I32vec4 &y)
{
  return _mm_unpackhi_epi32(x, y);
}
169 
/**
 * Is32vec4 -- four signed 32-bit integer elements.  Extends I32vec4
 * with the signed-specific arithmetic right shift, a debug stream
 * inserter and assert-checked lane access.
 */
class Is32vec4 : public I32vec4
{
public:
  Is32vec4() { }
  Is32vec4(__m128i mm) : I32vec4(mm) { }
  /* Elements are listed high-to-low: i0 initialises the lowest lane. */
  Is32vec4(int i3,int i2,int i1,int i0)
  {
    _MM_4DW(0,vec) = i0;
    _MM_4DW(1,vec) = i1;
    _MM_4DW(2,vec) = i2;
    _MM_4DW(3,vec) = i3;
  }

  Is32vec4& operator= (const M128 &a) { return *this = (Is32vec4) a; }

  /* Bitwise logic. */
  Is32vec4& operator&=(const M128 &a) { return *this = (Is32vec4) _mm_and_si128(vec,a); }
  Is32vec4& operator|=(const M128 &a) { return *this = (Is32vec4) _mm_or_si128(vec,a); }
  Is32vec4& operator^=(const M128 &a) { return *this = (Is32vec4) _mm_xor_si128(vec,a); }

  /* Element-wise wrap-around arithmetic. */
  Is32vec4& operator +=(const I32vec4 &a) { return *this = (Is32vec4)_mm_add_epi32(vec,a); }
  Is32vec4& operator -=(const I32vec4 &a) { return *this = (Is32vec4)_mm_sub_epi32(vec,a); }

  /* Left shifts are logical; the vector form takes the count from the
     low quadword of the argument. */
  Is32vec4 operator<<(const M128 &a) { return _mm_sll_epi32(vec,a); }
  Is32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
  Is32vec4& operator<<=(const M128 &a) { return *this = (Is32vec4)_mm_sll_epi32(vec,a); }
  Is32vec4& operator<<=(int count) { return *this = (Is32vec4)_mm_slli_epi32(vec,count); }

  /* Right shifts are arithmetic (sign-extending). */
  Is32vec4 operator>>(const M128 &a) { return _mm_sra_epi32(vec,a); }
  Is32vec4 operator>>(int count) { return _mm_srai_epi32(vec,count); }
  Is32vec4& operator>>=(const M128 &a) { return *this = (Is32vec4) _mm_sra_epi32(vec,a); }
  Is32vec4& operator>>=(int count) { return *this = (Is32vec4) _mm_srai_epi32(vec,count); }

#if defined(_ENABLE_VEC_DEBUG)

  /* Debug-only stream inserter; prints lanes high-to-low. */
  friend std::ostream& operator<< (std::ostream &os,const Is32vec4 &a)
  {
    os << "[3]:" << _MM_4DW(3,a)
      << " [2]:" << _MM_4DW(2,a)
      << " [1]:" << _MM_4DW(1,a)
      << " [0]:" << _MM_4DW(0,a);
    return os;
  }
#endif

  /* Assert-checked lane access; index 0 is the lowest lane. */
  const int& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 4);
    return _MM_4DW(i,vec);
  }

  int& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 4);
    return _MM_4DW(i,vec);
  }
};
226 
/* Signed 32-bit element-wise compares; each lane becomes all-ones when
   the relation holds, all-zeros otherwise.  cmplt reuses cmpgt with
   the operands swapped. */
inline Is32vec4 cmpeq(const Is32vec4 &x, const Is32vec4 &y)
{
  return _mm_cmpeq_epi32(x, y);
}
inline Is32vec4 cmpneq(const Is32vec4 &x, const Is32vec4 &y)
{
  return _mm_andnot_si128(_mm_cmpeq_epi32(x, y), get_mask128());
}
inline Is32vec4 cmpgt(const Is32vec4 &x, const Is32vec4 &y)
{
  return _mm_cmpgt_epi32(x, y);
}
inline Is32vec4 cmplt(const Is32vec4 &x, const Is32vec4 &y)
{
  return _mm_cmpgt_epi32(y, x);
}

/* Interleave the low / high doubleword halves of the two vectors. */
inline Is32vec4 unpack_low(const Is32vec4 &x, const Is32vec4 &y)
{
  return _mm_unpacklo_epi32(x, y);
}
inline Is32vec4 unpack_high(const Is32vec4 &x, const Is32vec4 &y)
{
  return _mm_unpackhi_epi32(x, y);
}
234 
/**
 * Iu32vec4 -- four unsigned 32-bit integer elements.  Extends I32vec4
 * with logical right shifts, a debug stream inserter and
 * assert-checked lane access.
 */
class Iu32vec4 : public I32vec4
{
public:
  Iu32vec4() { }
  Iu32vec4(__m128i mm) : I32vec4(mm) { }
  /* Elements are listed high-to-low: ui0 initialises the lowest lane. */
  Iu32vec4(unsigned int ui3,unsigned int ui2,unsigned int ui1,unsigned int ui0)
  {
    _MM_4UDW(0,vec) = ui0;
    _MM_4UDW(1,vec) = ui1;
    _MM_4UDW(2,vec) = ui2;
    _MM_4UDW(3,vec) = ui3;
  }

  Iu32vec4& operator= (const M128 &a) { return *this = (Iu32vec4) a; }

  /* Bitwise logic. */
  Iu32vec4& operator&=(const M128 &a) { return *this = (Iu32vec4) _mm_and_si128(vec,a); }
  Iu32vec4& operator|=(const M128 &a) { return *this = (Iu32vec4) _mm_or_si128(vec,a); }
  Iu32vec4& operator^=(const M128 &a) { return *this = (Iu32vec4) _mm_xor_si128(vec,a); }

  /* Element-wise wrap-around arithmetic. */
  Iu32vec4& operator +=(const I32vec4 &a) { return *this = (Iu32vec4)_mm_add_epi32(vec,a); }
  Iu32vec4& operator -=(const I32vec4 &a) { return *this = (Iu32vec4)_mm_sub_epi32(vec,a); }

  /* Both shift directions are logical for the unsigned type; vector
     forms take the count from the low quadword of the argument. */
  Iu32vec4 operator<<(const M128 &a) { return _mm_sll_epi32(vec,a); }
  Iu32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
  Iu32vec4& operator<<=(const M128 &a) { return *this = (Iu32vec4)_mm_sll_epi32(vec,a); }
  Iu32vec4& operator<<=(int count) { return *this = (Iu32vec4)_mm_slli_epi32(vec,count); }
  Iu32vec4 operator>>(const M128 &a) { return _mm_srl_epi32(vec,a); }
  Iu32vec4 operator>>(int count) { return _mm_srli_epi32(vec,count); }
  Iu32vec4& operator>>=(const M128 &a) { return *this = (Iu32vec4) _mm_srl_epi32(vec,a); }
  Iu32vec4& operator>>=(int count) { return *this = (Iu32vec4) _mm_srli_epi32(vec,count); }

#if defined(_ENABLE_VEC_DEBUG)

  /* Debug-only stream inserter; prints lanes high-to-low. */
  friend std::ostream& operator<< (std::ostream &os,const Iu32vec4 &a)
  {
    os << "[3]:" << _MM_4UDW(3,a)
      << " [2]:" << _MM_4UDW(2,a)
      << " [1]:" << _MM_4UDW(1,a)
      << " [0]:" << _MM_4UDW(0,a);
    return os;
  }
#endif

  /* Assert-checked lane access; index 0 is the lowest lane. */
  const unsigned int& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 4);
    return _MM_4UDW(i,vec);
  }

  unsigned int& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 4);
    return _MM_4UDW(i,vec);
  }
};
290 
291 inline I64vec2 operator*(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_mul_epu32(a,b); }
cmpeq(const Iu32vec4 & a,const Iu32vec4 & b)292 inline Iu32vec4 cmpeq(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
cmpneq(const Iu32vec4 & a,const Iu32vec4 & b)293 inline Iu32vec4 cmpneq(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b),get_mask128()); }
294 
unpack_low(const Iu32vec4 & a,const Iu32vec4 & b)295 inline Iu32vec4 unpack_low(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
unpack_high(const Iu32vec4 & a,const Iu32vec4 & b)296 inline Iu32vec4 unpack_high(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
297 
298 class I16vec8 : public M128
299 {
300 public:
I16vec8()301   I16vec8() { }
I16vec8(__m128i mm)302   I16vec8(__m128i mm) : M128(mm) { }
303 
304   I16vec8& operator= (const M128 &a) { return *this = (I16vec8) a; }
305 
306   I16vec8& operator&=(const M128 &a) { return *this = (I16vec8) _mm_and_si128(vec,a); }
307   I16vec8& operator|=(const M128 &a) { return *this = (I16vec8) _mm_or_si128(vec,a); }
308   I16vec8& operator^=(const M128 &a) { return *this = (I16vec8) _mm_xor_si128(vec,a); }
309 
310   I16vec8& operator +=(const I16vec8 &a) { return *this = (I16vec8) _mm_add_epi16(vec,a); }
311   I16vec8& operator -=(const I16vec8 &a) { return *this = (I16vec8) _mm_sub_epi16(vec,a); }
312   I16vec8& operator *=(const I16vec8 &a) { return *this = (I16vec8) _mm_mullo_epi16(vec,a); }
313 
314   I16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
315   I16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
316   I16vec8& operator<<=(const M128 &a) { return *this = (I16vec8)_mm_sll_epi16(vec,a); }
317   I16vec8& operator<<=(int count) { return *this = (I16vec8)_mm_slli_epi16(vec,count); }
318 
319 };
320 
/* Low-half product of each pair of 16-bit lanes (PMULLW). */
inline I16vec8 operator*(const I16vec8 &x, const I16vec8 &y)
{
  return _mm_mullo_epi16(x, y);
}

/* Element-wise 16-bit equality compares. */
inline I16vec8 cmpeq(const I16vec8 &x, const I16vec8 &y)
{
  return _mm_cmpeq_epi16(x, y);
}
inline I16vec8 cmpneq(const I16vec8 &x, const I16vec8 &y)
{
  return _mm_andnot_si128(_mm_cmpeq_epi16(x, y), get_mask128());
}

/* Interleave the low / high word halves of the two vectors. */
inline I16vec8 unpack_low(const I16vec8 &x, const I16vec8 &y)
{
  return _mm_unpacklo_epi16(x, y);
}
inline I16vec8 unpack_high(const I16vec8 &x, const I16vec8 &y)
{
  return _mm_unpackhi_epi16(x, y);
}
328 
/**
 * Is16vec8 -- eight signed 16-bit integer elements.  Extends I16vec8
 * with the arithmetic right shift, a debug stream inserter and
 * assert-checked lane access.
 */
class Is16vec8 : public I16vec8
{
public:
  Is16vec8() { }
  Is16vec8(__m128i mm) : I16vec8(mm) { }
  /* Elements are listed high-to-low: s0 initialises the lowest lane. */
  Is16vec8(signed short s7,signed short s6,signed short s5,signed short s4,signed short s3,signed short s2,signed short s1,signed short s0)
  {
    _MM_8W(0,vec) = s0;
    _MM_8W(1,vec) = s1;
    _MM_8W(2,vec) = s2;
    _MM_8W(3,vec) = s3;
    _MM_8W(4,vec) = s4;
    _MM_8W(5,vec) = s5;
    _MM_8W(6,vec) = s6;
    _MM_8W(7,vec) = s7;
  }

  Is16vec8& operator= (const M128 &a) { return *this = (Is16vec8) a; }

  /* Bitwise logic. */
  Is16vec8& operator&=(const M128 &a) { return *this = (Is16vec8) _mm_and_si128(vec,a); }
  Is16vec8& operator|=(const M128 &a) { return *this = (Is16vec8) _mm_or_si128(vec,a); }
  Is16vec8& operator^=(const M128 &a) { return *this = (Is16vec8) _mm_xor_si128(vec,a); }

  /* Element-wise wrap-around arithmetic; *= keeps the low half of each
     product. */
  Is16vec8& operator +=(const I16vec8 &a) { return *this = (Is16vec8) _mm_add_epi16(vec,a); }
  Is16vec8& operator -=(const I16vec8 &a) { return *this = (Is16vec8) _mm_sub_epi16(vec,a); }
  Is16vec8& operator *=(const I16vec8 &a) { return *this = (Is16vec8) _mm_mullo_epi16(vec,a); }

  /* Left shifts are logical; vector forms take the count from the low
     quadword of the argument. */
  Is16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
  Is16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
  Is16vec8& operator<<=(const M128 &a) { return *this = (Is16vec8)_mm_sll_epi16(vec,a); }
  Is16vec8& operator<<=(int count) { return *this = (Is16vec8)_mm_slli_epi16(vec,count); }

  /* Right shifts are arithmetic (sign-extending). */
  Is16vec8 operator>>(const M128 &a) { return _mm_sra_epi16(vec,a); }
  Is16vec8 operator>>(int count) { return _mm_srai_epi16(vec,count); }
  Is16vec8& operator>>=(const M128 &a) { return *this = (Is16vec8)_mm_sra_epi16(vec,a); }
  Is16vec8& operator>>=(int count) { return *this = (Is16vec8)_mm_srai_epi16(vec,count); }

#if defined(_ENABLE_VEC_DEBUG)

  /* Debug-only stream inserter; prints lanes high-to-low. */
  friend std::ostream& operator<< (std::ostream &os,const Is16vec8 &a)
  {
    os << "[7]:" << _MM_8W(7,a)
      << " [6]:" << _MM_8W(6,a)
      << " [5]:" << _MM_8W(5,a)
      << " [4]:" << _MM_8W(4,a)
      << " [3]:" << _MM_8W(3,a)
      << " [2]:" << _MM_8W(2,a)
      << " [1]:" << _MM_8W(1,a)
      << " [0]:" << _MM_8W(0,a);
    return os;
  }
#endif

  /* Assert-checked lane access; index 0 is the lowest lane. */
  const signed short& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 8);
    return _MM_8W(i,vec);
  }

  signed short& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 8);
    return _MM_8W(i,vec);
  }
};
394 
395 inline Is16vec8 operator*(const Is16vec8 &a,const Is16vec8 &b) { return _mm_mullo_epi16(a,b); }
396 
cmpeq(const Is16vec8 & a,const Is16vec8 & b)397 inline Is16vec8 cmpeq(const Is16vec8 &a,const Is16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
cmpneq(const Is16vec8 & a,const Is16vec8 & b)398 inline Is16vec8 cmpneq(const Is16vec8 &a,const Is16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b),get_mask128()); }
cmpgt(const Is16vec8 & a,const Is16vec8 & b)399 inline Is16vec8 cmpgt(const Is16vec8 &a,const Is16vec8 &b) { return _mm_cmpgt_epi16(a,b); }
cmplt(const Is16vec8 & a,const Is16vec8 & b)400 inline Is16vec8 cmplt(const Is16vec8 &a,const Is16vec8 &b) { return _mm_cmpgt_epi16(b,a); }
401 
unpack_low(const Is16vec8 & a,const Is16vec8 & b)402 inline Is16vec8 unpack_low(const Is16vec8 &a,const Is16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
unpack_high(const Is16vec8 & a,const Is16vec8 & b)403 inline Is16vec8 unpack_high(const Is16vec8 &a,const Is16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
404 
mul_high(const Is16vec8 & a,const Is16vec8 & b)405 inline Is16vec8 mul_high(const Is16vec8 &a,const Is16vec8 &b) { return _mm_mulhi_epi16(a,b); }
mul_add(const Is16vec8 & a,const Is16vec8 & b)406 inline Is32vec4 mul_add(const Is16vec8 &a,const Is16vec8 &b) { return _mm_madd_epi16(a,b);}
407 
sat_add(const Is16vec8 & a,const Is16vec8 & b)408 inline Is16vec8 sat_add(const Is16vec8 &a,const Is16vec8 &b) { return _mm_adds_epi16(a,b); }
sat_sub(const Is16vec8 & a,const Is16vec8 & b)409 inline Is16vec8 sat_sub(const Is16vec8 &a,const Is16vec8 &b) { return _mm_subs_epi16(a,b); }
410 
simd_max(const Is16vec8 & a,const Is16vec8 & b)411 inline Is16vec8 simd_max(const Is16vec8 &a,const Is16vec8 &b) { return _mm_max_epi16(a,b); }
simd_min(const Is16vec8 & a,const Is16vec8 & b)412 inline Is16vec8 simd_min(const Is16vec8 &a,const Is16vec8 &b) { return _mm_min_epi16(a,b); }
413 
414 class Iu16vec8 : public I16vec8
415 {
416 public:
Iu16vec8()417   Iu16vec8() { }
Iu16vec8(__m128i mm)418   Iu16vec8(__m128i mm) : I16vec8(mm) { }
Iu16vec8(unsigned short s7,unsigned short s6,unsigned short s5,unsigned short s4,unsigned short s3,unsigned short s2,unsigned short s1,unsigned short s0)419   Iu16vec8(unsigned short s7,unsigned short s6,unsigned short s5,unsigned short s4,unsigned short s3,unsigned short s2,unsigned short s1,unsigned short s0)
420   {
421     _MM_8UW(0,vec) = s0;
422     _MM_8UW(1,vec) = s1;
423     _MM_8UW(2,vec) = s2;
424     _MM_8UW(3,vec) = s3;
425     _MM_8UW(4,vec) = s4;
426     _MM_8UW(5,vec) = s5;
427     _MM_8UW(6,vec) = s6;
428     _MM_8UW(7,vec) = s7;
429   }
430 
431   Iu16vec8& operator= (const M128 &a) { return *this = (Iu16vec8) a; }
432 
433   Iu16vec8& operator&=(const M128 &a) { return *this = (Iu16vec8) _mm_and_si128(vec,a); }
434   Iu16vec8& operator|=(const M128 &a) { return *this = (Iu16vec8) _mm_or_si128(vec,a); }
435   Iu16vec8& operator^=(const M128 &a) { return *this = (Iu16vec8) _mm_xor_si128(vec,a); }
436 
437   Iu16vec8& operator +=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_add_epi16(vec,a); }
438   Iu16vec8& operator -=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_sub_epi16(vec,a); }
439   Iu16vec8& operator *=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_mullo_epi16(vec,a); }
440 
441   Iu16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
442   Iu16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
443   Iu16vec8& operator<<=(const M128 &a) { return *this = (Iu16vec8)_mm_sll_epi16(vec,a); }
444   Iu16vec8& operator<<=(int count) { return *this = (Iu16vec8)_mm_slli_epi16(vec,count); }
445   Iu16vec8 operator>>(const M128 &a) { return _mm_srl_epi16(vec,a); }
446   Iu16vec8 operator>>(int count) { return _mm_srli_epi16(vec,count); }
447   Iu16vec8& operator>>=(const M128 &a) { return *this = (Iu16vec8) _mm_srl_epi16(vec,a); }
448   Iu16vec8& operator>>=(int count) { return *this = (Iu16vec8) _mm_srli_epi16(vec,count); }
449 
450 #if defined(_ENABLE_VEC_DEBUG)
451 
452   friend std::ostream& operator << (std::ostream &os,const Iu16vec8 &a)
453   {
454     os << "[7]:" << unsigned short(_MM_8UW(7,a))
455       << " [6]:" << unsigned short(_MM_8UW(6,a))
456       << " [5]:" << unsigned short(_MM_8UW(5,a))
457       << " [4]:" << unsigned short(_MM_8UW(4,a))
458       << " [3]:" << unsigned short(_MM_8UW(3,a))
459       << " [2]:" << unsigned short(_MM_8UW(2,a))
460       << " [1]:" << unsigned short(_MM_8UW(1,a))
461       << " [0]:" << unsigned short(_MM_8UW(0,a));
462     return os;
463   }
464 #endif
465 
466   const unsigned short& operator[](int i)const
467   {
468     assert(static_cast<unsigned int>(i) < 8);
469     return _MM_8UW(i,vec);
470   }
471 
472   unsigned short& operator[](int i)
473   {
474     assert(static_cast<unsigned int>(i) < 8);
475     return _MM_8UW(i,vec);
476   }
477 };
478 
479 inline Iu16vec8 operator*(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_mullo_epi16(a,b); }
480 
cmpeq(const Iu16vec8 & a,const Iu16vec8 & b)481 inline Iu16vec8 cmpeq(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
cmpneq(const Iu16vec8 & a,const Iu16vec8 & b)482 inline Iu16vec8 cmpneq(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b),get_mask128()); }
483 
unpack_low(const Iu16vec8 & a,const Iu16vec8 & b)484 inline Iu16vec8 unpack_low(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
unpack_high(const Iu16vec8 & a,const Iu16vec8 & b)485 inline Iu16vec8 unpack_high(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
486 
sat_add(const Iu16vec8 & a,const Iu16vec8 & b)487 inline Iu16vec8 sat_add(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_adds_epu16(a,b); }
sat_sub(const Iu16vec8 & a,const Iu16vec8 & b)488 inline Iu16vec8 sat_sub(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_subs_epu16(a,b); }
489 
simd_avg(const Iu16vec8 & a,const Iu16vec8 & b)490 inline Iu16vec8 simd_avg(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_avg_epu16(a,b); }
mul_high(const Iu16vec8 & a,const Iu16vec8 & b)491 inline I16vec8 mul_high(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_mulhi_epu16(a,b); }
492 
493 class I8vec16 : public M128
494 {
495 public:
I8vec16()496   I8vec16() { }
I8vec16(__m128i mm)497   I8vec16(__m128i mm) : M128(mm) { }
498 
499   I8vec16& operator= (const M128 &a) { return *this = (I8vec16) a; }
500 
501   I8vec16& operator&=(const M128 &a) { return *this = (I8vec16) _mm_and_si128(vec,a); }
502   I8vec16& operator|=(const M128 &a) { return *this = (I8vec16) _mm_or_si128(vec,a); }
503   I8vec16& operator^=(const M128 &a) { return *this = (I8vec16) _mm_xor_si128(vec,a); }
504 
505   I8vec16& operator +=(const I8vec16 &a) { return *this = (I8vec16) _mm_add_epi8(vec,a); }
506   I8vec16& operator -=(const I8vec16 &a) { return *this = (I8vec16) _mm_sub_epi8(vec,a); }
507 
508 };
509 
/* Element-wise 8-bit equality compares. */
inline I8vec16 cmpeq(const I8vec16 &x, const I8vec16 &y)
{
  return _mm_cmpeq_epi8(x, y);
}
inline I8vec16 cmpneq(const I8vec16 &x, const I8vec16 &y)
{
  return _mm_andnot_si128(_mm_cmpeq_epi8(x, y), get_mask128());
}

/* Interleave the low / high byte halves of the two vectors. */
inline I8vec16 unpack_low(const I8vec16 &x, const I8vec16 &y)
{
  return _mm_unpacklo_epi8(x, y);
}
inline I8vec16 unpack_high(const I8vec16 &x, const I8vec16 &y)
{
  return _mm_unpackhi_epi8(x, y);
}
515 
/**
 * Is8vec16 -- sixteen signed 8-bit integer elements.  Adds a debug
 * stream inserter and assert-checked lane access to I8vec16.
 */
class Is8vec16 : public I8vec16
{
public:
  Is8vec16() { }
  Is8vec16(__m128i mm) : I8vec16(mm) { }

  Is8vec16& operator= (const M128 &a) { return *this = (Is8vec16) a; }

  /* Bitwise logic. */
  Is8vec16& operator&=(const M128 &a) { return *this = (Is8vec16) _mm_and_si128(vec,a); }
  Is8vec16& operator|=(const M128 &a) { return *this = (Is8vec16) _mm_or_si128(vec,a); }
  Is8vec16& operator^=(const M128 &a) { return *this = (Is8vec16) _mm_xor_si128(vec,a); }

  /* Element-wise wrap-around add / subtract. */
  Is8vec16& operator +=(const I8vec16 &a) { return *this = (Is8vec16) _mm_add_epi8(vec,a); }
  Is8vec16& operator -=(const I8vec16 &a) { return *this = (Is8vec16) _mm_sub_epi8(vec,a); }

#if defined(_ENABLE_VEC_DEBUG)

  /* Debug-only stream inserter; each byte is widened to short so it
     prints as a number rather than a character.  Lanes high-to-low. */
  friend std::ostream& operator << (std::ostream &os,const Is8vec16 &a)
  {
    os << "[15]:" << short(_MM_16B(15,a))
      << " [14]:" << short(_MM_16B(14,a))
      << " [13]:" << short(_MM_16B(13,a))
      << " [12]:" << short(_MM_16B(12,a))
      << " [11]:" << short(_MM_16B(11,a))
      << " [10]:" << short(_MM_16B(10,a))
      << " [9]:" << short(_MM_16B(9,a))
      << " [8]:" << short(_MM_16B(8,a))
      << " [7]:" << short(_MM_16B(7,a))
      << " [6]:" << short(_MM_16B(6,a))
      << " [5]:" << short(_MM_16B(5,a))
      << " [4]:" << short(_MM_16B(4,a))
      << " [3]:" << short(_MM_16B(3,a))
      << " [2]:" << short(_MM_16B(2,a))
      << " [1]:" << short(_MM_16B(1,a))
      << " [0]:" << short(_MM_16B(0,a));
    return os;
  }
#endif

  /* Assert-checked lane access; index 0 is the lowest lane. */
  const signed char& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 16);
    return _MM_16B(i,vec);
  }

  signed char& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 16);
    return _MM_16B(i,vec);
  }

};
568 
/* Signed 8-bit element-wise compares. */
inline Is8vec16 cmpeq(const Is8vec16 &x, const Is8vec16 &y)
{
  return _mm_cmpeq_epi8(x, y);
}
inline Is8vec16 cmpneq(const Is8vec16 &x, const Is8vec16 &y)
{
  return _mm_andnot_si128(_mm_cmpeq_epi8(x, y), get_mask128());
}
inline Is8vec16 cmpgt(const Is8vec16 &x, const Is8vec16 &y)
{
  return _mm_cmpgt_epi8(x, y);
}
inline Is8vec16 cmplt(const Is8vec16 &x, const Is8vec16 &y)
{
  return _mm_cmplt_epi8(x, y);
}

/* Interleave the low / high byte halves of the two vectors. */
inline Is8vec16 unpack_low(const Is8vec16 &x, const Is8vec16 &y)
{
  return _mm_unpacklo_epi8(x, y);
}
inline Is8vec16 unpack_high(const Is8vec16 &x, const Is8vec16 &y)
{
  return _mm_unpackhi_epi8(x, y);
}

/* Saturating signed add / subtract. */
inline Is8vec16 sat_add(const Is8vec16 &x, const Is8vec16 &y)
{
  return _mm_adds_epi8(x, y);
}
inline Is8vec16 sat_sub(const Is8vec16 &x, const Is8vec16 &y)
{
  return _mm_subs_epi8(x, y);
}
579 
580 class Iu8vec16 : public I8vec16
581 {
582 public:
Iu8vec16()583   Iu8vec16() { }
Iu8vec16(__m128i mm)584   Iu8vec16(__m128i mm) : I8vec16(mm) { }
585 
586   Iu8vec16& operator= (const M128 &a) { return *this = (Iu8vec16) a; }
587 
588   Iu8vec16& operator&=(const M128 &a) { return *this = (Iu8vec16) _mm_and_si128(vec,a); }
589   Iu8vec16& operator|=(const M128 &a) { return *this = (Iu8vec16) _mm_or_si128(vec,a); }
590   Iu8vec16& operator^=(const M128 &a) { return *this = (Iu8vec16) _mm_xor_si128(vec,a); }
591 
592   Iu8vec16& operator +=(const I8vec16 &a) { return *this = (Iu8vec16) _mm_add_epi8(vec,a); }
593   Iu8vec16& operator -=(const I8vec16 &a) { return *this = (Iu8vec16) _mm_sub_epi8(vec,a); }
594 
595 #if defined(_ENABLE_VEC_DEBUG)
596 
597   friend std::ostream& operator << (std::ostream &os,const Iu8vec16 &a)
598   {
599     os << "[15]:" << unsigned short(_MM_16UB(15,a))
600       << " [14]:" << unsigned short(_MM_16UB(14,a))
601       << " [13]:" << unsigned short(_MM_16UB(13,a))
602       << " [12]:" << unsigned short(_MM_16UB(12,a))
603       << " [11]:" << unsigned short(_MM_16UB(11,a))
604       << " [10]:" << unsigned short(_MM_16UB(10,a))
605       << " [9]:" << unsigned short(_MM_16UB(9,a))
606       << " [8]:" << unsigned short(_MM_16UB(8,a))
607       << " [7]:" << unsigned short(_MM_16UB(7,a))
608       << " [6]:" << unsigned short(_MM_16UB(6,a))
609       << " [5]:" << unsigned short(_MM_16UB(5,a))
610       << " [4]:" << unsigned short(_MM_16UB(4,a))
611       << " [3]:" << unsigned short(_MM_16UB(3,a))
612       << " [2]:" << unsigned short(_MM_16UB(2,a))
613       << " [1]:" << unsigned short(_MM_16UB(1,a))
614       << " [0]:" << unsigned short(_MM_16UB(0,a));
615     return os;
616   }
617 #endif
618 
619   const unsigned char& operator[](int i)const
620   {
621     assert(static_cast<unsigned int>(i) < 16);
622     return _MM_16UB(i,vec);
623   }
624 
625   unsigned char& operator[](int i)
626   {
627     assert(static_cast<unsigned int>(i) < 16);
628     return _MM_16UB(i,vec);
629   }
630 
631 };
632 
cmpeq(const Iu8vec16 & a,const Iu8vec16 & b)633 inline Iu8vec16 cmpeq(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
cmpneq(const Iu8vec16 & a,const Iu8vec16 & b)634 inline Iu8vec16 cmpneq(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b),get_mask128()); }
635 
unpack_low(const Iu8vec16 & a,const Iu8vec16 & b)636 inline Iu8vec16 unpack_low(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
unpack_high(const Iu8vec16 & a,const Iu8vec16 & b)637 inline Iu8vec16 unpack_high(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_unpackhi_epi8(a,b); }
638 
sat_add(const Iu8vec16 & a,const Iu8vec16 & b)639 inline Iu8vec16 sat_add(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_adds_epu8(a,b); }
sat_sub(const Iu8vec16 & a,const Iu8vec16 & b)640 inline Iu8vec16 sat_sub(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_subs_epu8(a,b); }
641 
sum_abs(const Iu8vec16 & a,const Iu8vec16 & b)642 inline I64vec2 sum_abs(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_sad_epu8(a,b); }
643 
simd_avg(const Iu8vec16 & a,const Iu8vec16 & b)644 inline Iu8vec16 simd_avg(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_avg_epu8(a,b); }
simd_max(const Iu8vec16 & a,const Iu8vec16 & b)645 inline Iu8vec16 simd_max(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_max_epu8(a,b); }
simd_min(const Iu8vec16 & a,const Iu8vec16 & b)646 inline Iu8vec16 simd_min(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_min_epu8(a,b); }
647 
pack_sat(const Is32vec4 & a,const Is32vec4 & b)648 inline Is16vec8 pack_sat(const Is32vec4 &a,const Is32vec4 &b) { return _mm_packs_epi32(a,b); }
pack_sat(const Is16vec8 & a,const Is16vec8 & b)649 inline Is8vec16 pack_sat(const Is16vec8 &a,const Is16vec8 &b) { return _mm_packs_epi16(a,b); }
packu_sat(const Is16vec8 & a,const Is16vec8 & b)650 inline Iu8vec16 packu_sat(const Is16vec8 &a,const Is16vec8 &b) { return _mm_packus_epi16(a,b);}
651 
/* IVEC128_LOGICALS(vect,element): emits the bitwise operators &, | and ^
   plus the free function andnot() for the class I<vect>vec<element>.
   Bitwise operations are width-agnostic, so every class maps onto the
   same full-register SSE2 intrinsics (_mm_and_si128 and friends). */
#define IVEC128_LOGICALS(vect,element) inline I##vect##vec##element operator& (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_and_si128(a,b); } inline I##vect##vec##element operator| (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_or_si128(a,b); } inline I##vect##vec##element operator^ (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_xor_si128(a,b); } inline I##vect##vec##element andnot (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_andnot_si128(a,b); }

/* Instantiate the logical operators for every 128-bit integer vector class. */
IVEC128_LOGICALS(8,16)
IVEC128_LOGICALS(u8,16)
IVEC128_LOGICALS(s8,16)
IVEC128_LOGICALS(16,8)
IVEC128_LOGICALS(u16,8)
IVEC128_LOGICALS(s16,8)
IVEC128_LOGICALS(32,4)
IVEC128_LOGICALS(u32,4)
IVEC128_LOGICALS(s32,4)
IVEC128_LOGICALS(64,2)
IVEC128_LOGICALS(128,1)
#undef IVEC128_LOGICALS
666 
/* IVEC128_ADD_SUB(vect,element,opsize): emits wrapping (modular) + and -
   operators for I<vect>vec<element>, lowered onto the element-width
   intrinsics _mm_add_<opsize> / _mm_sub_<opsize>. */
#define IVEC128_ADD_SUB(vect,element,opsize) inline I##vect##vec##element operator+ (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_add_##opsize(a,b); } inline I##vect##vec##element operator- (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_sub_##opsize(a,b); }

/* Instantiations per element width.  Note: no arithmetic is generated for
   I128vec1, since SSE2 has no 128-bit-wide add/sub. */
IVEC128_ADD_SUB(8,16,epi8)
IVEC128_ADD_SUB(u8,16,epi8)
IVEC128_ADD_SUB(s8,16,epi8)
IVEC128_ADD_SUB(16,8,epi16)
IVEC128_ADD_SUB(u16,8,epi16)
IVEC128_ADD_SUB(s16,8,epi16)
IVEC128_ADD_SUB(32,4,epi32)
IVEC128_ADD_SUB(u32,4,epi32)
IVEC128_ADD_SUB(s32,4,epi32)
IVEC128_ADD_SUB(64,2,epi64)
#undef IVEC128_ADD_SUB
680 
/* IVEC128_SELECT(vect12,vect34,element,selop,arg1,arg2): emits
   select_<selop>(a,b,c,d), which compares a against b with cmp<selop>
   and then, per element, yields c where the comparison is true and d
   where it is false (classic mask-and-merge:
   (mask & c) | andnot(mask, d)).  a,b have type I<vect12>vec<element>;
   c,d and the result have type I<vect34>vec<element>.  arg1/arg2 are
   always instantiated as c,d below. */
#define IVEC128_SELECT(vect12,vect34,element,selop,arg1,arg2) inline I##vect34##vec##element select_##selop (const I##vect12##vec##element &a,const I##vect12##vec##element &b,const I##vect34##vec##element &c,const I##vect34##vec##element &d) { I##vect12##vec##element mask = cmp##selop(a,b); return(I##vect34##vec##element ((mask & arg1) | I##vect12##vec##element ((_mm_andnot_si128(mask,arg2))))); }
/* 8-bit eq/neq selects (signed, unsigned and plain result flavors). */
IVEC128_SELECT(8,s8,16,eq,c,d)
IVEC128_SELECT(8,u8,16,eq,c,d)
IVEC128_SELECT(8,8,16,eq,c,d)
IVEC128_SELECT(8,s8,16,neq,c,d)
IVEC128_SELECT(8,u8,16,neq,c,d)
IVEC128_SELECT(8,8,16,neq,c,d)

/* 16-bit eq/neq selects. */
IVEC128_SELECT(16,s16,8,eq,c,d)
IVEC128_SELECT(16,u16,8,eq,c,d)
IVEC128_SELECT(16,16,8,eq,c,d)
IVEC128_SELECT(16,s16,8,neq,c,d)
IVEC128_SELECT(16,u16,8,neq,c,d)
IVEC128_SELECT(16,16,8,neq,c,d)

/* 32-bit eq/neq selects. */
IVEC128_SELECT(32,s32,4,eq,c,d)
IVEC128_SELECT(32,u32,4,eq,c,d)
IVEC128_SELECT(32,32,4,eq,c,d)
IVEC128_SELECT(32,s32,4,neq,c,d)
IVEC128_SELECT(32,u32,4,neq,c,d)
IVEC128_SELECT(32,32,4,neq,c,d)

/* Ordered (gt/lt) selects exist only for the signed compare classes,
   since SSE2 provides only signed integer compares. */
IVEC128_SELECT(s8,s8,16,gt,c,d)
IVEC128_SELECT(s8,u8,16,gt,c,d)
IVEC128_SELECT(s8,8,16,gt,c,d)
IVEC128_SELECT(s8,s8,16,lt,c,d)
IVEC128_SELECT(s8,u8,16,lt,c,d)
IVEC128_SELECT(s8,8,16,lt,c,d)

IVEC128_SELECT(s16,s16,8,gt,c,d)
IVEC128_SELECT(s16,u16,8,gt,c,d)
IVEC128_SELECT(s16,16,8,gt,c,d)
IVEC128_SELECT(s16,s16,8,lt,c,d)
IVEC128_SELECT(s16,u16,8,lt,c,d)
IVEC128_SELECT(s16,16,8,lt,c,d)

#undef IVEC128_SELECT
718 
/* F64vec2: two packed double-precision floats in one __m128d register.
   Element [1] is the high half of the register, element [0] the low half.
   Thin C++ veneer over the SSE2 *_pd / *_sd intrinsics.

   Fixes relative to the previous revision: the compound-assignment
   operators and add_horizontal() now take their (read-only) operand by
   const reference.  The old mutable-reference signatures never modified
   the argument, yet rejected const lvalues and rvalue temporaries such
   as `v += F64vec2(1.0, 2.0)`.  Accepting const& is a strict widening,
   so all existing callers still compile. */
class F64vec2
{
protected:
  __m128d vec;	/* the wrapped 128-bit register value */
public:

  /* Default constructor: contents left uninitialized. */
  F64vec2() {}

  /* Wrap a raw __m128d (implicit, so intrinsic results convert freely). */
  F64vec2(__m128d m) { vec = m;}

  /* Build [d1, d0]: d1 becomes element 1 (high), d0 element 0 (low). */
  F64vec2(double d1,double d0) { vec= _mm_set_pd(d1,d0); }

  /* Broadcast one scalar into both elements.  (Written `explicit`
     directly: the EXPLICIT macro is defined unconditionally as
     `explicit` earlier in this header.) */
  explicit F64vec2(double d) { vec = _mm_set1_pd(d); }

  /* Implicit conversion back to the raw intrinsic type. */
  operator __m128d() const { return vec; }

  /* Bitwise logicals, operating on the raw bit patterns of the doubles. */
  friend F64vec2 operator &(const F64vec2 &a,const F64vec2 &b) { return _mm_and_pd(a,b); }
  friend F64vec2 operator |(const F64vec2 &a,const F64vec2 &b) { return _mm_or_pd(a,b); }
  friend F64vec2 operator ^(const F64vec2 &a,const F64vec2 &b) { return _mm_xor_pd(a,b); }

  /* Element-wise arithmetic. */
  friend F64vec2 operator +(const F64vec2 &a,const F64vec2 &b) { return _mm_add_pd(a,b); }
  friend F64vec2 operator -(const F64vec2 &a,const F64vec2 &b) { return _mm_sub_pd(a,b); }
  friend F64vec2 operator *(const F64vec2 &a,const F64vec2 &b) { return _mm_mul_pd(a,b); }
  friend F64vec2 operator /(const F64vec2 &a,const F64vec2 &b) { return _mm_div_pd(a,b); }

  /* Compound assignment; the right-hand side is only read (const&). */
  F64vec2& operator +=(const F64vec2 &a) { return *this = _mm_add_pd(vec,a); }
  F64vec2& operator -=(const F64vec2 &a) { return *this = _mm_sub_pd(vec,a); }
  F64vec2& operator *=(const F64vec2 &a) { return *this = _mm_mul_pd(vec,a); }
  F64vec2& operator /=(const F64vec2 &a) { return *this = _mm_div_pd(vec,a); }
  F64vec2& operator &=(const F64vec2 &a) { return *this = _mm_and_pd(vec,a); }
  F64vec2& operator |=(const F64vec2 &a) { return *this = _mm_or_pd(vec,a); }
  F64vec2& operator ^=(const F64vec2 &a) { return *this = _mm_xor_pd(vec,a); }

  /* Horizontal sum: returns a[0] + a[1] as a scalar.  The shuffle swaps
     the two elements so the scalar add sees both in the low lane. */
  friend double add_horizontal(const F64vec2 &a)
  {
    F64vec2 ftemp = _mm_add_sd(a,_mm_shuffle_pd(a,a,1));
    return ftemp[0];
  }

  /* (~a) & b on the raw bit patterns (note: first operand is inverted). */
  friend F64vec2 andnot(const F64vec2 &a,const F64vec2 &b) { return _mm_andnot_pd(a,b); }

  /* Element-wise square root. */
  friend F64vec2 sqrt(const F64vec2 &a) { return _mm_sqrt_pd(a); }

  /* cmp<op>(a,b): element-wise compare yielding all-ones / all-zeros
     masks, suitable for the select_* helpers below. */
#define F64vec2_COMP(op) friend F64vec2 cmp##op (const F64vec2 &a,const F64vec2 &b) { return _mm_cmp##op##_pd(a,b); }
  F64vec2_COMP(eq)
  F64vec2_COMP(lt)
  F64vec2_COMP(le)
  F64vec2_COMP(gt)
  F64vec2_COMP(ge)
  F64vec2_COMP(ngt)
  F64vec2_COMP(nge)
  F64vec2_COMP(neq)
  F64vec2_COMP(nlt)
  F64vec2_COMP(nle)
#undef F64vec2_COMP

  /* Element-wise minimum / maximum. */
  friend F64vec2 simd_min(const F64vec2 &a,const F64vec2 &b) { return _mm_min_pd(a,b); }
  friend F64vec2 simd_max(const F64vec2 &a,const F64vec2 &b) { return _mm_max_pd(a,b); }

  /* comi<op>(a,b): ordered scalar compare of the LOW elements, returns
     0 or 1 (raises on QNaN operands, per the COMISD semantics). */
#define F64vec2_COMI(op) friend int comi##op (const F64vec2 &a,const F64vec2 &b) { return _mm_comi##op##_sd(a,b); }
  F64vec2_COMI(eq)
  F64vec2_COMI(lt)
  F64vec2_COMI(le)
  F64vec2_COMI(gt)
  F64vec2_COMI(ge)
  F64vec2_COMI(neq)
#undef F64vec2_COMI

  /* ucomi<op>(a,b): unordered (quiet on QNaN) scalar compare of the LOW
     elements, returns 0 or 1. */
#define F64vec2_UCOMI(op) friend int ucomi##op (const F64vec2 &a,const F64vec2 &b) { return _mm_ucomi##op##_sd(a,b); }
  F64vec2_UCOMI(eq)
  F64vec2_UCOMI(lt)
  F64vec2_UCOMI(le)
  F64vec2_UCOMI(gt)
  F64vec2_UCOMI(ge)
  F64vec2_UCOMI(neq)
#undef F64vec2_UCOMI

#if defined(_ENABLE_VEC_DEBUG)

  /* Debug stream output: prints element [1] then element [0]. */
  friend std::ostream & operator<<(std::ostream & os,const F64vec2 &a) {
    double *dp = (double*)&a;
    os << " [1]:" << *(dp+1)
      << " [0]:" << *dp;
    return os;
  }
#endif

  /* Read-only element access; i must be 0 or 1 (asserted in debug builds). */
  const double &operator[](int i) const {
    assert((0 <= i) && (i <= 1));
    double *dp = (double*)&vec;
    return *(dp+i);
  }

  /* Mutable element access; i must be 0 or 1 (asserted in debug builds). */
  double &operator[](int i) {
    assert((0 <= i) && (i <= 1));
    double *dp = (double*)&vec;
    return *(dp+i);
  }
};
818 
unpack_low(const F64vec2 & a,const F64vec2 & b)819 inline F64vec2 unpack_low(const F64vec2 &a,const F64vec2 &b) { return _mm_unpacklo_pd(a,b); }
unpack_high(const F64vec2 & a,const F64vec2 & b)820 inline F64vec2 unpack_high(const F64vec2 &a,const F64vec2 &b) { return _mm_unpackhi_pd(a,b); }
move_mask(const F64vec2 & a)821 inline int move_mask(const F64vec2 &a) { return _mm_movemask_pd(a); }
loadu(F64vec2 & a,double * p)822 inline void loadu(F64vec2 &a,double *p) { a = _mm_loadu_pd(p); }
storeu(double * p,const F64vec2 & a)823 inline void storeu(double *p,const F64vec2 &a) { _mm_storeu_pd(p,a); }
store_nta(double * p,F64vec2 & a)824 inline void store_nta(double *p,F64vec2 &a) { _mm_stream_pd(p,a); }
825 
/* F64vec2_SELECT(op): emits select_<op>(a,b,c,d), which compares a and b
   with _mm_cmp<op>_pd and returns, per element, c where the comparison
   is true and d where it is false (mask-and-merge on the double lanes). */
#define F64vec2_SELECT(op) inline F64vec2 select_##op (const F64vec2 &a,const F64vec2 &b,const F64vec2 &c,const F64vec2 &d) { F64vec2 mask = _mm_cmp##op##_pd(a,b); return((mask & c) | F64vec2((_mm_andnot_pd(mask,d)))); }
F64vec2_SELECT(eq)
F64vec2_SELECT(lt)
F64vec2_SELECT(le)
F64vec2_SELECT(gt)
F64vec2_SELECT(ge)
F64vec2_SELECT(neq)
F64vec2_SELECT(nlt)
F64vec2_SELECT(nle)
#undef F64vec2_SELECT
836 
/* Truncating conversion of the LOW double of a to a 32-bit int (cvttsd). */
inline int F64vec2ToInt(const F64vec2 &a) { return _mm_cvttsd_si32(a); }
/* Convert the two low floats of a to two doubles. */
inline F64vec2 F32vec4ToF64vec2(const F32vec4 &a) { return _mm_cvtps_pd(a); }
/* Convert both doubles of a to floats in the low half of the result;
   the upper two float lanes are zeroed. */
inline F32vec4 F64vec2ToF32vec4(const F64vec2 &a) { return _mm_cvtpd_ps(a); }
/* Replace the LOW double of a with (double)b; the high double of a is kept. */
inline F64vec2 IntToF64vec2(const F64vec2 &a,int b) { return _mm_cvtsi32_sd(a,b); }
841 
842 #pragma pack(pop)
843 
844 #endif /* ifdef __SSE__ */
845 
846 #pragma pack(pop)
847 #endif
848 #endif
849