1 /**
2 * This file has no copyright assigned and is placed in the Public Domain.
3 * This file is part of the mingw-w64 runtime package.
4 * No warranty is given; refer to the file DISCLAIMER.PD within this package.
5 */
6 #ifndef _DVEC_H_INCLUDED
7 #define _DVEC_H_INCLUDED
8 #ifndef RC_INVOKED
9
10 #if !defined __cplusplus
11 #error This file is only supported in C++ compilations!
12 #endif
13
14 #include <intrin.h>
15 #include <assert.h>
16 #include <fvec.h>
17 #include <crtdefs.h>
18
19 #pragma pack(push,_CRT_PACKING)
20
21 #if defined(_ENABLE_VEC_DEBUG)
22 #include <iostream>
23 #endif
24
25 #ifdef __SSE__
26
27 #pragma pack(push,16)
28
/* EXPLICIT follows the Intel reference dvec.h, which can configure whether
   certain constructors are explicit; here it always expands to `explicit`. */
#define EXPLICIT explicit

/* Forward declarations of the 128-bit integer vector classes defined below. */
class I8vec16;
class Is8vec16;
class Iu8vec16;
class I16vec8;
class Is16vec8;
class Iu16vec8;
class I32vec4;
class Is32vec4;
class Iu32vec4;
class I64vec2;
class I128vec1;

/* Element-access helpers: reinterpret a 128-bit vector as an array of N
   elements and yield an lvalue reference to element `element` (element 0 is
   the least significant).  Naming: _MM_<count><U for unsigned><B byte,
   W word, DW dword, QW qword>. */
#define _MM_16UB(element,vector) (*((unsigned char*)&(vector) + (element)))
#define _MM_16B(element,vector) (*((signed char*)&(vector) + (element)))

#define _MM_8UW(element,vector) (*((unsigned short*)&(vector) + (element)))
#define _MM_8W(element,vector) (*((short*)&(vector) + (element)))

#define _MM_4UDW(element,vector) (*((unsigned int*)&(vector) + (element)))
#define _MM_4DW(element,vector) (*((int*)&(vector) + (element)))

#define _MM_2QW(element,vector) (*((__int64*)&(vector) + (element)))
53
/* Return a vector with all 128 bits set.  Used throughout this header to
   emulate the "not equal" comparisons that SSE2 lacks, as NOT(cmpeq) via
   _mm_andnot_si128.

   Improvement over the historical implementation
   (_mm_set1_epi64(M64((__int64)0xffffffffffffffffll))): _mm_set1_epi32(-1)
   yields the identical all-ones bit pattern, but is a pure SSE2 intrinsic —
   it does not go through the MMX _mm_set1_epi64 path, so it neither touches
   the x87/MMX register state (no _mm_empty() concerns) nor depends on the
   M64 helper from <fvec.h>.  The __MINGW_EXTENSION marker is no longer
   needed since the long-long literal is gone. */
inline const __m128i get_mask128()
{
  static const __m128i mask128 = _mm_set1_epi32(-1);
  return mask128;
}
59
/* M128: thin wrapper around the raw __m128i type.  It is the common base of
   every integer vector class in this header and supplies the implicit
   conversion back to __m128i (so wrapped values can be passed directly to
   intrinsics) plus the bitwise compound-assignment operators. */
class M128
{
protected:
  __m128i vec;  /* the underlying 128-bit value */

public:
  M128() { }
  M128(__m128i mm) { vec = mm; }

  /* Expose the raw vector; lets any M128-derived object feed an intrinsic. */
  operator __m128i() const { return vec; }

  /* Bitwise AND/OR/XOR with another 128-bit value, in place. */
  M128& operator&=(const M128 &a) { vec = _mm_and_si128(vec,a); return *this; }
  M128& operator|=(const M128 &a) { vec = _mm_or_si128(vec,a); return *this; }
  M128& operator^=(const M128 &a) { vec = _mm_xor_si128(vec,a); return *this; }

};
76
/* Non-member bitwise operators over M128 values; each forwards to the
   corresponding SSE2 intrinsic and wraps the result back into an M128. */
inline M128 operator&(const M128 &a,const M128 &b) { return M128(_mm_and_si128(a,b)); }
inline M128 operator|(const M128 &a,const M128 &b) { return M128(_mm_or_si128(a,b)); }
inline M128 operator^(const M128 &a,const M128 &b) { return M128(_mm_xor_si128(a,b)); }
/* andnot(a,b) computes (~a) & b — the operand order of _mm_andnot_si128. */
inline M128 andnot(const M128 &a,const M128 &b) { return M128(_mm_andnot_si128(a,b)); }
81
/* I128vec1: the whole register viewed as a single opaque 128-bit element.
   Adds nothing to M128 beyond covariant return types on the compound
   bitwise operators. */
class I128vec1 : public M128
{
public:
  I128vec1() { }
  I128vec1(__m128i mm) : M128(mm) { }

  I128vec1& operator= (const M128 &a) { return *this = (I128vec1) a; }
  I128vec1& operator&=(const M128 &a) { return *this = (I128vec1) _mm_and_si128(vec,a); }
  I128vec1& operator|=(const M128 &a) { return *this = (I128vec1) _mm_or_si128(vec,a); }
  I128vec1& operator^=(const M128 &a) { return *this = (I128vec1) _mm_xor_si128(vec,a); }

};
94
/* I64vec2: two 64-bit integer elements (signedness-agnostic).
   operator>> is a *logical* (zero-filling) shift — _mm_srl/srli_epi64 —
   since SSE2 provides no 64-bit arithmetic right shift. */
class I64vec2 : public M128
{
public:
  I64vec2() { }
  I64vec2(__m128i mm) : M128(mm) { }

  /* Build from two __m64 halves; q1 becomes the high quadword, q0 the low. */
  __MINGW_EXTENSION I64vec2(__m64 q1,__m64 q0)
  {
    _MM_2QW(0,vec) = *(__int64*)&q0;
    _MM_2QW(1,vec) = *(__int64*)&q1;
  }

  I64vec2& operator= (const M128 &a) { return *this = (I64vec2) a; }

  /* Bitwise operators, element-width independent. */
  I64vec2& operator&=(const M128 &a) { return *this = (I64vec2) _mm_and_si128(vec,a); }
  I64vec2& operator|=(const M128 &a) { return *this = (I64vec2) _mm_or_si128(vec,a); }
  I64vec2& operator^=(const M128 &a) { return *this = (I64vec2) _mm_xor_si128(vec,a); }

  /* Per-element 64-bit add/subtract (wrap-around). */
  I64vec2& operator +=(const I64vec2 &a) { return *this = (I64vec2) _mm_add_epi64(vec,a); }
  I64vec2& operator -=(const I64vec2 &a) { return *this = (I64vec2) _mm_sub_epi64(vec,a); }

  /* Shifts: the vector form uses the low 64 bits of `a` as the count. */
  I64vec2 operator<<(const I64vec2 &a) { return _mm_sll_epi64(vec,a); }
  I64vec2 operator<<(int count) { return _mm_slli_epi64(vec,count); }
  I64vec2& operator<<=(const I64vec2 &a) { return *this = (I64vec2) _mm_sll_epi64(vec,a); }
  I64vec2& operator<<=(int count) { return *this = (I64vec2) _mm_slli_epi64(vec,count); }
  I64vec2 operator>>(const I64vec2 &a) { return _mm_srl_epi64(vec,a); }
  I64vec2 operator>>(int count) { return _mm_srli_epi64(vec,count); }
  I64vec2& operator>>=(const I64vec2 &a) { return *this = (I64vec2) _mm_srl_epi64(vec,a); }
  I64vec2& operator>>=(int count) { return *this = (I64vec2) _mm_srli_epi64(vec,count); }

  /* Element access; index 0 is the least-significant quadword.  The bounds
     check is a debug-only assert. */
  __MINGW_EXTENSION const __int64& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 2);
    return _MM_2QW(i,vec);
  }

  __MINGW_EXTENSION __int64& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 2);
    return _MM_2QW(i,vec);
  }

};
138
/* Interleave quadwords: unpack_low yields { a[0], b[0] }, unpack_high
   yields { a[1], b[1] } (element 0 = least significant). */
inline I64vec2 unpack_low(const I64vec2 &a,const I64vec2 &b) {return _mm_unpacklo_epi64(a,b); }
inline I64vec2 unpack_high(const I64vec2 &a,const I64vec2 &b) {return _mm_unpackhi_epi64(a,b); }
141
/* I32vec4: four 32-bit integer elements, signedness-agnostic.  Only the
   operations whose result is independent of signedness live here (logic,
   add/sub, left shift); right shifts are provided by the signed/unsigned
   derived classes Is32vec4 and Iu32vec4. */
class I32vec4 : public M128
{
public:
  I32vec4() { }
  I32vec4(__m128i mm) : M128(mm) { }

  I32vec4& operator= (const M128 &a) { return *this = (I32vec4) a; }

  I32vec4& operator&=(const M128 &a) { return *this = (I32vec4) _mm_and_si128(vec,a); }
  I32vec4& operator|=(const M128 &a) { return *this = (I32vec4) _mm_or_si128(vec,a); }
  I32vec4& operator^=(const M128 &a) { return *this = (I32vec4) _mm_xor_si128(vec,a); }

  /* Per-element 32-bit add/subtract (wrap-around). */
  I32vec4& operator +=(const I32vec4 &a) { return *this = (I32vec4)_mm_add_epi32(vec,a); }
  I32vec4& operator -=(const I32vec4 &a) { return *this = (I32vec4)_mm_sub_epi32(vec,a); }

  /* Left shifts; the vector form takes the count from the low 64 bits of a. */
  I32vec4 operator<<(const I32vec4 &a) { return _mm_sll_epi32(vec,a); }
  I32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
  I32vec4& operator<<=(const I32vec4 &a) { return *this = (I32vec4)_mm_sll_epi32(vec,a); }
  I32vec4& operator<<=(int count) { return *this = (I32vec4)_mm_slli_epi32(vec,count); }

};
163
/* Per-element comparisons returning all-ones/all-zero masks.  SSE2 has no
   "not equal", so cmpneq is computed as NOT(cmpeq) via andnot with the
   all-ones mask from get_mask128(). */
inline I32vec4 cmpeq(const I32vec4 &a,const I32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
inline I32vec4 cmpneq(const I32vec4 &a,const I32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b),get_mask128()); }

/* Interleave dwords from the low/high halves of a and b. */
inline I32vec4 unpack_low(const I32vec4 &a,const I32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
inline I32vec4 unpack_high(const I32vec4 &a,const I32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
169
/* Is32vec4: four *signed* 32-bit elements.  Right shifts are arithmetic
   (sign-extending, _mm_sra/srai_epi32). */
class Is32vec4 : public I32vec4
{
public:
  Is32vec4() { }
  Is32vec4(__m128i mm) : I32vec4(mm) { }
  /* Element-wise constructor; i0 lands in element 0 (least significant). */
  Is32vec4(int i3,int i2,int i1,int i0)
  {
    _MM_4DW(0,vec) = i0;
    _MM_4DW(1,vec) = i1;
    _MM_4DW(2,vec) = i2;
    _MM_4DW(3,vec) = i3;
  }

  Is32vec4& operator= (const M128 &a) { return *this = (Is32vec4) a; }

  Is32vec4& operator&=(const M128 &a) { return *this = (Is32vec4) _mm_and_si128(vec,a); }
  Is32vec4& operator|=(const M128 &a) { return *this = (Is32vec4) _mm_or_si128(vec,a); }
  Is32vec4& operator^=(const M128 &a) { return *this = (Is32vec4) _mm_xor_si128(vec,a); }

  Is32vec4& operator +=(const I32vec4 &a) { return *this = (Is32vec4)_mm_add_epi32(vec,a); }
  Is32vec4& operator -=(const I32vec4 &a) { return *this = (Is32vec4)_mm_sub_epi32(vec,a); }

  Is32vec4 operator<<(const M128 &a) { return _mm_sll_epi32(vec,a); }
  Is32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
  Is32vec4& operator<<=(const M128 &a) { return *this = (Is32vec4)_mm_sll_epi32(vec,a); }
  Is32vec4& operator<<=(int count) { return *this = (Is32vec4)_mm_slli_epi32(vec,count); }

  /* Arithmetic right shift (replicates the sign bit). */
  Is32vec4 operator>>(const M128 &a) { return _mm_sra_epi32(vec,a); }
  Is32vec4 operator>>(int count) { return _mm_srai_epi32(vec,count); }
  Is32vec4& operator>>=(const M128 &a) { return *this = (Is32vec4) _mm_sra_epi32(vec,a); }
  Is32vec4& operator>>=(int count) { return *this = (Is32vec4) _mm_srai_epi32(vec,count); }

#if defined(_ENABLE_VEC_DEBUG)

  /* Debug-only stream output, highest element first. */
  friend std::ostream& operator<< (std::ostream &os,const Is32vec4 &a)
  {
    os << "[3]:" << _MM_4DW(3,a)
      << " [2]:" << _MM_4DW(2,a)
      << " [1]:" << _MM_4DW(1,a)
      << " [0]:" << _MM_4DW(0,a);
    return os;
  }
#endif

  /* Element access; bounds check is a debug-only assert. */
  const int& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 4);
    return _MM_4DW(i,vec);
  }

  int& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 4);
    return _MM_4DW(i,vec);
  }
};
226
/* Signed 32-bit comparisons.  cmpneq is NOT(cmpeq); cmplt(a,b) is expressed
   as cmpgt(b,a) because SSE2 only has a greater-than instruction. */
inline Is32vec4 cmpeq(const Is32vec4 &a,const Is32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
inline Is32vec4 cmpneq(const Is32vec4 &a,const Is32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b),get_mask128()); }
inline Is32vec4 cmpgt(const Is32vec4 &a,const Is32vec4 &b) { return _mm_cmpgt_epi32(a,b); }
inline Is32vec4 cmplt(const Is32vec4 &a,const Is32vec4 &b) { return _mm_cmpgt_epi32(b,a); }

/* Interleave dwords from the low/high halves of a and b. */
inline Is32vec4 unpack_low(const Is32vec4 &a,const Is32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
inline Is32vec4 unpack_high(const Is32vec4 &a,const Is32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
234
/* Iu32vec4: four *unsigned* 32-bit elements.  Right shifts are logical
   (zero-filling, _mm_srl/srli_epi32). */
class Iu32vec4 : public I32vec4
{
public:
  Iu32vec4() { }
  Iu32vec4(__m128i mm) : I32vec4(mm) { }
  /* Element-wise constructor; ui0 lands in element 0 (least significant). */
  Iu32vec4(unsigned int ui3,unsigned int ui2,unsigned int ui1,unsigned int ui0)
  {
    _MM_4UDW(0,vec) = ui0;
    _MM_4UDW(1,vec) = ui1;
    _MM_4UDW(2,vec) = ui2;
    _MM_4UDW(3,vec) = ui3;
  }

  Iu32vec4& operator= (const M128 &a) { return *this = (Iu32vec4) a; }

  Iu32vec4& operator&=(const M128 &a) { return *this = (Iu32vec4) _mm_and_si128(vec,a); }
  Iu32vec4& operator|=(const M128 &a) { return *this = (Iu32vec4) _mm_or_si128(vec,a); }
  Iu32vec4& operator^=(const M128 &a) { return *this = (Iu32vec4) _mm_xor_si128(vec,a); }

  Iu32vec4& operator +=(const I32vec4 &a) { return *this = (Iu32vec4)_mm_add_epi32(vec,a); }
  Iu32vec4& operator -=(const I32vec4 &a) { return *this = (Iu32vec4)_mm_sub_epi32(vec,a); }

  Iu32vec4 operator<<(const M128 &a) { return _mm_sll_epi32(vec,a); }
  Iu32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
  Iu32vec4& operator<<=(const M128 &a) { return *this = (Iu32vec4)_mm_sll_epi32(vec,a); }
  Iu32vec4& operator<<=(int count) { return *this = (Iu32vec4)_mm_slli_epi32(vec,count); }
  /* Logical right shift (zero fill). */
  Iu32vec4 operator>>(const M128 &a) { return _mm_srl_epi32(vec,a); }
  Iu32vec4 operator>>(int count) { return _mm_srli_epi32(vec,count); }
  Iu32vec4& operator>>=(const M128 &a) { return *this = (Iu32vec4) _mm_srl_epi32(vec,a); }
  Iu32vec4& operator>>=(int count) { return *this = (Iu32vec4) _mm_srli_epi32(vec,count); }

#if defined(_ENABLE_VEC_DEBUG)

  /* Debug-only stream output, highest element first. */
  friend std::ostream& operator<< (std::ostream &os,const Iu32vec4 &a)
  {
    os << "[3]:" << _MM_4UDW(3,a)
      << " [2]:" << _MM_4UDW(2,a)
      << " [1]:" << _MM_4UDW(1,a)
      << " [0]:" << _MM_4UDW(0,a);
    return os;
  }
#endif

  /* Element access; bounds check is a debug-only assert. */
  const unsigned int& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 4);
    return _MM_4UDW(i,vec);
  }

  unsigned int& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 4);
    return _MM_4UDW(i,vec);
  }
};
290
/* Widening multiply: _mm_mul_epu32 multiplies only the even elements
   (a[0]*b[0], a[2]*b[2]) and returns the two unsigned 64-bit products,
   hence the I64vec2 result type. */
inline I64vec2 operator*(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_mul_epu32(a,b); }
/* Equality comparisons are signedness-independent; cmpneq = NOT(cmpeq). */
inline Iu32vec4 cmpeq(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
inline Iu32vec4 cmpneq(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b),get_mask128()); }

/* Interleave dwords from the low/high halves of a and b. */
inline Iu32vec4 unpack_low(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
inline Iu32vec4 unpack_high(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
297
/* I16vec8: eight 16-bit integer elements, signedness-agnostic.  Multiply is
   _mm_mullo_epi16 (low 16 bits of each product, identical for signed and
   unsigned); right shifts live in the Is16vec8/Iu16vec8 derived classes. */
class I16vec8 : public M128
{
public:
  I16vec8() { }
  I16vec8(__m128i mm) : M128(mm) { }

  I16vec8& operator= (const M128 &a) { return *this = (I16vec8) a; }

  I16vec8& operator&=(const M128 &a) { return *this = (I16vec8) _mm_and_si128(vec,a); }
  I16vec8& operator|=(const M128 &a) { return *this = (I16vec8) _mm_or_si128(vec,a); }
  I16vec8& operator^=(const M128 &a) { return *this = (I16vec8) _mm_xor_si128(vec,a); }

  /* Per-element 16-bit wrap-around arithmetic. */
  I16vec8& operator +=(const I16vec8 &a) { return *this = (I16vec8) _mm_add_epi16(vec,a); }
  I16vec8& operator -=(const I16vec8 &a) { return *this = (I16vec8) _mm_sub_epi16(vec,a); }
  I16vec8& operator *=(const I16vec8 &a) { return *this = (I16vec8) _mm_mullo_epi16(vec,a); }

  I16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
  I16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
  I16vec8& operator<<=(const M128 &a) { return *this = (I16vec8)_mm_sll_epi16(vec,a); }
  I16vec8& operator<<=(int count) { return *this = (I16vec8)_mm_slli_epi16(vec,count); }

};
320
/* Low-half 16-bit multiply (signedness-independent result). */
inline I16vec8 operator*(const I16vec8 &a,const I16vec8 &b) { return _mm_mullo_epi16(a,b); }

/* Per-element equality masks; cmpneq = NOT(cmpeq). */
inline I16vec8 cmpeq(const I16vec8 &a,const I16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
inline I16vec8 cmpneq(const I16vec8 &a,const I16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b),get_mask128()); }

/* Interleave words from the low/high halves of a and b. */
inline I16vec8 unpack_low(const I16vec8 &a,const I16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
inline I16vec8 unpack_high(const I16vec8 &a,const I16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
328
/* Is16vec8: eight *signed* 16-bit elements.  Right shifts are arithmetic
   (sign-extending, _mm_sra/srai_epi16). */
class Is16vec8 : public I16vec8
{
public:
  Is16vec8() { }
  Is16vec8(__m128i mm) : I16vec8(mm) { }
  /* Element-wise constructor; s0 lands in element 0 (least significant). */
  Is16vec8(signed short s7,signed short s6,signed short s5,signed short s4,signed short s3,signed short s2,signed short s1,signed short s0)
  {
    _MM_8W(0,vec) = s0;
    _MM_8W(1,vec) = s1;
    _MM_8W(2,vec) = s2;
    _MM_8W(3,vec) = s3;
    _MM_8W(4,vec) = s4;
    _MM_8W(5,vec) = s5;
    _MM_8W(6,vec) = s6;
    _MM_8W(7,vec) = s7;
  }

  Is16vec8& operator= (const M128 &a) { return *this = (Is16vec8) a; }

  Is16vec8& operator&=(const M128 &a) { return *this = (Is16vec8) _mm_and_si128(vec,a); }
  Is16vec8& operator|=(const M128 &a) { return *this = (Is16vec8) _mm_or_si128(vec,a); }
  Is16vec8& operator^=(const M128 &a) { return *this = (Is16vec8) _mm_xor_si128(vec,a); }

  Is16vec8& operator +=(const I16vec8 &a) { return *this = (Is16vec8) _mm_add_epi16(vec,a); }
  Is16vec8& operator -=(const I16vec8 &a) { return *this = (Is16vec8) _mm_sub_epi16(vec,a); }
  Is16vec8& operator *=(const I16vec8 &a) { return *this = (Is16vec8) _mm_mullo_epi16(vec,a); }

  Is16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
  Is16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
  Is16vec8& operator<<=(const M128 &a) { return *this = (Is16vec8)_mm_sll_epi16(vec,a); }
  Is16vec8& operator<<=(int count) { return *this = (Is16vec8)_mm_slli_epi16(vec,count); }

  /* Arithmetic right shift (replicates the sign bit). */
  Is16vec8 operator>>(const M128 &a) { return _mm_sra_epi16(vec,a); }
  Is16vec8 operator>>(int count) { return _mm_srai_epi16(vec,count); }
  Is16vec8& operator>>=(const M128 &a) { return *this = (Is16vec8)_mm_sra_epi16(vec,a); }
  Is16vec8& operator>>=(int count) { return *this = (Is16vec8)_mm_srai_epi16(vec,count); }

#if defined(_ENABLE_VEC_DEBUG)

  /* Debug-only stream output, highest element first. */
  friend std::ostream& operator<< (std::ostream &os,const Is16vec8 &a)
  {
    os << "[7]:" << _MM_8W(7,a)
      << " [6]:" << _MM_8W(6,a)
      << " [5]:" << _MM_8W(5,a)
      << " [4]:" << _MM_8W(4,a)
      << " [3]:" << _MM_8W(3,a)
      << " [2]:" << _MM_8W(2,a)
      << " [1]:" << _MM_8W(1,a)
      << " [0]:" << _MM_8W(0,a);
    return os;
  }
#endif

  /* Element access; bounds check is a debug-only assert. */
  const signed short& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 8);
    return _MM_8W(i,vec);
  }

  signed short& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 8);
    return _MM_8W(i,vec);
  }
};
394
/* Low-half 16-bit multiply. */
inline Is16vec8 operator*(const Is16vec8 &a,const Is16vec8 &b) { return _mm_mullo_epi16(a,b); }

/* Signed 16-bit comparisons; cmpneq = NOT(cmpeq), cmplt(a,b) = cmpgt(b,a). */
inline Is16vec8 cmpeq(const Is16vec8 &a,const Is16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
inline Is16vec8 cmpneq(const Is16vec8 &a,const Is16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b),get_mask128()); }
inline Is16vec8 cmpgt(const Is16vec8 &a,const Is16vec8 &b) { return _mm_cmpgt_epi16(a,b); }
inline Is16vec8 cmplt(const Is16vec8 &a,const Is16vec8 &b) { return _mm_cmpgt_epi16(b,a); }

/* Interleave words from the low/high halves of a and b. */
inline Is16vec8 unpack_low(const Is16vec8 &a,const Is16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
inline Is16vec8 unpack_high(const Is16vec8 &a,const Is16vec8 &b) { return _mm_unpackhi_epi16(a,b); }

/* mul_high keeps the upper 16 bits of each signed product; mul_add
   (_mm_madd_epi16) multiplies pairwise and sums adjacent 32-bit products,
   hence the Is32vec4 result. */
inline Is16vec8 mul_high(const Is16vec8 &a,const Is16vec8 &b) { return _mm_mulhi_epi16(a,b); }
inline Is32vec4 mul_add(const Is16vec8 &a,const Is16vec8 &b) { return _mm_madd_epi16(a,b);}

/* Saturating signed add/subtract (clamped to [-32768, 32767]). */
inline Is16vec8 sat_add(const Is16vec8 &a,const Is16vec8 &b) { return _mm_adds_epi16(a,b); }
inline Is16vec8 sat_sub(const Is16vec8 &a,const Is16vec8 &b) { return _mm_subs_epi16(a,b); }

/* Per-element signed min/max. */
inline Is16vec8 simd_max(const Is16vec8 &a,const Is16vec8 &b) { return _mm_max_epi16(a,b); }
inline Is16vec8 simd_min(const Is16vec8 &a,const Is16vec8 &b) { return _mm_min_epi16(a,b); }
413
/* Iu16vec8: eight *unsigned* 16-bit elements.  Right shifts are logical
   (zero-filling, _mm_srl/srli_epi16). */
class Iu16vec8 : public I16vec8
{
public:
  Iu16vec8() { }
  Iu16vec8(__m128i mm) : I16vec8(mm) { }
  /* Element-wise constructor; s0 lands in element 0 (least significant). */
  Iu16vec8(unsigned short s7,unsigned short s6,unsigned short s5,unsigned short s4,unsigned short s3,unsigned short s2,unsigned short s1,unsigned short s0)
  {
    _MM_8UW(0,vec) = s0;
    _MM_8UW(1,vec) = s1;
    _MM_8UW(2,vec) = s2;
    _MM_8UW(3,vec) = s3;
    _MM_8UW(4,vec) = s4;
    _MM_8UW(5,vec) = s5;
    _MM_8UW(6,vec) = s6;
    _MM_8UW(7,vec) = s7;
  }

  Iu16vec8& operator= (const M128 &a) { return *this = (Iu16vec8) a; }

  Iu16vec8& operator&=(const M128 &a) { return *this = (Iu16vec8) _mm_and_si128(vec,a); }
  Iu16vec8& operator|=(const M128 &a) { return *this = (Iu16vec8) _mm_or_si128(vec,a); }
  Iu16vec8& operator^=(const M128 &a) { return *this = (Iu16vec8) _mm_xor_si128(vec,a); }

  /* Wrap-around arithmetic; *= keeps the low 16 bits of each product. */
  Iu16vec8& operator +=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_add_epi16(vec,a); }
  Iu16vec8& operator -=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_sub_epi16(vec,a); }
  Iu16vec8& operator *=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_mullo_epi16(vec,a); }

  Iu16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
  Iu16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
  Iu16vec8& operator<<=(const M128 &a) { return *this = (Iu16vec8)_mm_sll_epi16(vec,a); }
  Iu16vec8& operator<<=(int count) { return *this = (Iu16vec8)_mm_slli_epi16(vec,count); }
  /* Logical right shift (zero fill). */
  Iu16vec8 operator>>(const M128 &a) { return _mm_srl_epi16(vec,a); }
  Iu16vec8 operator>>(int count) { return _mm_srli_epi16(vec,count); }
  Iu16vec8& operator>>=(const M128 &a) { return *this = (Iu16vec8) _mm_srl_epi16(vec,a); }
  Iu16vec8& operator>>=(int count) { return *this = (Iu16vec8) _mm_srli_epi16(vec,count); }

#if defined(_ENABLE_VEC_DEBUG)

  /* Debug-only stream output, highest element first. */
  friend std::ostream& operator << (std::ostream &os,const Iu16vec8 &a)
  {
    os << "[7]:" << unsigned short(_MM_8UW(7,a))
      << " [6]:" << unsigned short(_MM_8UW(6,a))
      << " [5]:" << unsigned short(_MM_8UW(5,a))
      << " [4]:" << unsigned short(_MM_8UW(4,a))
      << " [3]:" << unsigned short(_MM_8UW(3,a))
      << " [2]:" << unsigned short(_MM_8UW(2,a))
      << " [1]:" << unsigned short(_MM_8UW(1,a))
      << " [0]:" << unsigned short(_MM_8UW(0,a));
    return os;
  }
#endif

  /* Element access; bounds check is a debug-only assert. */
  const unsigned short& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 8);
    return _MM_8UW(i,vec);
  }

  unsigned short& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 8);
    return _MM_8UW(i,vec);
  }
};
478
/* Low-half 16-bit multiply (same instruction as the signed case; the low
   16 bits of the product are signedness-independent). */
inline Iu16vec8 operator*(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_mullo_epi16(a,b); }

/* Equality comparisons; cmpneq = NOT(cmpeq). */
inline Iu16vec8 cmpeq(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
inline Iu16vec8 cmpneq(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b),get_mask128()); }

/* Interleave words from the low/high halves of a and b. */
inline Iu16vec8 unpack_low(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
inline Iu16vec8 unpack_high(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_unpackhi_epi16(a,b); }

/* Saturating unsigned add/subtract (clamped to [0, 65535]). */
inline Iu16vec8 sat_add(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_adds_epu16(a,b); }
inline Iu16vec8 sat_sub(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_subs_epu16(a,b); }

/* Rounded average; mul_high keeps the upper 16 bits of each unsigned product. */
inline Iu16vec8 simd_avg(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_avg_epu16(a,b); }
inline I16vec8 mul_high(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_mulhi_epu16(a,b); }
492
/* I8vec16: sixteen 8-bit integer elements, signedness-agnostic.  Only
   logical ops and wrap-around add/sub are provided — SSE2 has no 8-bit
   shifts or multiplies. */
class I8vec16 : public M128
{
public:
  I8vec16() { }
  I8vec16(__m128i mm) : M128(mm) { }

  I8vec16& operator= (const M128 &a) { return *this = (I8vec16) a; }

  I8vec16& operator&=(const M128 &a) { return *this = (I8vec16) _mm_and_si128(vec,a); }
  I8vec16& operator|=(const M128 &a) { return *this = (I8vec16) _mm_or_si128(vec,a); }
  I8vec16& operator^=(const M128 &a) { return *this = (I8vec16) _mm_xor_si128(vec,a); }

  I8vec16& operator +=(const I8vec16 &a) { return *this = (I8vec16) _mm_add_epi8(vec,a); }
  I8vec16& operator -=(const I8vec16 &a) { return *this = (I8vec16) _mm_sub_epi8(vec,a); }

};
509
/* Per-element byte equality masks; cmpneq = NOT(cmpeq). */
inline I8vec16 cmpeq(const I8vec16 &a,const I8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
inline I8vec16 cmpneq(const I8vec16 &a,const I8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b),get_mask128()); }

/* Interleave bytes from the low/high halves of a and b. */
inline I8vec16 unpack_low(const I8vec16 &a,const I8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
inline I8vec16 unpack_high(const I8vec16 &a,const I8vec16 &b) { return _mm_unpackhi_epi8(a,b); }
515
/* Is8vec16: sixteen *signed* 8-bit elements. */
class Is8vec16 : public I8vec16
{
public:
  Is8vec16() { }
  Is8vec16(__m128i mm) : I8vec16(mm) { }

  Is8vec16& operator= (const M128 &a) { return *this = (Is8vec16) a; }

  Is8vec16& operator&=(const M128 &a) { return *this = (Is8vec16) _mm_and_si128(vec,a); }
  Is8vec16& operator|=(const M128 &a) { return *this = (Is8vec16) _mm_or_si128(vec,a); }
  Is8vec16& operator^=(const M128 &a) { return *this = (Is8vec16) _mm_xor_si128(vec,a); }

  Is8vec16& operator +=(const I8vec16 &a) { return *this = (Is8vec16) _mm_add_epi8(vec,a); }
  Is8vec16& operator -=(const I8vec16 &a) { return *this = (Is8vec16) _mm_sub_epi8(vec,a); }

#if defined(_ENABLE_VEC_DEBUG)

  /* Debug-only stream output, highest element first; bytes are widened to
     short so they print as numbers rather than characters. */
  friend std::ostream& operator << (std::ostream &os,const Is8vec16 &a)
  {
    os << "[15]:" << short(_MM_16B(15,a))
      << " [14]:" << short(_MM_16B(14,a))
      << " [13]:" << short(_MM_16B(13,a))
      << " [12]:" << short(_MM_16B(12,a))
      << " [11]:" << short(_MM_16B(11,a))
      << " [10]:" << short(_MM_16B(10,a))
      << " [9]:" << short(_MM_16B(9,a))
      << " [8]:" << short(_MM_16B(8,a))
      << " [7]:" << short(_MM_16B(7,a))
      << " [6]:" << short(_MM_16B(6,a))
      << " [5]:" << short(_MM_16B(5,a))
      << " [4]:" << short(_MM_16B(4,a))
      << " [3]:" << short(_MM_16B(3,a))
      << " [2]:" << short(_MM_16B(2,a))
      << " [1]:" << short(_MM_16B(1,a))
      << " [0]:" << short(_MM_16B(0,a));
    return os;
  }
#endif

  /* Element access; bounds check is a debug-only assert. */
  const signed char& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 16);
    return _MM_16B(i,vec);
  }

  signed char& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 16);
    return _MM_16B(i,vec);
  }

};
568
/* Signed byte comparisons; cmpneq = NOT(cmpeq).  Note cmplt uses the
   dedicated _mm_cmplt_epi8 intrinsic here (equivalent to cmpgt with
   swapped operands). */
inline Is8vec16 cmpeq(const Is8vec16 &a,const Is8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
inline Is8vec16 cmpneq(const Is8vec16 &a,const Is8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b),get_mask128()); }
inline Is8vec16 cmpgt(const Is8vec16 &a,const Is8vec16 &b) { return _mm_cmpgt_epi8(a,b); }
inline Is8vec16 cmplt(const Is8vec16 &a,const Is8vec16 &b) { return _mm_cmplt_epi8(a,b); }

/* Interleave bytes from the low/high halves of a and b. */
inline Is8vec16 unpack_low(const Is8vec16 &a,const Is8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
inline Is8vec16 unpack_high(const Is8vec16 &a,const Is8vec16 &b) { return _mm_unpackhi_epi8(a,b); }

/* Saturating signed add/subtract (clamped to [-128, 127]). */
inline Is8vec16 sat_add(const Is8vec16 &a,const Is8vec16 &b) { return _mm_adds_epi8(a,b); }
inline Is8vec16 sat_sub(const Is8vec16 &a,const Is8vec16 &b) { return _mm_subs_epi8(a,b); }
579
/* Iu8vec16: sixteen *unsigned* 8-bit elements. */
class Iu8vec16 : public I8vec16
{
public:
  Iu8vec16() { }
  Iu8vec16(__m128i mm) : I8vec16(mm) { }

  Iu8vec16& operator= (const M128 &a) { return *this = (Iu8vec16) a; }

  Iu8vec16& operator&=(const M128 &a) { return *this = (Iu8vec16) _mm_and_si128(vec,a); }
  Iu8vec16& operator|=(const M128 &a) { return *this = (Iu8vec16) _mm_or_si128(vec,a); }
  Iu8vec16& operator^=(const M128 &a) { return *this = (Iu8vec16) _mm_xor_si128(vec,a); }

  Iu8vec16& operator +=(const I8vec16 &a) { return *this = (Iu8vec16) _mm_add_epi8(vec,a); }
  Iu8vec16& operator -=(const I8vec16 &a) { return *this = (Iu8vec16) _mm_sub_epi8(vec,a); }

#if defined(_ENABLE_VEC_DEBUG)

  /* Debug-only stream output, highest element first; bytes are widened so
     they print as numbers rather than characters. */
  friend std::ostream& operator << (std::ostream &os,const Iu8vec16 &a)
  {
    os << "[15]:" << unsigned short(_MM_16UB(15,a))
      << " [14]:" << unsigned short(_MM_16UB(14,a))
      << " [13]:" << unsigned short(_MM_16UB(13,a))
      << " [12]:" << unsigned short(_MM_16UB(12,a))
      << " [11]:" << unsigned short(_MM_16UB(11,a))
      << " [10]:" << unsigned short(_MM_16UB(10,a))
      << " [9]:" << unsigned short(_MM_16UB(9,a))
      << " [8]:" << unsigned short(_MM_16UB(8,a))
      << " [7]:" << unsigned short(_MM_16UB(7,a))
      << " [6]:" << unsigned short(_MM_16UB(6,a))
      << " [5]:" << unsigned short(_MM_16UB(5,a))
      << " [4]:" << unsigned short(_MM_16UB(4,a))
      << " [3]:" << unsigned short(_MM_16UB(3,a))
      << " [2]:" << unsigned short(_MM_16UB(2,a))
      << " [1]:" << unsigned short(_MM_16UB(1,a))
      << " [0]:" << unsigned short(_MM_16UB(0,a));
    return os;
  }
#endif

  /* Element access; bounds check is a debug-only assert. */
  const unsigned char& operator[](int i)const
  {
    assert(static_cast<unsigned int>(i) < 16);
    return _MM_16UB(i,vec);
  }

  unsigned char& operator[](int i)
  {
    assert(static_cast<unsigned int>(i) < 16);
    return _MM_16UB(i,vec);
  }

};
632
/* Unsigned byte equality comparisons; cmpneq = NOT(cmpeq). */
inline Iu8vec16 cmpeq(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
inline Iu8vec16 cmpneq(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b),get_mask128()); }

/* Interleave bytes from the low/high halves of a and b. */
inline Iu8vec16 unpack_low(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
inline Iu8vec16 unpack_high(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_unpackhi_epi8(a,b); }

/* Saturating unsigned add/subtract (clamped to [0, 255]). */
inline Iu8vec16 sat_add(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_adds_epu8(a,b); }
inline Iu8vec16 sat_sub(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_subs_epu8(a,b); }

/* Sum of absolute differences (_mm_sad_epu8): two 64-bit partial sums,
   one per 8-byte half, hence the I64vec2 result. */
inline I64vec2 sum_abs(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_sad_epu8(a,b); }

/* Rounded average and per-element unsigned min/max. */
inline Iu8vec16 simd_avg(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_avg_epu8(a,b); }
inline Iu8vec16 simd_max(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_max_epu8(a,b); }
inline Iu8vec16 simd_min(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_min_epu8(a,b); }

/* Narrowing packs with saturation: pack_sat narrows to the signed smaller
   type, packu_sat narrows signed words to unsigned bytes. */
inline Is16vec8 pack_sat(const Is32vec4 &a,const Is32vec4 &b) { return _mm_packs_epi32(a,b); }
inline Is8vec16 pack_sat(const Is16vec8 &a,const Is16vec8 &b) { return _mm_packs_epi16(a,b); }
inline Iu8vec16 packu_sat(const Is16vec8 &a,const Is16vec8 &b) { return _mm_packus_epi16(a,b);}
651
/* Generate the non-member &, |, ^ and andnot operators for each vector
   class (token-pasted as I<vect>vec<element>); all four forward to the
   width-independent 128-bit logical intrinsics. */
#define IVEC128_LOGICALS(vect,element) inline I##vect##vec##element operator& (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_and_si128(a,b); } inline I##vect##vec##element operator| (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_or_si128(a,b); } inline I##vect##vec##element operator^ (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_xor_si128(a,b); } inline I##vect##vec##element andnot (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_andnot_si128(a,b); }

IVEC128_LOGICALS(8,16)
IVEC128_LOGICALS(u8,16)
IVEC128_LOGICALS(s8,16)
IVEC128_LOGICALS(16,8)
IVEC128_LOGICALS(u16,8)
IVEC128_LOGICALS(s16,8)
IVEC128_LOGICALS(32,4)
IVEC128_LOGICALS(u32,4)
IVEC128_LOGICALS(s32,4)
IVEC128_LOGICALS(64,2)
IVEC128_LOGICALS(128,1)
#undef IVEC128_LOGICALS
666
667 #define IVEC128_ADD_SUB(vect,element,opsize) inline I##vect##vec##element operator+ (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_add_##opsize(a,b); } inline I##vect##vec##element operator- (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_sub_##opsize(a,b); }
668
669 IVEC128_ADD_SUB(8,16,epi8)
670 IVEC128_ADD_SUB(u8,16,epi8)
671 IVEC128_ADD_SUB(s8,16,epi8)
672 IVEC128_ADD_SUB(16,8,epi16)
673 IVEC128_ADD_SUB(u16,8,epi16)
674 IVEC128_ADD_SUB(s16,8,epi16)
675 IVEC128_ADD_SUB(32,4,epi32)
676 IVEC128_ADD_SUB(u32,4,epi32)
677 IVEC128_ADD_SUB(s32,4,epi32)
678 IVEC128_ADD_SUB(64,2,epi64)
679 #undef IVEC128_ADD_SUB
680
681 #define IVEC128_SELECT(vect12,vect34,element,selop,arg1,arg2) inline I##vect34##vec##element select_##selop (const I##vect12##vec##element &a,const I##vect12##vec##element &b,const I##vect34##vec##element &c,const I##vect34##vec##element &d) { I##vect12##vec##element mask = cmp##selop(a,b); return(I##vect34##vec##element ((mask & arg1) | I##vect12##vec##element ((_mm_andnot_si128(mask,arg2))))); }
682 IVEC128_SELECT(8,s8,16,eq,c,d)
683 IVEC128_SELECT(8,u8,16,eq,c,d)
684 IVEC128_SELECT(8,8,16,eq,c,d)
685 IVEC128_SELECT(8,s8,16,neq,c,d)
686 IVEC128_SELECT(8,u8,16,neq,c,d)
687 IVEC128_SELECT(8,8,16,neq,c,d)
688
689 IVEC128_SELECT(16,s16,8,eq,c,d)
690 IVEC128_SELECT(16,u16,8,eq,c,d)
691 IVEC128_SELECT(16,16,8,eq,c,d)
692 IVEC128_SELECT(16,s16,8,neq,c,d)
693 IVEC128_SELECT(16,u16,8,neq,c,d)
694 IVEC128_SELECT(16,16,8,neq,c,d)
695
696 IVEC128_SELECT(32,s32,4,eq,c,d)
697 IVEC128_SELECT(32,u32,4,eq,c,d)
698 IVEC128_SELECT(32,32,4,eq,c,d)
699 IVEC128_SELECT(32,s32,4,neq,c,d)
700 IVEC128_SELECT(32,u32,4,neq,c,d)
701 IVEC128_SELECT(32,32,4,neq,c,d)
702
703 IVEC128_SELECT(s8,s8,16,gt,c,d)
704 IVEC128_SELECT(s8,u8,16,gt,c,d)
705 IVEC128_SELECT(s8,8,16,gt,c,d)
706 IVEC128_SELECT(s8,s8,16,lt,c,d)
707 IVEC128_SELECT(s8,u8,16,lt,c,d)
708 IVEC128_SELECT(s8,8,16,lt,c,d)
709
710 IVEC128_SELECT(s16,s16,8,gt,c,d)
711 IVEC128_SELECT(s16,u16,8,gt,c,d)
712 IVEC128_SELECT(s16,16,8,gt,c,d)
713 IVEC128_SELECT(s16,s16,8,lt,c,d)
714 IVEC128_SELECT(s16,u16,8,lt,c,d)
715 IVEC128_SELECT(s16,16,8,lt,c,d)
716
717 #undef IVEC128_SELECT
718
/* F64vec2: wrapper around an __m128d holding two packed IEEE doubles (SSE2).
   Element [0] is the low double; the two-argument constructor follows
   _mm_set_pd ordering, so F64vec2(d1,d0) places d0 in element [0] and d1
   in element [1]. */
class F64vec2
{
protected:
    __m128d vec; /* underlying 128-bit register value */
public:

    /* Deliberately leaves the value uninitialized (no cost when the
       vector is about to be overwritten). */
    F64vec2() {}

    /* Implicit wrap of a raw intrinsic result. */
    F64vec2(__m128d m) { vec = m;}

    /* d1 -> element [1], d0 -> element [0] (matches _mm_set_pd). */
    F64vec2(double d1,double d0) { vec= _mm_set_pd(d1,d0); }

    /* Broadcast a single double into both elements. */
    explicit F64vec2(double d) { vec = _mm_set1_pd(d); }

    /* Implicit conversion back to the raw intrinsic type. */
    operator __m128d() const { return vec; }

    /* Bitwise operations on the raw bit patterns. */
    friend F64vec2 operator &(const F64vec2 &a,const F64vec2 &b) { return _mm_and_pd(a,b); }
    friend F64vec2 operator |(const F64vec2 &a,const F64vec2 &b) { return _mm_or_pd(a,b); }
    friend F64vec2 operator ^(const F64vec2 &a,const F64vec2 &b) { return _mm_xor_pd(a,b); }

    /* Element-wise double arithmetic. */
    friend F64vec2 operator +(const F64vec2 &a,const F64vec2 &b) { return _mm_add_pd(a,b); }
    friend F64vec2 operator -(const F64vec2 &a,const F64vec2 &b) { return _mm_sub_pd(a,b); }
    friend F64vec2 operator *(const F64vec2 &a,const F64vec2 &b) { return _mm_mul_pd(a,b); }
    friend F64vec2 operator /(const F64vec2 &a,const F64vec2 &b) { return _mm_div_pd(a,b); }

    /* Compound assignment. Fix: the parameters were non-const references,
       which rejected temporaries and const operands; const& is a
       backward-compatible widening of the accepted arguments. */
    F64vec2& operator +=(const F64vec2 &a) { return *this = _mm_add_pd(vec,a); }
    F64vec2& operator -=(const F64vec2 &a) { return *this = _mm_sub_pd(vec,a); }
    F64vec2& operator *=(const F64vec2 &a) { return *this = _mm_mul_pd(vec,a); }
    F64vec2& operator /=(const F64vec2 &a) { return *this = _mm_div_pd(vec,a); }
    F64vec2& operator &=(const F64vec2 &a) { return *this = _mm_and_pd(vec,a); }
    F64vec2& operator |=(const F64vec2 &a) { return *this = _mm_or_pd(vec,a); }
    F64vec2& operator ^=(const F64vec2 &a) { return *this = _mm_xor_pd(vec,a); }

    /* Horizontal sum: returns a[0] + a[1].  Fix: takes const& (was a
       non-const reference that rejected temporaries). */
    friend double add_horizontal(const F64vec2 &a)
    {
        F64vec2 ftemp = _mm_add_sd(a,_mm_shuffle_pd(a,a,1));
        return ftemp[0];
    }

    /* PANDN semantics: (~a) & b. */
    friend F64vec2 andnot(const F64vec2 &a,const F64vec2 &b) { return _mm_andnot_pd(a,b); }

    /* Element-wise square root. */
    friend F64vec2 sqrt(const F64vec2 &a) { return _mm_sqrt_pd(a); }

    /* cmp<op>(a,b): element-wise compare; each element becomes all-ones
       (true) or all-zeros (false), usable as a select mask. */
#define F64vec2_COMP(op) friend F64vec2 cmp##op (const F64vec2 &a,const F64vec2 &b) { return _mm_cmp##op##_pd(a,b); }
    F64vec2_COMP(eq)
    F64vec2_COMP(lt)
    F64vec2_COMP(le)
    F64vec2_COMP(gt)
    F64vec2_COMP(ge)
    F64vec2_COMP(ngt)
    F64vec2_COMP(nge)
    F64vec2_COMP(neq)
    F64vec2_COMP(nlt)
    F64vec2_COMP(nle)
#undef F64vec2_COMP

    /* Element-wise minimum/maximum. */
    friend F64vec2 simd_min(const F64vec2 &a,const F64vec2 &b) { return _mm_min_pd(a,b); }
    friend F64vec2 simd_max(const F64vec2 &a,const F64vec2 &b) { return _mm_max_pd(a,b); }

    /* comi<op>(a,b): ordered scalar compare of the LOW elements only,
       returning 0 or 1. */
#define F64vec2_COMI(op) friend int comi##op (const F64vec2 &a,const F64vec2 &b) { return _mm_comi##op##_sd(a,b); }
    F64vec2_COMI(eq)
    F64vec2_COMI(lt)
    F64vec2_COMI(le)
    F64vec2_COMI(gt)
    F64vec2_COMI(ge)
    F64vec2_COMI(neq)
#undef F64vec2_COMI

    /* ucomi<op>(a,b): like comi<op> but unordered (quiet on QNaN). */
#define F64vec2_UCOMI(op) friend int ucomi##op (const F64vec2 &a,const F64vec2 &b) { return _mm_ucomi##op##_sd(a,b); }
    F64vec2_UCOMI(eq)
    F64vec2_UCOMI(lt)
    F64vec2_UCOMI(le)
    F64vec2_UCOMI(gt)
    F64vec2_UCOMI(ge)
    F64vec2_UCOMI(neq)
#undef F64vec2_UCOMI

#if defined(_ENABLE_VEC_DEBUG)
    /* Debug pretty-printer, high element first. */
    friend std::ostream & operator<<(std::ostream & os,const F64vec2 &a) {
        const double *dp = (const double*)&a;
        os << " [1]:" << *(dp+1)
           << " [0]:" << *dp;
        return os;
    }
#endif

    /* Read-only element access; i must be 0 or 1.  Fix: no longer casts
       away constness. */
    const double &operator[](int i) const {
        assert((0 <= i) && (i <= 1));
        const double *dp = (const double*)&vec;
        return *(dp+i);
    }

    /* Mutable element access; i must be 0 or 1. */
    double &operator[](int i) {
        assert((0 <= i) && (i <= 1));
        double *dp = (double*)&vec;
        return *(dp+i);
    }
};
818
unpack_low(const F64vec2 & a,const F64vec2 & b)819 inline F64vec2 unpack_low(const F64vec2 &a,const F64vec2 &b) { return _mm_unpacklo_pd(a,b); }
unpack_high(const F64vec2 & a,const F64vec2 & b)820 inline F64vec2 unpack_high(const F64vec2 &a,const F64vec2 &b) { return _mm_unpackhi_pd(a,b); }
move_mask(const F64vec2 & a)821 inline int move_mask(const F64vec2 &a) { return _mm_movemask_pd(a); }
loadu(F64vec2 & a,double * p)822 inline void loadu(F64vec2 &a,double *p) { a = _mm_loadu_pd(p); }
storeu(double * p,const F64vec2 & a)823 inline void storeu(double *p,const F64vec2 &a) { _mm_storeu_pd(p,a); }
store_nta(double * p,F64vec2 & a)824 inline void store_nta(double *p,F64vec2 &a) { _mm_stream_pd(p,a); }
825
826 #define F64vec2_SELECT(op) inline F64vec2 select_##op (const F64vec2 &a,const F64vec2 &b,const F64vec2 &c,const F64vec2 &d) { F64vec2 mask = _mm_cmp##op##_pd(a,b); return((mask & c) | F64vec2((_mm_andnot_pd(mask,d)))); }
827 F64vec2_SELECT(eq)
F64vec2_SELECT(lt)828 F64vec2_SELECT(lt)
829 F64vec2_SELECT(le)
830 F64vec2_SELECT(gt)
831 F64vec2_SELECT(ge)
832 F64vec2_SELECT(neq)
833 F64vec2_SELECT(nlt)
834 F64vec2_SELECT(nle)
835 #undef F64vec2_SELECT
836
837 inline int F64vec2ToInt(const F64vec2 &a) { return _mm_cvttsd_si32(a); }
F32vec4ToF64vec2(const F32vec4 & a)838 inline F64vec2 F32vec4ToF64vec2(const F32vec4 &a) { return _mm_cvtps_pd(a); }
F64vec2ToF32vec4(const F64vec2 & a)839 inline F32vec4 F64vec2ToF32vec4(const F64vec2 &a) { return _mm_cvtpd_ps(a); }
IntToF64vec2(const F64vec2 & a,int b)840 inline F64vec2 IntToF64vec2(const F64vec2 &a,int b) { return _mm_cvtsi32_sd(a,b); }
841
842 #pragma pack(pop)
843
844 #endif /* ifdef __SSE__ */
845
846 #pragma pack(pop)
847 #endif
848 #endif
849