1 /**
2 * This file has no copyright assigned and is placed in the Public Domain.
3 * This file is part of the w64 mingw-runtime package.
4 * No warranty is given; refer to the file DISCLAIMER within this package.
5 */
6 #ifndef _DVEC_H_INCLUDED
7 #define _DVEC_H_INCLUDED
8 #ifndef RC_INVOKED
9
10 #if !defined __cplusplus
11 #error This file is only supported in C++ compilations!
12 #endif
13
14 #include <emmintrin.h>
15 #include <assert.h>
16 #include <fvec.h>
17 #include <corecrt.h>
18
19 #pragma pack(push,_CRT_PACKING)
20
21 #if defined(_ENABLE_VEC_DEBUG)
22 #include <iostream>
23 #endif
24
25 #pragma pack(push,16)
26
27 #define EXPLICIT explicit
28
29 class I8vec16;
30 class Is8vec16;
31 class Iu8vec16;
32 class I16vec8;
33 class Is16vec8;
34 class Iu16vec8;
35 class I32vec4;
36 class Is32vec4;
37 class Iu32vec4;
38 class I64vec2;
39 class I128vec1;
40
/* Element accessors: reinterpret the 128-bit vector as an array of N lanes
 * and yield an lvalue for lane `element` (lane 0 is least significant).
 *
 * FIX: the original used `&##vector` / `+ ##element`.  `##` pasting of `&`
 * or `+` with an identifier does not form a valid preprocessing token
 * (undefined behavior; a hard error in GCC/Clang).  Plain juxtaposition
 * expands to the same text; arguments are parenthesized for safety. */
#define _MM_16UB(element,vector) (*((unsigned char*)&(vector) + (element)))
#define _MM_16B(element,vector) (*((signed char*)&(vector) + (element)))

#define _MM_8UW(element,vector) (*((unsigned short*)&(vector) + (element)))
#define _MM_8W(element,vector) (*((short*)&(vector) + (element)))

#define _MM_4UDW(element,vector) (*((unsigned int*)&(vector) + (element)))
#define _MM_4DW(element,vector) (*((int*)&(vector) + (element)))

#define _MM_2QW(element,vector) (*((__int64*)&(vector) + (element)))
51
get_mask128()52 inline const __m128i get_mask128()
53 {
54 static const __m128i mask128 = _mm_set1_epi64(M64(0xffffffffffffffffi64));
55 return mask128;
56 }
57
/* Base wrapper around a raw __m128i value.  Derived classes reinterpret the
 * same 128 bits as lanes of various widths and signedness. */
class M128
{
protected:
  __m128i vec;  /* the underlying 128-bit register value */

public:
  M128() { }
  M128(__m128i mm) { vec = mm; }

  /* Implicit conversion so any wrapper can be handed straight to intrinsics. */
  operator __m128i() const { return vec; }

  /* Bitwise compound assignment; each maps to a single SSE2 instruction. */
  M128& operator&=(const M128 &a) { vec = _mm_and_si128(vec, a); return *this; }
  M128& operator|=(const M128 &a) { vec = _mm_or_si128(vec, a); return *this; }
  M128& operator^=(const M128 &a) { vec = _mm_xor_si128(vec, a); return *this; }
};
74
/* Binary bitwise operators on 128-bit values.
 * andnot(a,b) computes (~a) & b, matching _mm_andnot_si128. */
inline M128 operator&(const M128 &a, const M128 &b)
{
  return M128(_mm_and_si128(a, b));
}
inline M128 operator|(const M128 &a, const M128 &b)
{
  return M128(_mm_or_si128(a, b));
}
inline M128 operator^(const M128 &a, const M128 &b)
{
  return M128(_mm_xor_si128(a, b));
}
inline M128 andnot(const M128 &a, const M128 &b)
{
  return M128(_mm_andnot_si128(a, b));
}
79
80 class I128vec1 : public M128
81 {
82 public:
I128vec1()83 I128vec1() { }
I128vec1(__m128i mm)84 I128vec1(__m128i mm) : M128(mm) { }
85
86 I128vec1& operator= (const M128 &a) { return *this = (I128vec1) a; }
87 I128vec1& operator&=(const M128 &a) { return *this = (I128vec1) _mm_and_si128(vec,a); }
88 I128vec1& operator|=(const M128 &a) { return *this = (I128vec1) _mm_or_si128(vec,a); }
89 I128vec1& operator^=(const M128 &a) { return *this = (I128vec1) _mm_xor_si128(vec,a); }
90
91 };
92
93 class I64vec2 : public M128
94 {
95 public:
I64vec2()96 I64vec2() { }
I64vec2(__m128i mm)97 I64vec2(__m128i mm) : M128(mm) { }
98
I64vec2(__m64 q1,__m64 q0)99 I64vec2(__m64 q1,__m64 q0)
100 {
101 _MM_2QW(0,vec) = *(__int64*)&q0;
102 _MM_2QW(1,vec) = *(__int64*)&q1;
103 }
104
105 I64vec2& operator= (const M128 &a) { return *this = (I64vec2) a; }
106
107 I64vec2& operator&=(const M128 &a) { return *this = (I64vec2) _mm_and_si128(vec,a); }
108 I64vec2& operator|=(const M128 &a) { return *this = (I64vec2) _mm_or_si128(vec,a); }
109 I64vec2& operator^=(const M128 &a) { return *this = (I64vec2) _mm_xor_si128(vec,a); }
110
111 I64vec2& operator +=(const I64vec2 &a) { return *this = (I64vec2) _mm_add_epi64(vec,a); }
112 I64vec2& operator -=(const I64vec2 &a) { return *this = (I64vec2) _mm_sub_epi64(vec,a); }
113
114 I64vec2 operator<<(const I64vec2 &a) { return _mm_sll_epi64(vec,a); }
115 I64vec2 operator<<(int count) { return _mm_slli_epi64(vec,count); }
116 I64vec2& operator<<=(const I64vec2 &a) { return *this = (I64vec2) _mm_sll_epi64(vec,a); }
117 I64vec2& operator<<=(int count) { return *this = (I64vec2) _mm_slli_epi64(vec,count); }
118 I64vec2 operator>>(const I64vec2 &a) { return _mm_srl_epi64(vec,a); }
119 I64vec2 operator>>(int count) { return _mm_srli_epi64(vec,count); }
120 I64vec2& operator>>=(const I64vec2 &a) { return *this = (I64vec2) _mm_srl_epi64(vec,a); }
121 I64vec2& operator>>=(int count) { return *this = (I64vec2) _mm_srli_epi64(vec,count); }
122
123 const __int64& operator[](int i)const
124 {
125 assert(static_cast<unsigned int>(i) < 2);
126 return _MM_2QW(i,vec);
127 }
128
129 __int64& operator[](int i)
130 {
131 assert(static_cast<unsigned int>(i) < 2);
132 return _MM_2QW(i,vec);
133 }
134
135 };
136
unpack_low(const I64vec2 & a,const I64vec2 & b)137 inline I64vec2 unpack_low(const I64vec2 &a,const I64vec2 &b) {return _mm_unpacklo_epi64(a,b); }
unpack_high(const I64vec2 & a,const I64vec2 & b)138 inline I64vec2 unpack_high(const I64vec2 &a,const I64vec2 &b) {return _mm_unpackhi_epi64(a,b); }
139
140 class I32vec4 : public M128
141 {
142 public:
I32vec4()143 I32vec4() { }
I32vec4(__m128i mm)144 I32vec4(__m128i mm) : M128(mm) { }
145
146 I32vec4& operator= (const M128 &a) { return *this = (I32vec4) a; }
147
148 I32vec4& operator&=(const M128 &a) { return *this = (I32vec4) _mm_and_si128(vec,a); }
149 I32vec4& operator|=(const M128 &a) { return *this = (I32vec4) _mm_or_si128(vec,a); }
150 I32vec4& operator^=(const M128 &a) { return *this = (I32vec4) _mm_xor_si128(vec,a); }
151
152 I32vec4& operator +=(const I32vec4 &a) { return *this = (I32vec4)_mm_add_epi32(vec,a); }
153 I32vec4& operator -=(const I32vec4 &a) { return *this = (I32vec4)_mm_sub_epi32(vec,a); }
154
155 I32vec4 operator<<(const I32vec4 &a) { return _mm_sll_epi32(vec,a); }
156 I32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
157 I32vec4& operator<<=(const I32vec4 &a) { return *this = (I32vec4)_mm_sll_epi32(vec,a); }
158 I32vec4& operator<<=(int count) { return *this = (I32vec4)_mm_slli_epi32(vec,count); }
159
160 };
161
cmpeq(const I32vec4 & a,const I32vec4 & b)162 inline I32vec4 cmpeq(const I32vec4 &a,const I32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
cmpneq(const I32vec4 & a,const I32vec4 & b)163 inline I32vec4 cmpneq(const I32vec4 &a,const I32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b),get_mask128()); }
164
unpack_low(const I32vec4 & a,const I32vec4 & b)165 inline I32vec4 unpack_low(const I32vec4 &a,const I32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
unpack_high(const I32vec4 & a,const I32vec4 & b)166 inline I32vec4 unpack_high(const I32vec4 &a,const I32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
167
168 class Is32vec4 : public I32vec4
169 {
170 public:
Is32vec4()171 Is32vec4() { }
Is32vec4(__m128i mm)172 Is32vec4(__m128i mm) : I32vec4(mm) { }
Is32vec4(int i3,int i2,int i1,int i0)173 Is32vec4(int i3,int i2,int i1,int i0)
174 {
175 _MM_4DW(0,vec) = i0;
176 _MM_4DW(1,vec) = i1;
177 _MM_4DW(2,vec) = i2;
178 _MM_4DW(3,vec) = i3;
179 }
180
181 Is32vec4& operator= (const M128 &a) { return *this = (Is32vec4) a; }
182
183 Is32vec4& operator&=(const M128 &a) { return *this = (Is32vec4) _mm_and_si128(vec,a); }
184 Is32vec4& operator|=(const M128 &a) { return *this = (Is32vec4) _mm_or_si128(vec,a); }
185 Is32vec4& operator^=(const M128 &a) { return *this = (Is32vec4) _mm_xor_si128(vec,a); }
186
187 Is32vec4& operator +=(const I32vec4 &a) { return *this = (Is32vec4)_mm_add_epi32(vec,a); }
188 Is32vec4& operator -=(const I32vec4 &a) { return *this = (Is32vec4)_mm_sub_epi32(vec,a); }
189
190 Is32vec4 operator<<(const M128 &a) { return _mm_sll_epi32(vec,a); }
191 Is32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
192 Is32vec4& operator<<=(const M128 &a) { return *this = (Is32vec4)_mm_sll_epi32(vec,a); }
193 Is32vec4& operator<<=(int count) { return *this = (Is32vec4)_mm_slli_epi32(vec,count); }
194
195 Is32vec4 operator>>(const M128 &a) { return _mm_sra_epi32(vec,a); }
196 Is32vec4 operator>>(int count) { return _mm_srai_epi32(vec,count); }
197 Is32vec4& operator>>=(const M128 &a) { return *this = (Is32vec4) _mm_sra_epi32(vec,a); }
198 Is32vec4& operator>>=(int count) { return *this = (Is32vec4) _mm_srai_epi32(vec,count); }
199
200 #if defined(_ENABLE_VEC_DEBUG)
201
202 friend std::ostream& operator<< (std::ostream &os,const Is32vec4 &a)
203 {
204 os << "[3]:" << _MM_4DW(3,a)
205 << " [2]:" << _MM_4DW(2,a)
206 << " [1]:" << _MM_4DW(1,a)
207 << " [0]:" << _MM_4DW(0,a);
208 return os;
209 }
210 #endif
211
212 const int& operator[](int i)const
213 {
214 assert(static_cast<unsigned int>(i) < 4);
215 return _MM_4DW(i,vec);
216 }
217
218 int& operator[](int i)
219 {
220 assert(static_cast<unsigned int>(i) < 4);
221 return _MM_4DW(i,vec);
222 }
223 };
224
cmpeq(const Is32vec4 & a,const Is32vec4 & b)225 inline Is32vec4 cmpeq(const Is32vec4 &a,const Is32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
cmpneq(const Is32vec4 & a,const Is32vec4 & b)226 inline Is32vec4 cmpneq(const Is32vec4 &a,const Is32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b),get_mask128()); }
cmpgt(const Is32vec4 & a,const Is32vec4 & b)227 inline Is32vec4 cmpgt(const Is32vec4 &a,const Is32vec4 &b) { return _mm_cmpgt_epi32(a,b); }
cmplt(const Is32vec4 & a,const Is32vec4 & b)228 inline Is32vec4 cmplt(const Is32vec4 &a,const Is32vec4 &b) { return _mm_cmpgt_epi32(b,a); }
229
unpack_low(const Is32vec4 & a,const Is32vec4 & b)230 inline Is32vec4 unpack_low(const Is32vec4 &a,const Is32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
unpack_high(const Is32vec4 & a,const Is32vec4 & b)231 inline Is32vec4 unpack_high(const Is32vec4 &a,const Is32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
232
233 class Iu32vec4 : public I32vec4
234 {
235 public:
Iu32vec4()236 Iu32vec4() { }
Iu32vec4(__m128i mm)237 Iu32vec4(__m128i mm) : I32vec4(mm) { }
Iu32vec4(unsigned int ui3,unsigned int ui2,unsigned int ui1,unsigned int ui0)238 Iu32vec4(unsigned int ui3,unsigned int ui2,unsigned int ui1,unsigned int ui0)
239 {
240 _MM_4UDW(0,vec) = ui0;
241 _MM_4UDW(1,vec) = ui1;
242 _MM_4UDW(2,vec) = ui2;
243 _MM_4UDW(3,vec) = ui3;
244 }
245
246 Iu32vec4& operator= (const M128 &a) { return *this = (Iu32vec4) a; }
247
248 Iu32vec4& operator&=(const M128 &a) { return *this = (Iu32vec4) _mm_and_si128(vec,a); }
249 Iu32vec4& operator|=(const M128 &a) { return *this = (Iu32vec4) _mm_or_si128(vec,a); }
250 Iu32vec4& operator^=(const M128 &a) { return *this = (Iu32vec4) _mm_xor_si128(vec,a); }
251
252 Iu32vec4& operator +=(const I32vec4 &a) { return *this = (Iu32vec4)_mm_add_epi32(vec,a); }
253 Iu32vec4& operator -=(const I32vec4 &a) { return *this = (Iu32vec4)_mm_sub_epi32(vec,a); }
254
255 Iu32vec4 operator<<(const M128 &a) { return _mm_sll_epi32(vec,a); }
256 Iu32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
257 Iu32vec4& operator<<=(const M128 &a) { return *this = (Iu32vec4)_mm_sll_epi32(vec,a); }
258 Iu32vec4& operator<<=(int count) { return *this = (Iu32vec4)_mm_slli_epi32(vec,count); }
259 Iu32vec4 operator>>(const M128 &a) { return _mm_srl_epi32(vec,a); }
260 Iu32vec4 operator>>(int count) { return _mm_srli_epi32(vec,count); }
261 Iu32vec4& operator>>=(const M128 &a) { return *this = (Iu32vec4) _mm_srl_epi32(vec,a); }
262 Iu32vec4& operator>>=(int count) { return *this = (Iu32vec4) _mm_srli_epi32(vec,count); }
263
264 #if defined(_ENABLE_VEC_DEBUG)
265
266 friend std::ostream& operator<< (std::ostream &os,const Iu32vec4 &a)
267 {
268 os << "[3]:" << _MM_4UDW(3,a)
269 << " [2]:" << _MM_4UDW(2,a)
270 << " [1]:" << _MM_4UDW(1,a)
271 << " [0]:" << _MM_4UDW(0,a);
272 return os;
273 }
274 #endif
275
276 const unsigned int& operator[](int i)const
277 {
278 assert(static_cast<unsigned int>(i) < 4);
279 return _MM_4UDW(i,vec);
280 }
281
282 unsigned int& operator[](int i)
283 {
284 assert(static_cast<unsigned int>(i) < 4);
285 return _MM_4UDW(i,vec);
286 }
287 };
288
289 inline I64vec2 operator*(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_mul_epu32(a,b); }
cmpeq(const Iu32vec4 & a,const Iu32vec4 & b)290 inline Iu32vec4 cmpeq(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
cmpneq(const Iu32vec4 & a,const Iu32vec4 & b)291 inline Iu32vec4 cmpneq(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b),get_mask128()); }
292
unpack_low(const Iu32vec4 & a,const Iu32vec4 & b)293 inline Iu32vec4 unpack_low(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
unpack_high(const Iu32vec4 & a,const Iu32vec4 & b)294 inline Iu32vec4 unpack_high(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
295
296 class I16vec8 : public M128
297 {
298 public:
I16vec8()299 I16vec8() { }
I16vec8(__m128i mm)300 I16vec8(__m128i mm) : M128(mm) { }
301
302 I16vec8& operator= (const M128 &a) { return *this = (I16vec8) a; }
303
304 I16vec8& operator&=(const M128 &a) { return *this = (I16vec8) _mm_and_si128(vec,a); }
305 I16vec8& operator|=(const M128 &a) { return *this = (I16vec8) _mm_or_si128(vec,a); }
306 I16vec8& operator^=(const M128 &a) { return *this = (I16vec8) _mm_xor_si128(vec,a); }
307
308 I16vec8& operator +=(const I16vec8 &a) { return *this = (I16vec8) _mm_add_epi16(vec,a); }
309 I16vec8& operator -=(const I16vec8 &a) { return *this = (I16vec8) _mm_sub_epi16(vec,a); }
310 I16vec8& operator *=(const I16vec8 &a) { return *this = (I16vec8) _mm_mullo_epi16(vec,a); }
311
312 I16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
313 I16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
314 I16vec8& operator<<=(const M128 &a) { return *this = (I16vec8)_mm_sll_epi16(vec,a); }
315 I16vec8& operator<<=(int count) { return *this = (I16vec8)_mm_slli_epi16(vec,count); }
316
317 };
318
/* Lane-wise multiply keeping the low 16 bits of each product. */
inline I16vec8 operator*(const I16vec8 &a, const I16vec8 &b)
{
  return I16vec8(_mm_mullo_epi16(a, b));
}

/* Lane-wise equality / inequality masks (all-ones where true). */
inline I16vec8 cmpeq(const I16vec8 &a, const I16vec8 &b)
{
  return I16vec8(_mm_cmpeq_epi16(a, b));
}
inline I16vec8 cmpneq(const I16vec8 &a, const I16vec8 &b)
{
  return I16vec8(_mm_andnot_si128(_mm_cmpeq_epi16(a, b), get_mask128()));
}

/* Interleave the low / high 16-bit lanes of a and b. */
inline I16vec8 unpack_low(const I16vec8 &a, const I16vec8 &b)
{
  return I16vec8(_mm_unpacklo_epi16(a, b));
}
inline I16vec8 unpack_high(const I16vec8 &a, const I16vec8 &b)
{
  return I16vec8(_mm_unpackhi_epi16(a, b));
}
326
327 class Is16vec8 : public I16vec8
328 {
329 public:
Is16vec8()330 Is16vec8() { }
Is16vec8(__m128i mm)331 Is16vec8(__m128i mm) : I16vec8(mm) { }
Is16vec8(signed short s7,signed short s6,signed short s5,signed short s4,signed short s3,signed short s2,signed short s1,signed short s0)332 Is16vec8(signed short s7,signed short s6,signed short s5,signed short s4,signed short s3,signed short s2,signed short s1,signed short s0)
333 {
334 _MM_8W(0,vec) = s0;
335 _MM_8W(1,vec) = s1;
336 _MM_8W(2,vec) = s2;
337 _MM_8W(3,vec) = s3;
338 _MM_8W(4,vec) = s4;
339 _MM_8W(5,vec) = s5;
340 _MM_8W(6,vec) = s6;
341 _MM_8W(7,vec) = s7;
342 }
343
344 Is16vec8& operator= (const M128 &a) { return *this = (Is16vec8) a; }
345
346 Is16vec8& operator&=(const M128 &a) { return *this = (Is16vec8) _mm_and_si128(vec,a); }
347 Is16vec8& operator|=(const M128 &a) { return *this = (Is16vec8) _mm_or_si128(vec,a); }
348 Is16vec8& operator^=(const M128 &a) { return *this = (Is16vec8) _mm_xor_si128(vec,a); }
349
350 Is16vec8& operator +=(const I16vec8 &a) { return *this = (Is16vec8) _mm_add_epi16(vec,a); }
351 Is16vec8& operator -=(const I16vec8 &a) { return *this = (Is16vec8) _mm_sub_epi16(vec,a); }
352 Is16vec8& operator *=(const I16vec8 &a) { return *this = (Is16vec8) _mm_mullo_epi16(vec,a); }
353
354 Is16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
355 Is16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
356 Is16vec8& operator<<=(const M128 &a) { return *this = (Is16vec8)_mm_sll_epi16(vec,a); }
357 Is16vec8& operator<<=(int count) { return *this = (Is16vec8)_mm_slli_epi16(vec,count); }
358
359 Is16vec8 operator>>(const M128 &a) { return _mm_sra_epi16(vec,a); }
360 Is16vec8 operator>>(int count) { return _mm_srai_epi16(vec,count); }
361 Is16vec8& operator>>=(const M128 &a) { return *this = (Is16vec8)_mm_sra_epi16(vec,a); }
362 Is16vec8& operator>>=(int count) { return *this = (Is16vec8)_mm_srai_epi16(vec,count); }
363
364 #if defined(_ENABLE_VEC_DEBUG)
365
366 friend std::ostream& operator<< (std::ostream &os,const Is16vec8 &a)
367 {
368 os << "[7]:" << _MM_8W(7,a)
369 << " [6]:" << _MM_8W(6,a)
370 << " [5]:" << _MM_8W(5,a)
371 << " [4]:" << _MM_8W(4,a)
372 << " [3]:" << _MM_8W(3,a)
373 << " [2]:" << _MM_8W(2,a)
374 << " [1]:" << _MM_8W(1,a)
375 << " [0]:" << _MM_8W(0,a);
376 return os;
377 }
378 #endif
379
380 const signed short& operator[](int i)const
381 {
382 assert(static_cast<unsigned int>(i) < 8);
383 return _MM_8W(i,vec);
384 }
385
386 signed short& operator[](int i)
387 {
388 assert(static_cast<unsigned int>(i) < 8);
389 return _MM_8W(i,vec);
390 }
391 };
392
393 inline Is16vec8 operator*(const Is16vec8 &a,const Is16vec8 &b) { return _mm_mullo_epi16(a,b); }
394
cmpeq(const Is16vec8 & a,const Is16vec8 & b)395 inline Is16vec8 cmpeq(const Is16vec8 &a,const Is16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
cmpneq(const Is16vec8 & a,const Is16vec8 & b)396 inline Is16vec8 cmpneq(const Is16vec8 &a,const Is16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b),get_mask128()); }
cmpgt(const Is16vec8 & a,const Is16vec8 & b)397 inline Is16vec8 cmpgt(const Is16vec8 &a,const Is16vec8 &b) { return _mm_cmpgt_epi16(a,b); }
cmplt(const Is16vec8 & a,const Is16vec8 & b)398 inline Is16vec8 cmplt(const Is16vec8 &a,const Is16vec8 &b) { return _mm_cmpgt_epi16(b,a); }
399
unpack_low(const Is16vec8 & a,const Is16vec8 & b)400 inline Is16vec8 unpack_low(const Is16vec8 &a,const Is16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
unpack_high(const Is16vec8 & a,const Is16vec8 & b)401 inline Is16vec8 unpack_high(const Is16vec8 &a,const Is16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
402
mul_high(const Is16vec8 & a,const Is16vec8 & b)403 inline Is16vec8 mul_high(const Is16vec8 &a,const Is16vec8 &b) { return _mm_mulhi_epi16(a,b); }
mul_add(const Is16vec8 & a,const Is16vec8 & b)404 inline Is32vec4 mul_add(const Is16vec8 &a,const Is16vec8 &b) { return _mm_madd_epi16(a,b);}
405
sat_add(const Is16vec8 & a,const Is16vec8 & b)406 inline Is16vec8 sat_add(const Is16vec8 &a,const Is16vec8 &b) { return _mm_adds_epi16(a,b); }
sat_sub(const Is16vec8 & a,const Is16vec8 & b)407 inline Is16vec8 sat_sub(const Is16vec8 &a,const Is16vec8 &b) { return _mm_subs_epi16(a,b); }
408
simd_max(const Is16vec8 & a,const Is16vec8 & b)409 inline Is16vec8 simd_max(const Is16vec8 &a,const Is16vec8 &b) { return _mm_max_epi16(a,b); }
simd_min(const Is16vec8 & a,const Is16vec8 & b)410 inline Is16vec8 simd_min(const Is16vec8 &a,const Is16vec8 &b) { return _mm_min_epi16(a,b); }
411
412 class Iu16vec8 : public I16vec8
413 {
414 public:
Iu16vec8()415 Iu16vec8() { }
Iu16vec8(__m128i mm)416 Iu16vec8(__m128i mm) : I16vec8(mm) { }
Iu16vec8(unsigned short s7,unsigned short s6,unsigned short s5,unsigned short s4,unsigned short s3,unsigned short s2,unsigned short s1,unsigned short s0)417 Iu16vec8(unsigned short s7,unsigned short s6,unsigned short s5,unsigned short s4,unsigned short s3,unsigned short s2,unsigned short s1,unsigned short s0)
418 {
419 _MM_8UW(0,vec) = s0;
420 _MM_8UW(1,vec) = s1;
421 _MM_8UW(2,vec) = s2;
422 _MM_8UW(3,vec) = s3;
423 _MM_8UW(4,vec) = s4;
424 _MM_8UW(5,vec) = s5;
425 _MM_8UW(6,vec) = s6;
426 _MM_8UW(7,vec) = s7;
427 }
428
429 Iu16vec8& operator= (const M128 &a) { return *this = (Iu16vec8) a; }
430
431 Iu16vec8& operator&=(const M128 &a) { return *this = (Iu16vec8) _mm_and_si128(vec,a); }
432 Iu16vec8& operator|=(const M128 &a) { return *this = (Iu16vec8) _mm_or_si128(vec,a); }
433 Iu16vec8& operator^=(const M128 &a) { return *this = (Iu16vec8) _mm_xor_si128(vec,a); }
434
435 Iu16vec8& operator +=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_add_epi16(vec,a); }
436 Iu16vec8& operator -=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_sub_epi16(vec,a); }
437 Iu16vec8& operator *=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_mullo_epi16(vec,a); }
438
439 Iu16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
440 Iu16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
441 Iu16vec8& operator<<=(const M128 &a) { return *this = (Iu16vec8)_mm_sll_epi16(vec,a); }
442 Iu16vec8& operator<<=(int count) { return *this = (Iu16vec8)_mm_slli_epi16(vec,count); }
443 Iu16vec8 operator>>(const M128 &a) { return _mm_srl_epi16(vec,a); }
444 Iu16vec8 operator>>(int count) { return _mm_srli_epi16(vec,count); }
445 Iu16vec8& operator>>=(const M128 &a) { return *this = (Iu16vec8) _mm_srl_epi16(vec,a); }
446 Iu16vec8& operator>>=(int count) { return *this = (Iu16vec8) _mm_srli_epi16(vec,count); }
447
448 #if defined(_ENABLE_VEC_DEBUG)
449
450 friend std::ostream& operator << (std::ostream &os,const Iu16vec8 &a)
451 {
452 os << "[7]:" << unsigned short(_MM_8UW(7,a))
453 << " [6]:" << unsigned short(_MM_8UW(6,a))
454 << " [5]:" << unsigned short(_MM_8UW(5,a))
455 << " [4]:" << unsigned short(_MM_8UW(4,a))
456 << " [3]:" << unsigned short(_MM_8UW(3,a))
457 << " [2]:" << unsigned short(_MM_8UW(2,a))
458 << " [1]:" << unsigned short(_MM_8UW(1,a))
459 << " [0]:" << unsigned short(_MM_8UW(0,a));
460 return os;
461 }
462 #endif
463
464 const unsigned short& operator[](int i)const
465 {
466 assert(static_cast<unsigned int>(i) < 8);
467 return _MM_8UW(i,vec);
468 }
469
470 unsigned short& operator[](int i)
471 {
472 assert(static_cast<unsigned int>(i) < 8);
473 return _MM_8UW(i,vec);
474 }
475 };
476
477 inline Iu16vec8 operator*(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_mullo_epi16(a,b); }
478
cmpeq(const Iu16vec8 & a,const Iu16vec8 & b)479 inline Iu16vec8 cmpeq(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
cmpneq(const Iu16vec8 & a,const Iu16vec8 & b)480 inline Iu16vec8 cmpneq(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b),get_mask128()); }
481
unpack_low(const Iu16vec8 & a,const Iu16vec8 & b)482 inline Iu16vec8 unpack_low(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
unpack_high(const Iu16vec8 & a,const Iu16vec8 & b)483 inline Iu16vec8 unpack_high(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
484
sat_add(const Iu16vec8 & a,const Iu16vec8 & b)485 inline Iu16vec8 sat_add(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_adds_epu16(a,b); }
sat_sub(const Iu16vec8 & a,const Iu16vec8 & b)486 inline Iu16vec8 sat_sub(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_subs_epu16(a,b); }
487
simd_avg(const Iu16vec8 & a,const Iu16vec8 & b)488 inline Iu16vec8 simd_avg(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_avg_epu16(a,b); }
mul_high(const Iu16vec8 & a,const Iu16vec8 & b)489 inline I16vec8 mul_high(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_mulhi_epu16(a,b); }
490
491 class I8vec16 : public M128
492 {
493 public:
I8vec16()494 I8vec16() { }
I8vec16(__m128i mm)495 I8vec16(__m128i mm) : M128(mm) { }
496
497 I8vec16& operator= (const M128 &a) { return *this = (I8vec16) a; }
498
499 I8vec16& operator&=(const M128 &a) { return *this = (I8vec16) _mm_and_si128(vec,a); }
500 I8vec16& operator|=(const M128 &a) { return *this = (I8vec16) _mm_or_si128(vec,a); }
501 I8vec16& operator^=(const M128 &a) { return *this = (I8vec16) _mm_xor_si128(vec,a); }
502
503 I8vec16& operator +=(const I8vec16 &a) { return *this = (I8vec16) _mm_add_epi8(vec,a); }
504 I8vec16& operator -=(const I8vec16 &a) { return *this = (I8vec16) _mm_sub_epi8(vec,a); }
505
506 };
507
cmpeq(const I8vec16 & a,const I8vec16 & b)508 inline I8vec16 cmpeq(const I8vec16 &a,const I8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
cmpneq(const I8vec16 & a,const I8vec16 & b)509 inline I8vec16 cmpneq(const I8vec16 &a,const I8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b),get_mask128()); }
510
unpack_low(const I8vec16 & a,const I8vec16 & b)511 inline I8vec16 unpack_low(const I8vec16 &a,const I8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
unpack_high(const I8vec16 & a,const I8vec16 & b)512 inline I8vec16 unpack_high(const I8vec16 &a,const I8vec16 &b) { return _mm_unpackhi_epi8(a,b); }
513
514 class Is8vec16 : public I8vec16
515 {
516 public:
Is8vec16()517 Is8vec16() { }
Is8vec16(__m128i mm)518 Is8vec16(__m128i mm) : I8vec16(mm) { }
519
520 Is8vec16& operator= (const M128 &a) { return *this = (Is8vec16) a; }
521
522 Is8vec16& operator&=(const M128 &a) { return *this = (Is8vec16) _mm_and_si128(vec,a); }
523 Is8vec16& operator|=(const M128 &a) { return *this = (Is8vec16) _mm_or_si128(vec,a); }
524 Is8vec16& operator^=(const M128 &a) { return *this = (Is8vec16) _mm_xor_si128(vec,a); }
525
526 Is8vec16& operator +=(const I8vec16 &a) { return *this = (Is8vec16) _mm_add_epi8(vec,a); }
527 Is8vec16& operator -=(const I8vec16 &a) { return *this = (Is8vec16) _mm_sub_epi8(vec,a); }
528
529 #if defined(_ENABLE_VEC_DEBUG)
530
531 friend std::ostream& operator << (std::ostream &os,const Is8vec16 &a)
532 {
533 os << "[15]:" << short(_MM_16B(15,a))
534 << " [14]:" << short(_MM_16B(14,a))
535 << " [13]:" << short(_MM_16B(13,a))
536 << " [12]:" << short(_MM_16B(12,a))
537 << " [11]:" << short(_MM_16B(11,a))
538 << " [10]:" << short(_MM_16B(10,a))
539 << " [9]:" << short(_MM_16B(9,a))
540 << " [8]:" << short(_MM_16B(8,a))
541 << " [7]:" << short(_MM_16B(7,a))
542 << " [6]:" << short(_MM_16B(6,a))
543 << " [5]:" << short(_MM_16B(5,a))
544 << " [4]:" << short(_MM_16B(4,a))
545 << " [3]:" << short(_MM_16B(3,a))
546 << " [2]:" << short(_MM_16B(2,a))
547 << " [1]:" << short(_MM_16B(1,a))
548 << " [0]:" << short(_MM_16B(0,a));
549 return os;
550 }
551 #endif
552
553 const signed char& operator[](int i)const
554 {
555 assert(static_cast<unsigned int>(i) < 16);
556 return _MM_16B(i,vec);
557 }
558
559 signed char& operator[](int i)
560 {
561 assert(static_cast<unsigned int>(i) < 16);
562 return _MM_16B(i,vec);
563 }
564
565 };
566
cmpeq(const Is8vec16 & a,const Is8vec16 & b)567 inline Is8vec16 cmpeq(const Is8vec16 &a,const Is8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
cmpneq(const Is8vec16 & a,const Is8vec16 & b)568 inline Is8vec16 cmpneq(const Is8vec16 &a,const Is8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b),get_mask128()); }
cmpgt(const Is8vec16 & a,const Is8vec16 & b)569 inline Is8vec16 cmpgt(const Is8vec16 &a,const Is8vec16 &b) { return _mm_cmpgt_epi8(a,b); }
cmplt(const Is8vec16 & a,const Is8vec16 & b)570 inline Is8vec16 cmplt(const Is8vec16 &a,const Is8vec16 &b) { return _mm_cmplt_epi8(a,b); }
571
unpack_low(const Is8vec16 & a,const Is8vec16 & b)572 inline Is8vec16 unpack_low(const Is8vec16 &a,const Is8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
unpack_high(const Is8vec16 & a,const Is8vec16 & b)573 inline Is8vec16 unpack_high(const Is8vec16 &a,const Is8vec16 &b) { return _mm_unpackhi_epi8(a,b); }
574
sat_add(const Is8vec16 & a,const Is8vec16 & b)575 inline Is8vec16 sat_add(const Is8vec16 &a,const Is8vec16 &b) { return _mm_adds_epi8(a,b); }
sat_sub(const Is8vec16 & a,const Is8vec16 & b)576 inline Is8vec16 sat_sub(const Is8vec16 &a,const Is8vec16 &b) { return _mm_subs_epi8(a,b); }
577
578 class Iu8vec16 : public I8vec16
579 {
580 public:
Iu8vec16()581 Iu8vec16() { }
Iu8vec16(__m128i mm)582 Iu8vec16(__m128i mm) : I8vec16(mm) { }
583
584 Iu8vec16& operator= (const M128 &a) { return *this = (Iu8vec16) a; }
585
586 Iu8vec16& operator&=(const M128 &a) { return *this = (Iu8vec16) _mm_and_si128(vec,a); }
587 Iu8vec16& operator|=(const M128 &a) { return *this = (Iu8vec16) _mm_or_si128(vec,a); }
588 Iu8vec16& operator^=(const M128 &a) { return *this = (Iu8vec16) _mm_xor_si128(vec,a); }
589
590 Iu8vec16& operator +=(const I8vec16 &a) { return *this = (Iu8vec16) _mm_add_epi8(vec,a); }
591 Iu8vec16& operator -=(const I8vec16 &a) { return *this = (Iu8vec16) _mm_sub_epi8(vec,a); }
592
593 #if defined(_ENABLE_VEC_DEBUG)
594
595 friend std::ostream& operator << (std::ostream &os,const Iu8vec16 &a)
596 {
597 os << "[15]:" << unsigned short(_MM_16UB(15,a))
598 << " [14]:" << unsigned short(_MM_16UB(14,a))
599 << " [13]:" << unsigned short(_MM_16UB(13,a))
600 << " [12]:" << unsigned short(_MM_16UB(12,a))
601 << " [11]:" << unsigned short(_MM_16UB(11,a))
602 << " [10]:" << unsigned short(_MM_16UB(10,a))
603 << " [9]:" << unsigned short(_MM_16UB(9,a))
604 << " [8]:" << unsigned short(_MM_16UB(8,a))
605 << " [7]:" << unsigned short(_MM_16UB(7,a))
606 << " [6]:" << unsigned short(_MM_16UB(6,a))
607 << " [5]:" << unsigned short(_MM_16UB(5,a))
608 << " [4]:" << unsigned short(_MM_16UB(4,a))
609 << " [3]:" << unsigned short(_MM_16UB(3,a))
610 << " [2]:" << unsigned short(_MM_16UB(2,a))
611 << " [1]:" << unsigned short(_MM_16UB(1,a))
612 << " [0]:" << unsigned short(_MM_16UB(0,a));
613 return os;
614 }
615 #endif
616
617 const unsigned char& operator[](int i)const
618 {
619 assert(static_cast<unsigned int>(i) < 16);
620 return _MM_16UB(i,vec);
621 }
622
623 unsigned char& operator[](int i)
624 {
625 assert(static_cast<unsigned int>(i) < 16);
626 return _MM_16UB(i,vec);
627 }
628
629 };
630
cmpeq(const Iu8vec16 & a,const Iu8vec16 & b)631 inline Iu8vec16 cmpeq(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
cmpneq(const Iu8vec16 & a,const Iu8vec16 & b)632 inline Iu8vec16 cmpneq(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b),get_mask128()); }
633
unpack_low(const Iu8vec16 & a,const Iu8vec16 & b)634 inline Iu8vec16 unpack_low(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
unpack_high(const Iu8vec16 & a,const Iu8vec16 & b)635 inline Iu8vec16 unpack_high(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_unpackhi_epi8(a,b); }
636
sat_add(const Iu8vec16 & a,const Iu8vec16 & b)637 inline Iu8vec16 sat_add(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_adds_epu8(a,b); }
sat_sub(const Iu8vec16 & a,const Iu8vec16 & b)638 inline Iu8vec16 sat_sub(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_subs_epu8(a,b); }
639
sum_abs(const Iu8vec16 & a,const Iu8vec16 & b)640 inline I64vec2 sum_abs(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_sad_epu8(a,b); }
641
simd_avg(const Iu8vec16 & a,const Iu8vec16 & b)642 inline Iu8vec16 simd_avg(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_avg_epu8(a,b); }
simd_max(const Iu8vec16 & a,const Iu8vec16 & b)643 inline Iu8vec16 simd_max(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_max_epu8(a,b); }
simd_min(const Iu8vec16 & a,const Iu8vec16 & b)644 inline Iu8vec16 simd_min(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_min_epu8(a,b); }
645
pack_sat(const Is32vec4 & a,const Is32vec4 & b)646 inline Is16vec8 pack_sat(const Is32vec4 &a,const Is32vec4 &b) { return _mm_packs_epi32(a,b); }
pack_sat(const Is16vec8 & a,const Is16vec8 & b)647 inline Is8vec16 pack_sat(const Is16vec8 &a,const Is16vec8 &b) { return _mm_packs_epi16(a,b); }
packu_sat(const Is16vec8 & a,const Is16vec8 & b)648 inline Iu8vec16 packu_sat(const Is16vec8 &a,const Is16vec8 &b) { return _mm_packus_epi16(a,b);}
649
/* Generate the non-member bitwise operators (&, |, ^) and andnot() for the
 * integer vector class I<vect>vec<element>; each is a single SSE2
 * instruction on the raw 128-bit value. */
#define IVEC128_LOGICALS(vect,element) inline I##vect##vec##element operator& (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_and_si128(a,b); } inline I##vect##vec##element operator| (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_or_si128(a,b); } inline I##vect##vec##element operator^ (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_xor_si128(a,b); } inline I##vect##vec##element andnot (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_andnot_si128(a,b); }

/* Instantiate for every vector class declared above. */
IVEC128_LOGICALS(8,16)
IVEC128_LOGICALS(u8,16)
IVEC128_LOGICALS(s8,16)
IVEC128_LOGICALS(16,8)
IVEC128_LOGICALS(u16,8)
IVEC128_LOGICALS(s16,8)
IVEC128_LOGICALS(32,4)
IVEC128_LOGICALS(u32,4)
IVEC128_LOGICALS(s32,4)
IVEC128_LOGICALS(64,2)
IVEC128_LOGICALS(128,1)
#undef IVEC128_LOGICALS
664
/* Generate the non-member wrap-around operator+ and operator- for
 * I<vect>vec<element>, using the lane-width-specific intrinsic <opsize>. */
#define IVEC128_ADD_SUB(vect,element,opsize) inline I##vect##vec##element operator+ (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_add_##opsize(a,b); } inline I##vect##vec##element operator- (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_sub_##opsize(a,b); }

/* Instantiate for every lane width (no +/- for the whole-register I128vec1). */
IVEC128_ADD_SUB(8,16,epi8)
IVEC128_ADD_SUB(u8,16,epi8)
IVEC128_ADD_SUB(s8,16,epi8)
IVEC128_ADD_SUB(16,8,epi16)
IVEC128_ADD_SUB(u16,8,epi16)
IVEC128_ADD_SUB(s16,8,epi16)
IVEC128_ADD_SUB(32,4,epi32)
IVEC128_ADD_SUB(u32,4,epi32)
IVEC128_ADD_SUB(s32,4,epi32)
IVEC128_ADD_SUB(64,2,epi64)
#undef IVEC128_ADD_SUB
678
/* select_<op>(a,b,c,d): per-element conditional select.  For each lane,
 * result = (a <op> b) ? c : d.  Implemented as the classic SIMD blend:
 * mask = cmp<op>(a,b) (all-ones/all-zeros per lane), then
 * (mask & c) | (~mask & d).  vect12 is the comparison operand type,
 * vect34 the type of the selected values and the result. */
#define IVEC128_SELECT(vect12,vect34,element,selop,arg1,arg2) inline I##vect34##vec##element select_##selop (const I##vect12##vec##element &a,const I##vect12##vec##element &b,const I##vect34##vec##element &c,const I##vect34##vec##element &d) { I##vect12##vec##element mask = cmp##selop(a,b); return(I##vect34##vec##element ((mask & arg1) | I##vect12##vec##element ((_mm_andnot_si128(mask,arg2))))); }
/* eq/neq exist for all widths ... */
IVEC128_SELECT(8,s8,16,eq,c,d)
IVEC128_SELECT(8,u8,16,eq,c,d)
IVEC128_SELECT(8,8,16,eq,c,d)
IVEC128_SELECT(8,s8,16,neq,c,d)
IVEC128_SELECT(8,u8,16,neq,c,d)
IVEC128_SELECT(8,8,16,neq,c,d)

IVEC128_SELECT(16,s16,8,eq,c,d)
IVEC128_SELECT(16,u16,8,eq,c,d)
IVEC128_SELECT(16,16,8,eq,c,d)
IVEC128_SELECT(16,s16,8,neq,c,d)
IVEC128_SELECT(16,u16,8,neq,c,d)
IVEC128_SELECT(16,16,8,neq,c,d)

IVEC128_SELECT(32,s32,4,eq,c,d)
IVEC128_SELECT(32,u32,4,eq,c,d)
IVEC128_SELECT(32,32,4,eq,c,d)
IVEC128_SELECT(32,s32,4,neq,c,d)
IVEC128_SELECT(32,u32,4,neq,c,d)
IVEC128_SELECT(32,32,4,neq,c,d)

/* ... while gt/lt compare as signed, so only the s8/s16 operand types
 * get ordered selects (SSE2 has no unsigned or 64-bit compares). */
IVEC128_SELECT(s8,s8,16,gt,c,d)
IVEC128_SELECT(s8,u8,16,gt,c,d)
IVEC128_SELECT(s8,8,16,gt,c,d)
IVEC128_SELECT(s8,s8,16,lt,c,d)
IVEC128_SELECT(s8,u8,16,lt,c,d)
IVEC128_SELECT(s8,8,16,lt,c,d)

IVEC128_SELECT(s16,s16,8,gt,c,d)
IVEC128_SELECT(s16,u16,8,gt,c,d)
IVEC128_SELECT(s16,16,8,gt,c,d)
IVEC128_SELECT(s16,s16,8,lt,c,d)
IVEC128_SELECT(s16,u16,8,lt,c,d)
IVEC128_SELECT(s16,16,8,lt,c,d)

#undef IVEC128_SELECT
716
/* F64vec2: two packed double-precision floats in one XMM register,
 * wrapping the SSE2 __m128d type.  Element [0] is the low lane of the
 * register, element [1] the high lane. */
class F64vec2
{
protected:
    __m128d vec;   /* the underlying 128-bit register */
public:

    /* Default constructor leaves the register uninitialized. */
    F64vec2() {}

    /* Wrap a raw __m128d; also used implicitly by the operators below. */
    F64vec2(__m128d m) { vec = m;}

    /* Note the argument order: d0 lands in lane [0], d1 in lane [1]
     * (matching _mm_set_pd). */
    F64vec2(double d1,double d0) { vec= _mm_set_pd(d1,d0); }

    /* Broadcast one scalar into both lanes.  ('explicit' is exactly what
     * this file's EXPLICIT macro expands to.) */
    explicit F64vec2(double d) { vec = _mm_set1_pd(d); }

    operator __m128d() const { return vec; }

    /* Whole-register bitwise logicals. */
    friend F64vec2 operator &(const F64vec2 &a,const F64vec2 &b) { return _mm_and_pd(a,b); }
    friend F64vec2 operator |(const F64vec2 &a,const F64vec2 &b) { return _mm_or_pd(a,b); }
    friend F64vec2 operator ^(const F64vec2 &a,const F64vec2 &b) { return _mm_xor_pd(a,b); }

    /* Element-wise arithmetic. */
    friend F64vec2 operator +(const F64vec2 &a,const F64vec2 &b) { return _mm_add_pd(a,b); }
    friend F64vec2 operator -(const F64vec2 &a,const F64vec2 &b) { return _mm_sub_pd(a,b); }
    friend F64vec2 operator *(const F64vec2 &a,const F64vec2 &b) { return _mm_mul_pd(a,b); }
    friend F64vec2 operator /(const F64vec2 &a,const F64vec2 &b) { return _mm_div_pd(a,b); }

    /* Compound assignment.  The right-hand side is taken by const
     * reference: the previous mutable-reference signatures rejected
     * temporaries and const operands even though the argument is never
     * modified.  Existing callers are unaffected. */
    F64vec2& operator +=(const F64vec2 &a) { return *this = _mm_add_pd(vec,a); }
    F64vec2& operator -=(const F64vec2 &a) { return *this = _mm_sub_pd(vec,a); }
    F64vec2& operator *=(const F64vec2 &a) { return *this = _mm_mul_pd(vec,a); }
    F64vec2& operator /=(const F64vec2 &a) { return *this = _mm_div_pd(vec,a); }
    F64vec2& operator &=(const F64vec2 &a) { return *this = _mm_and_pd(vec,a); }
    F64vec2& operator |=(const F64vec2 &a) { return *this = _mm_or_pd(vec,a); }
    F64vec2& operator ^=(const F64vec2 &a) { return *this = _mm_xor_pd(vec,a); }

    /* Sum of both lanes: shuffle the high lane down, add it to the low
     * lane, and return the scalar result.  Const ref for the same reason
     * as the compound assignments above. */
    friend double add_horizontal(const F64vec2 &a)
    {
        F64vec2 ftemp = _mm_add_sd(a,_mm_shuffle_pd(a,a,1));
        return ftemp[0];
    }

    /* (~a) & b, per the SSE andnot convention. */
    friend F64vec2 andnot(const F64vec2 &a,const F64vec2 &b) { return _mm_andnot_pd(a,b); }

    /* Element-wise square root. */
    friend F64vec2 sqrt(const F64vec2 &a) { return _mm_sqrt_pd(a); }

    /* Element-wise comparisons: each lane becomes all-ones (true) or
     * all-zeros (false), suitable for masking. */
#define F64vec2_COMP(op) friend F64vec2 cmp##op (const F64vec2 &a,const F64vec2 &b) { return _mm_cmp##op##_pd(a,b); }
    F64vec2_COMP(eq)
    F64vec2_COMP(lt)
    F64vec2_COMP(le)
    F64vec2_COMP(gt)
    F64vec2_COMP(ge)
    F64vec2_COMP(ngt)
    F64vec2_COMP(nge)
    F64vec2_COMP(neq)
    F64vec2_COMP(nlt)
    F64vec2_COMP(nle)
#undef F64vec2_COMP

    /* Element-wise min/max. */
    friend F64vec2 simd_min(const F64vec2 &a,const F64vec2 &b) { return _mm_min_pd(a,b); }
    friend F64vec2 simd_max(const F64vec2 &a,const F64vec2 &b) { return _mm_max_pd(a,b); }

    /* Scalar (low-lane) ordered comparisons returning 0/1. */
#define F64vec2_COMI(op) friend int comi##op (const F64vec2 &a,const F64vec2 &b) { return _mm_comi##op##_sd(a,b); }
    F64vec2_COMI(eq)
    F64vec2_COMI(lt)
    F64vec2_COMI(le)
    F64vec2_COMI(gt)
    F64vec2_COMI(ge)
    F64vec2_COMI(neq)
#undef F64vec2_COMI

    /* Scalar (low-lane) unordered comparisons (no fault on QNaN). */
#define F64vec2_UCOMI(op) friend int ucomi##op (const F64vec2 &a,const F64vec2 &b) { return _mm_ucomi##op##_sd(a,b); }
    F64vec2_UCOMI(eq)
    F64vec2_UCOMI(lt)
    F64vec2_UCOMI(le)
    F64vec2_UCOMI(gt)
    F64vec2_UCOMI(ge)
    F64vec2_UCOMI(neq)
#undef F64vec2_UCOMI

#if defined(_ENABLE_VEC_DEBUG)

    /* Debug dump, high lane first. */
    friend std::ostream & operator<<(std::ostream & os,const F64vec2 &a) {
        double *dp = (double*)&a;
        os << " [1]:" << *(dp+1)
           << " [0]:" << *dp;
        return os;
    }
#endif

    /* Direct lane access; i must be 0 or 1. */
    const double &operator[](int i) const {
        assert((0 <= i) && (i <= 1));
        double *dp = (double*)&vec;
        return *(dp+i);
    }

    double &operator[](int i) {
        assert((0 <= i) && (i <= 1));
        double *dp = (double*)&vec;
        return *(dp+i);
    }
};
816
unpack_low(const F64vec2 & a,const F64vec2 & b)817 inline F64vec2 unpack_low(const F64vec2 &a,const F64vec2 &b) { return _mm_unpacklo_pd(a,b); }
unpack_high(const F64vec2 & a,const F64vec2 & b)818 inline F64vec2 unpack_high(const F64vec2 &a,const F64vec2 &b) { return _mm_unpackhi_pd(a,b); }
move_mask(const F64vec2 & a)819 inline int move_mask(const F64vec2 &a) { return _mm_movemask_pd(a); }
loadu(F64vec2 & a,double * p)820 inline void loadu(F64vec2 &a,double *p) { a = _mm_loadu_pd(p); }
storeu(double * p,const F64vec2 & a)821 inline void storeu(double *p,const F64vec2 &a) { _mm_storeu_pd(p,a); }
store_nta(double * p,F64vec2 & a)822 inline void store_nta(double *p,F64vec2 &a) { _mm_stream_pd(p,a); }
823
/* select_<op>(a,b,c,d) for doubles: per lane, (a <op> b) ? c : d,
 * built from a compare mask and the (mask & c) | (~mask & d) blend —
 * the same pattern as IVEC128_SELECT above, using the _pd ops. */
#define F64vec2_SELECT(op) inline F64vec2 select_##op (const F64vec2 &a,const F64vec2 &b,const F64vec2 &c,const F64vec2 &d) { F64vec2 mask = _mm_cmp##op##_pd(a,b); return((mask & c) | F64vec2((_mm_andnot_pd(mask,d)))); }
F64vec2_SELECT(eq)
F64vec2_SELECT(lt)
F64vec2_SELECT(le)
F64vec2_SELECT(gt)
F64vec2_SELECT(ge)
F64vec2_SELECT(neq)
F64vec2_SELECT(nlt)
F64vec2_SELECT(nle)
#undef F64vec2_SELECT
834
/* Convert the low lane to a 32-bit int, truncating toward zero (cvtt). */
inline int F64vec2ToInt(const F64vec2 &a) { return _mm_cvttsd_si32(a); }
/* Widen the two low floats of a to two doubles. */
inline F64vec2 F32vec4ToF64vec2(const F32vec4 &a) { return _mm_cvtps_pd(a); }
/* Narrow the two doubles to floats in the low half; upper half is zeroed. */
inline F32vec4 F64vec2ToF32vec4(const F64vec2 &a) { return _mm_cvtpd_ps(a); }
/* Place (double)b in the low lane; the high lane is copied from a. */
inline F64vec2 IntToF64vec2(const F64vec2 &a,int b) { return _mm_cvtsi32_sd(a,b); }
839
840 #pragma pack(pop)
841 #pragma pack(pop)
842 #endif
843 #endif
844