/*
 * Copyright 2010-2019 Branimir Karadzic. All rights reserved.
 * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
 */

#ifndef BX_SIMD_T_H_HEADER_GUARD
#	error "Must be included from bx/simd_t.h!"
#endif // BX_SIMD_T_H_HEADER_GUARD

namespace bx
{
	BX_CONST_FUNC float sqrt(float);
	BX_CONST_FUNC float rsqrt(float);

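	// The ELEM* indices and BX_SIMD128_IMPLEMENT_SWIZZLE below stamp out one
	// simd_swiz_* specialization for every lane permutation listed in
	// simd128_swizzle.inl. A rough usage sketch (assuming the wzyx permutation
	// is among those listed):
	//
	//   simd128_ref_t v = simd_ld<simd128_ref_t>(1.0f, 2.0f, 3.0f, 4.0f);
	//   simd128_ref_t r = simd_swiz_wzyx(v); // r holds {4.0f, 3.0f, 2.0f, 1.0f}
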
#define ELEMx 0
#define ELEMy 1
#define ELEMz 2
#define ELEMw 3
#define BX_SIMD128_IMPLEMENT_SWIZZLE(_x, _y, _z, _w)                                        \
			template<>                                                                      \
			BX_SIMD_FORCE_INLINE simd128_ref_t simd_swiz_##_x##_y##_z##_w(simd128_ref_t _a) \
			{                                                                               \
				simd128_ref_t result;                                                       \
				result.ixyzw[0] = _a.ixyzw[ELEM##_x];                                       \
				result.ixyzw[1] = _a.ixyzw[ELEM##_y];                                       \
				result.ixyzw[2] = _a.ixyzw[ELEM##_z];                                       \
				result.ixyzw[3] = _a.ixyzw[ELEM##_w];                                       \
				return result;                                                              \
			}

#include "simd128_swizzle.inl"

#undef BX_SIMD128_IMPLEMENT_SWIZZLE
#undef ELEMw
#undef ELEMz
#undef ELEMy
#undef ELEMx

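// BX_SIMD128_IMPLEMENT_TEST builds simd_test_any_* / simd_test_all_* from the
// sign bit of each selected lane; the mask packs lanes as x=bit 0 .. w=bit 3.
// A hedged sketch of how a compare result is typically consumed (assuming a
// and b are simd128_ref_t values already in scope):
//
//   simd128_ref_t mask = simd_cmplt(a, b);
//   if (simd_test_all_xyz(mask)) { /* a is component-wise less than b in x, y, z */ }
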
#define BX_SIMD128_IMPLEMENT_TEST(_xyzw, _mask)                                  \
			template<>                                                           \
			BX_SIMD_FORCE_INLINE bool simd_test_any_##_xyzw(simd128_ref_t _test) \
			{                                                                    \
				uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3)                        \
				             | ( (_test.uxyzw[2]>>31)<<2)                        \
				             | ( (_test.uxyzw[1]>>31)<<1)                        \
				             | (  _test.uxyzw[0]>>31)                            \
				             ;                                                   \
				return 0 != (tmp&(_mask) );                                      \
			}                                                                    \
			                                                                     \
			template<>                                                           \
			BX_SIMD_FORCE_INLINE bool simd_test_all_##_xyzw(simd128_ref_t _test) \
			{                                                                    \
				uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3)                        \
				             | ( (_test.uxyzw[2]>>31)<<2)                        \
				             | ( (_test.uxyzw[1]>>31)<<1)                        \
				             | (  _test.uxyzw[0]>>31)                            \
				             ;                                                   \
				return (_mask) == (tmp&(_mask) );                                \
			}

BX_SIMD128_IMPLEMENT_TEST(x    , 0x1);
BX_SIMD128_IMPLEMENT_TEST(y    , 0x2);
BX_SIMD128_IMPLEMENT_TEST(xy   , 0x3);
BX_SIMD128_IMPLEMENT_TEST(z    , 0x4);
BX_SIMD128_IMPLEMENT_TEST(xz   , 0x5);
BX_SIMD128_IMPLEMENT_TEST(yz   , 0x6);
BX_SIMD128_IMPLEMENT_TEST(xyz  , 0x7);
BX_SIMD128_IMPLEMENT_TEST(w    , 0x8);
BX_SIMD128_IMPLEMENT_TEST(xw   , 0x9);
BX_SIMD128_IMPLEMENT_TEST(yw   , 0xa);
BX_SIMD128_IMPLEMENT_TEST(xyw  , 0xb);
BX_SIMD128_IMPLEMENT_TEST(zw   , 0xc);
BX_SIMD128_IMPLEMENT_TEST(xzw  , 0xd);
BX_SIMD128_IMPLEMENT_TEST(yzw  , 0xe);
BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf);

#undef BX_SIMD128_IMPLEMENT_TEST

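	// Shuffle naming convention: lower-case letters pick lanes from _a and
	// upper-case letters pick the corresponding lanes from _b (A = _b.x,
	// B = _b.y, C = _b.z, D = _b.w), listed in destination order. For example,
	// simd_shuf_xAyB interleaves the low halves: {_a.x, _b.x, _a.y, _b.y}.
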
	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_xyAB(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.uxyzw[0] = _a.uxyzw[0];
		result.uxyzw[1] = _a.uxyzw[1];
		result.uxyzw[2] = _b.uxyzw[0];
		result.uxyzw[3] = _b.uxyzw[1];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_ABxy(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.uxyzw[0] = _b.uxyzw[0];
		result.uxyzw[1] = _b.uxyzw[1];
		result.uxyzw[2] = _a.uxyzw[0];
		result.uxyzw[3] = _a.uxyzw[1];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_CDzw(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.uxyzw[0] = _b.uxyzw[2];
		result.uxyzw[1] = _b.uxyzw[3];
		result.uxyzw[2] = _a.uxyzw[2];
		result.uxyzw[3] = _a.uxyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_zwCD(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.uxyzw[0] = _a.uxyzw[2];
		result.uxyzw[1] = _a.uxyzw[3];
		result.uxyzw[2] = _b.uxyzw[2];
		result.uxyzw[3] = _b.uxyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_xAyB(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.uxyzw[0] = _a.uxyzw[0];
		result.uxyzw[1] = _b.uxyzw[0];
		result.uxyzw[2] = _a.uxyzw[1];
		result.uxyzw[3] = _b.uxyzw[1];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_AxBy(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.uxyzw[0] = _b.uxyzw[0];
		result.uxyzw[1] = _a.uxyzw[0];
		result.uxyzw[2] = _b.uxyzw[1];
		result.uxyzw[3] = _a.uxyzw[1];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_zCwD(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.uxyzw[0] = _a.uxyzw[2];
		result.uxyzw[1] = _b.uxyzw[2];
		result.uxyzw[2] = _a.uxyzw[3];
		result.uxyzw[3] = _b.uxyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_CzDw(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.uxyzw[0] = _b.uxyzw[2];
		result.uxyzw[1] = _a.uxyzw[2];
		result.uxyzw[2] = _b.uxyzw[3];
		result.uxyzw[3] = _a.uxyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE float simd_x(simd128_ref_t _a)
	{
		return _a.fxyzw[0];
	}

	template<>
	BX_SIMD_FORCE_INLINE float simd_y(simd128_ref_t _a)
	{
		return _a.fxyzw[1];
	}

	template<>
	BX_SIMD_FORCE_INLINE float simd_z(simd128_ref_t _a)
	{
		return _a.fxyzw[2];
	}

	template<>
	BX_SIMD_FORCE_INLINE float simd_w(simd128_ref_t _a)
	{
		return _a.fxyzw[3];
	}

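	// Load/store sketch with a hypothetical local buffer. The reference path
	// reads and writes through uint32_t*, so at least natural 4-byte alignment
	// is assumed here:
	//
	//   float data[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
	//   simd128_ref_t v = simd_ld<simd128_ref_t>(data);
	//   simd_st(data, v); // writes all four lanes back
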
	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_ld(const void* _ptr)
	{
		const uint32_t* input = reinterpret_cast<const uint32_t*>(_ptr);
		simd128_ref_t result;
		result.uxyzw[0] = input[0];
		result.uxyzw[1] = input[1];
		result.uxyzw[2] = input[2];
		result.uxyzw[3] = input[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, simd128_ref_t _a)
	{
		uint32_t* result = reinterpret_cast<uint32_t*>(_ptr);
		result[0] = _a.uxyzw[0];
		result[1] = _a.uxyzw[1];
		result[2] = _a.uxyzw[2];
		result[3] = _a.uxyzw[3];
	}

	template<>
	BX_SIMD_FORCE_INLINE void simd_stx(void* _ptr, simd128_ref_t _a)
	{
		uint32_t* result = reinterpret_cast<uint32_t*>(_ptr);
		result[0] = _a.uxyzw[0];
	}

	template<>
	BX_SIMD_FORCE_INLINE void simd_stream(void* _ptr, simd128_ref_t _a)
	{
		uint32_t* result = reinterpret_cast<uint32_t*>(_ptr);
		result[0] = _a.uxyzw[0];
		result[1] = _a.uxyzw[1];
		result[2] = _a.uxyzw[2];
		result[3] = _a.uxyzw[3];
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_ld(float _x, float _y, float _z, float _w)
	{
		simd128_ref_t result;
		result.fxyzw[0] = _x;
		result.fxyzw[1] = _y;
		result.fxyzw[2] = _z;
		result.fxyzw[3] = _w;
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
	{
		simd128_ref_t result;
		result.uxyzw[0] = _x;
		result.uxyzw[1] = _y;
		result.uxyzw[2] = _z;
		result.uxyzw[3] = _w;
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_splat(const void* _ptr)
	{
		const uint32_t val = *reinterpret_cast<const uint32_t*>(_ptr);
		simd128_ref_t result;
		result.uxyzw[0] = val;
		result.uxyzw[1] = val;
		result.uxyzw[2] = val;
		result.uxyzw[3] = val;
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_splat(float _a)
	{
		return simd_ld<simd128_ref_t>(_a, _a, _a, _a);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_isplat(uint32_t _a)
	{
		return simd_ild<simd128_ref_t>(_a, _a, _a, _a);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_zero()
	{
		return simd_ild<simd128_ref_t>(0, 0, 0, 0);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_itof(simd128_ref_t _a)
	{
		simd128_ref_t result;
		result.fxyzw[0] = (float)_a.ixyzw[0];
		result.fxyzw[1] = (float)_a.ixyzw[1];
		result.fxyzw[2] = (float)_a.ixyzw[2];
		result.fxyzw[3] = (float)_a.ixyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_ftoi(simd128_ref_t _a)
	{
		simd128_ref_t result;
		result.ixyzw[0] = (int)_a.fxyzw[0];
		result.ixyzw[1] = (int)_a.fxyzw[1];
		result.ixyzw[2] = (int)_a.fxyzw[2];
		result.ixyzw[3] = (int)_a.fxyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_round(simd128_ref_t _a)
	{
		return simd_round_ni(_a);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_add(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.fxyzw[0] = _a.fxyzw[0] + _b.fxyzw[0];
		result.fxyzw[1] = _a.fxyzw[1] + _b.fxyzw[1];
		result.fxyzw[2] = _a.fxyzw[2] + _b.fxyzw[2];
		result.fxyzw[3] = _a.fxyzw[3] + _b.fxyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_sub(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.fxyzw[0] = _a.fxyzw[0] - _b.fxyzw[0];
		result.fxyzw[1] = _a.fxyzw[1] - _b.fxyzw[1];
		result.fxyzw[2] = _a.fxyzw[2] - _b.fxyzw[2];
		result.fxyzw[3] = _a.fxyzw[3] - _b.fxyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_mul(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.fxyzw[0] = _a.fxyzw[0] * _b.fxyzw[0];
		result.fxyzw[1] = _a.fxyzw[1] * _b.fxyzw[1];
		result.fxyzw[2] = _a.fxyzw[2] * _b.fxyzw[2];
		result.fxyzw[3] = _a.fxyzw[3] * _b.fxyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_div(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.fxyzw[0] = _a.fxyzw[0] / _b.fxyzw[0];
		result.fxyzw[1] = _a.fxyzw[1] / _b.fxyzw[1];
		result.fxyzw[2] = _a.fxyzw[2] / _b.fxyzw[2];
		result.fxyzw[3] = _a.fxyzw[3] / _b.fxyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_rcp_est(simd128_ref_t _a)
	{
		simd128_ref_t result;
		result.fxyzw[0] = 1.0f / _a.fxyzw[0];
		result.fxyzw[1] = 1.0f / _a.fxyzw[1];
		result.fxyzw[2] = 1.0f / _a.fxyzw[2];
		result.fxyzw[3] = 1.0f / _a.fxyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_sqrt(simd128_ref_t _a)
	{
		simd128_ref_t result;
		result.fxyzw[0] = sqrt(_a.fxyzw[0]);
		result.fxyzw[1] = sqrt(_a.fxyzw[1]);
		result.fxyzw[2] = sqrt(_a.fxyzw[2]);
		result.fxyzw[3] = sqrt(_a.fxyzw[3]);
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_rsqrt_est(simd128_ref_t _a)
	{
		simd128_ref_t result;
		result.fxyzw[0] = rsqrt(_a.fxyzw[0]);
		result.fxyzw[1] = rsqrt(_a.fxyzw[1]);
		result.fxyzw[2] = rsqrt(_a.fxyzw[2]);
		result.fxyzw[3] = rsqrt(_a.fxyzw[3]);
		return result;
	}

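	// The comparisons below return per-lane masks (all ones when true, all
	// zeros when false), which pairs with simd_selb further down. A hedged
	// sketch, assuming a and b are simd128_ref_t values already in scope:
	//
	//   simd128_ref_t mask = simd_cmpgt(a, b);
	//   simd128_ref_t hi   = simd_selb(mask, a, b); // per-lane maximum of a and b
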
	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmpeq(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.ixyzw[0] = _a.fxyzw[0] == _b.fxyzw[0] ? 0xffffffff : 0x0;
		result.ixyzw[1] = _a.fxyzw[1] == _b.fxyzw[1] ? 0xffffffff : 0x0;
		result.ixyzw[2] = _a.fxyzw[2] == _b.fxyzw[2] ? 0xffffffff : 0x0;
		result.ixyzw[3] = _a.fxyzw[3] == _b.fxyzw[3] ? 0xffffffff : 0x0;
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmplt(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.ixyzw[0] = _a.fxyzw[0] < _b.fxyzw[0] ? 0xffffffff : 0x0;
		result.ixyzw[1] = _a.fxyzw[1] < _b.fxyzw[1] ? 0xffffffff : 0x0;
		result.ixyzw[2] = _a.fxyzw[2] < _b.fxyzw[2] ? 0xffffffff : 0x0;
		result.ixyzw[3] = _a.fxyzw[3] < _b.fxyzw[3] ? 0xffffffff : 0x0;
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmple(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.ixyzw[0] = _a.fxyzw[0] <= _b.fxyzw[0] ? 0xffffffff : 0x0;
		result.ixyzw[1] = _a.fxyzw[1] <= _b.fxyzw[1] ? 0xffffffff : 0x0;
		result.ixyzw[2] = _a.fxyzw[2] <= _b.fxyzw[2] ? 0xffffffff : 0x0;
		result.ixyzw[3] = _a.fxyzw[3] <= _b.fxyzw[3] ? 0xffffffff : 0x0;
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmpgt(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.ixyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? 0xffffffff : 0x0;
		result.ixyzw[1] = _a.fxyzw[1] > _b.fxyzw[1] ? 0xffffffff : 0x0;
		result.ixyzw[2] = _a.fxyzw[2] > _b.fxyzw[2] ? 0xffffffff : 0x0;
		result.ixyzw[3] = _a.fxyzw[3] > _b.fxyzw[3] ? 0xffffffff : 0x0;
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmpge(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.ixyzw[0] = _a.fxyzw[0] >= _b.fxyzw[0] ? 0xffffffff : 0x0;
		result.ixyzw[1] = _a.fxyzw[1] >= _b.fxyzw[1] ? 0xffffffff : 0x0;
		result.ixyzw[2] = _a.fxyzw[2] >= _b.fxyzw[2] ? 0xffffffff : 0x0;
		result.ixyzw[3] = _a.fxyzw[3] >= _b.fxyzw[3] ? 0xffffffff : 0x0;
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_min(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.fxyzw[0] = _a.fxyzw[0] < _b.fxyzw[0] ? _a.fxyzw[0] : _b.fxyzw[0];
		result.fxyzw[1] = _a.fxyzw[1] < _b.fxyzw[1] ? _a.fxyzw[1] : _b.fxyzw[1];
		result.fxyzw[2] = _a.fxyzw[2] < _b.fxyzw[2] ? _a.fxyzw[2] : _b.fxyzw[2];
		result.fxyzw[3] = _a.fxyzw[3] < _b.fxyzw[3] ? _a.fxyzw[3] : _b.fxyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_max(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.fxyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? _a.fxyzw[0] : _b.fxyzw[0];
		result.fxyzw[1] = _a.fxyzw[1] > _b.fxyzw[1] ? _a.fxyzw[1] : _b.fxyzw[1];
		result.fxyzw[2] = _a.fxyzw[2] > _b.fxyzw[2] ? _a.fxyzw[2] : _b.fxyzw[2];
		result.fxyzw[3] = _a.fxyzw[3] > _b.fxyzw[3] ? _a.fxyzw[3] : _b.fxyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_and(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.uxyzw[0] = _a.uxyzw[0] & _b.uxyzw[0];
		result.uxyzw[1] = _a.uxyzw[1] & _b.uxyzw[1];
		result.uxyzw[2] = _a.uxyzw[2] & _b.uxyzw[2];
		result.uxyzw[3] = _a.uxyzw[3] & _b.uxyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_andc(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.uxyzw[0] = _a.uxyzw[0] & ~_b.uxyzw[0];
		result.uxyzw[1] = _a.uxyzw[1] & ~_b.uxyzw[1];
		result.uxyzw[2] = _a.uxyzw[2] & ~_b.uxyzw[2];
		result.uxyzw[3] = _a.uxyzw[3] & ~_b.uxyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_or(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.uxyzw[0] = _a.uxyzw[0] | _b.uxyzw[0];
		result.uxyzw[1] = _a.uxyzw[1] | _b.uxyzw[1];
		result.uxyzw[2] = _a.uxyzw[2] | _b.uxyzw[2];
		result.uxyzw[3] = _a.uxyzw[3] | _b.uxyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_xor(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.uxyzw[0] = _a.uxyzw[0] ^ _b.uxyzw[0];
		result.uxyzw[1] = _a.uxyzw[1] ^ _b.uxyzw[1];
		result.uxyzw[2] = _a.uxyzw[2] ^ _b.uxyzw[2];
		result.uxyzw[3] = _a.uxyzw[3] ^ _b.uxyzw[3];
		return result;
	}

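	// Note on the shift helpers below: _count is applied independently to each
	// 32-bit lane; as with the native backends, shift counts of 32 or more are
	// not expected to produce meaningful results.
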
	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_sll(simd128_ref_t _a, int _count)
	{
		simd128_ref_t result;
		result.uxyzw[0] = _a.uxyzw[0] << _count;
		result.uxyzw[1] = _a.uxyzw[1] << _count;
		result.uxyzw[2] = _a.uxyzw[2] << _count;
		result.uxyzw[3] = _a.uxyzw[3] << _count;
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_srl(simd128_ref_t _a, int _count)
	{
		simd128_ref_t result;
		result.uxyzw[0] = _a.uxyzw[0] >> _count;
		result.uxyzw[1] = _a.uxyzw[1] >> _count;
		result.uxyzw[2] = _a.uxyzw[2] >> _count;
		result.uxyzw[3] = _a.uxyzw[3] >> _count;
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_sra(simd128_ref_t _a, int _count)
	{
		simd128_ref_t result;
		result.ixyzw[0] = _a.ixyzw[0] >> _count;
		result.ixyzw[1] = _a.ixyzw[1] >> _count;
		result.ixyzw[2] = _a.ixyzw[2] >> _count;
		result.ixyzw[3] = _a.ixyzw[3] >> _count;
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_icmpeq(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.ixyzw[0] = _a.ixyzw[0] == _b.ixyzw[0] ? 0xffffffff : 0x0;
		result.ixyzw[1] = _a.ixyzw[1] == _b.ixyzw[1] ? 0xffffffff : 0x0;
		result.ixyzw[2] = _a.ixyzw[2] == _b.ixyzw[2] ? 0xffffffff : 0x0;
		result.ixyzw[3] = _a.ixyzw[3] == _b.ixyzw[3] ? 0xffffffff : 0x0;
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_icmplt(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.ixyzw[0] = _a.ixyzw[0] < _b.ixyzw[0] ? 0xffffffff : 0x0;
		result.ixyzw[1] = _a.ixyzw[1] < _b.ixyzw[1] ? 0xffffffff : 0x0;
		result.ixyzw[2] = _a.ixyzw[2] < _b.ixyzw[2] ? 0xffffffff : 0x0;
		result.ixyzw[3] = _a.ixyzw[3] < _b.ixyzw[3] ? 0xffffffff : 0x0;
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_icmpgt(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.ixyzw[0] = _a.ixyzw[0] > _b.ixyzw[0] ? 0xffffffff : 0x0;
		result.ixyzw[1] = _a.ixyzw[1] > _b.ixyzw[1] ? 0xffffffff : 0x0;
		result.ixyzw[2] = _a.ixyzw[2] > _b.ixyzw[2] ? 0xffffffff : 0x0;
		result.ixyzw[3] = _a.ixyzw[3] > _b.ixyzw[3] ? 0xffffffff : 0x0;
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_imin(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.ixyzw[0] = _a.ixyzw[0] < _b.ixyzw[0] ? _a.ixyzw[0] : _b.ixyzw[0];
		result.ixyzw[1] = _a.ixyzw[1] < _b.ixyzw[1] ? _a.ixyzw[1] : _b.ixyzw[1];
		result.ixyzw[2] = _a.ixyzw[2] < _b.ixyzw[2] ? _a.ixyzw[2] : _b.ixyzw[2];
		result.ixyzw[3] = _a.ixyzw[3] < _b.ixyzw[3] ? _a.ixyzw[3] : _b.ixyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_imax(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.ixyzw[0] = _a.ixyzw[0] > _b.ixyzw[0] ? _a.ixyzw[0] : _b.ixyzw[0];
		result.ixyzw[1] = _a.ixyzw[1] > _b.ixyzw[1] ? _a.ixyzw[1] : _b.ixyzw[1];
		result.ixyzw[2] = _a.ixyzw[2] > _b.ixyzw[2] ? _a.ixyzw[2] : _b.ixyzw[2];
		result.ixyzw[3] = _a.ixyzw[3] > _b.ixyzw[3] ? _a.ixyzw[3] : _b.ixyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_iadd(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.ixyzw[0] = _a.ixyzw[0] + _b.ixyzw[0];
		result.ixyzw[1] = _a.ixyzw[1] + _b.ixyzw[1];
		result.ixyzw[2] = _a.ixyzw[2] + _b.ixyzw[2];
		result.ixyzw[3] = _a.ixyzw[3] + _b.ixyzw[3];
		return result;
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_isub(simd128_ref_t _a, simd128_ref_t _b)
	{
		simd128_ref_t result;
		result.ixyzw[0] = _a.ixyzw[0] - _b.ixyzw[0];
		result.ixyzw[1] = _a.ixyzw[1] - _b.ixyzw[1];
		result.ixyzw[2] = _a.ixyzw[2] - _b.ixyzw[2];
		result.ixyzw[3] = _a.ixyzw[3] - _b.ixyzw[3];
		return result;
	}

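	// The non-template overloads below simply forward to the simd128_t
	// specializations, so callers can write e.g. simd_ld(0.0f, 1.0f, 2.0f, 3.0f)
	// without spelling out the vector type.
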
	BX_SIMD_FORCE_INLINE simd128_t simd_zero()
	{
		return simd_zero<simd128_t>();
	}

	BX_SIMD_FORCE_INLINE simd128_t simd_ld(const void* _ptr)
	{
		return simd_ld<simd128_t>(_ptr);
	}

	BX_SIMD_FORCE_INLINE simd128_t simd_ld(float _x, float _y, float _z, float _w)
	{
		return simd_ld<simd128_t>(_x, _y, _z, _w);
	}

	BX_SIMD_FORCE_INLINE simd128_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
	{
		return simd_ild<simd128_t>(_x, _y, _z, _w);
	}

	BX_SIMD_FORCE_INLINE simd128_t simd_splat(const void* _ptr)
	{
		return simd_splat<simd128_t>(_ptr);
	}

	BX_SIMD_FORCE_INLINE simd128_t simd_splat(float _a)
	{
		return simd_splat<simd128_t>(_a);
	}

	BX_SIMD_FORCE_INLINE simd128_t simd_isplat(uint32_t _a)
	{
		return simd_isplat<simd128_t>(_a);
	}

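	// The remaining specializations forward to the generic *_ni helpers, which
	// bx implements in terms of the primitive operations defined above.
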
	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_xAzC(simd128_ref_t _a, simd128_ref_t _b)
	{
		return simd_shuf_xAzC_ni(_a, _b);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_yBwD(simd128_ref_t _a, simd128_ref_t _b)
	{
		return simd_shuf_yBwD_ni(_a, _b);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_rcp(simd128_ref_t _a)
	{
		return simd_rcp_ni(_a);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_orx(simd128_ref_t _a)
	{
		return simd_orx_ni(_a);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_orc(simd128_ref_t _a, simd128_ref_t _b)
	{
		return simd_orc_ni(_a, _b);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_neg(simd128_ref_t _a)
	{
		return simd_neg_ni(_a);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_madd(simd128_ref_t _a, simd128_ref_t _b, simd128_ref_t _c)
	{
		return simd_madd_ni(_a, _b, _c);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_nmsub(simd128_ref_t _a, simd128_ref_t _b, simd128_ref_t _c)
	{
		return simd_nmsub_ni(_a, _b, _c);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_div_nr(simd128_ref_t _a, simd128_ref_t _b)
	{
		return simd_div_nr_ni(_a, _b);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_selb(simd128_ref_t _mask, simd128_ref_t _a, simd128_ref_t _b)
	{
		return simd_selb_ni(_mask, _a, _b);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_sels(simd128_ref_t _test, simd128_ref_t _a, simd128_ref_t _b)
	{
		return simd_sels_ni(_test, _a, _b);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_not(simd128_ref_t _a)
	{
		return simd_not_ni(_a);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_abs(simd128_ref_t _a)
	{
		return simd_abs_ni(_a);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_clamp(simd128_ref_t _a, simd128_ref_t _min, simd128_ref_t _max)
	{
		return simd_clamp_ni(_a, _min, _max);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_lerp(simd128_ref_t _a, simd128_ref_t _b, simd128_ref_t _s)
	{
		return simd_lerp_ni(_a, _b, _s);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_rsqrt(simd128_ref_t _a)
	{
		return simd_rsqrt_ni(_a);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_rsqrt_nr(simd128_ref_t _a)
	{
		return simd_rsqrt_nr_ni(_a);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_rsqrt_carmack(simd128_ref_t _a)
	{
		return simd_rsqrt_carmack_ni(_a);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_sqrt_nr(simd128_ref_t _a)
	{
		return simd_sqrt_nr_ni(_a);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_log2(simd128_ref_t _a)
	{
		return simd_log2_ni(_a);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_exp2(simd128_ref_t _a)
	{
		return simd_exp2_ni(_a);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_pow(simd128_ref_t _a, simd128_ref_t _b)
	{
		return simd_pow_ni(_a, _b);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_cross3(simd128_ref_t _a, simd128_ref_t _b)
	{
		return simd_cross3_ni(_a, _b);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_normalize3(simd128_ref_t _a)
	{
		return simd_normalize3_ni(_a);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_dot3(simd128_ref_t _a, simd128_ref_t _b)
	{
		return simd_dot3_ni(_a, _b);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_dot(simd128_ref_t _a, simd128_ref_t _b)
	{
		return simd_dot_ni(_a, _b);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_ceil(simd128_ref_t _a)
	{
		return simd_ceil_ni(_a);
	}

	template<>
	BX_SIMD_FORCE_INLINE simd128_ref_t simd_floor(simd128_ref_t _a)
	{
		return simd_floor_ni(_a);
	}

} // namespace bx