1/****************************************************************************
2 * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23#if !defined(__SIMD_LIB_AVX_HPP__)
24#error Do not include this file directly, use "simdlib.hpp" instead.
25#endif
26
27//============================================================================
28// SIMD16 AVX (1) implementation
29//============================================================================
30
31static const int TARGET_SIMD_WIDTH = 8;
32using SIMD128T                     = SIMD128Impl::AVXImpl;
33
// Defines a unary Float wrapper named `op` that applies SIMD256T::op to the
// lower half (v8[0]) and then the upper half (v8[1]) of its argument.
#define SIMD_WRAPPER_1(op)                              \
    static SIMDINLINE Float SIMDCALL op(Float const& a) \
    {                                                   \
        const auto lo = SIMD256T::op(a.v8[0]);          \
        const auto hi = SIMD256T::op(a.v8[1]);          \
        return Float{lo, hi};                           \
    }
42
// Defines a binary Float wrapper named `op` that applies SIMD256T::op to the
// matching halves (v8[0] with v8[0], v8[1] with v8[1]) of its two arguments.
#define SIMD_WRAPPER_2(op)                                              \
    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
    {                                                                   \
        const auto lo = SIMD256T::op(a.v8[0], b.v8[0]);                 \
        const auto hi = SIMD256T::op(a.v8[1], b.v8[1]);                 \
        return Float{lo, hi};                                           \
    }
51
// Defines a binary Float wrapper named `op` taking an immediate ImmT. The low
// 8 bits of ImmT are passed to the lower-half call; the next 8 bits
// (ImmT >> TARGET_SIMD_WIDTH) are passed to the upper-half call.
#define SIMD_WRAPPER_2I(op)                                                               \
    template <int ImmT>                                                                   \
    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b)                   \
    {                                                                                     \
        const auto lo = SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]);             \
        const auto hi =                                                                   \
            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]);  \
        return Float{lo, hi};                                                             \
    }
61
// Defines a binary Float wrapper named `op` taking an immediate ImmT that is
// passed unchanged to both the lower-half and the upper-half SIMD256T call.
#define SIMD_WRAPPER_2I_1(op)                                              \
    template <int ImmT>                                                    \
    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b)    \
    {                                                                      \
        const auto lo = SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]);     \
        const auto hi = SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]);     \
        return Float{lo, hi};                                              \
    }
71
// Defines a ternary Float wrapper named `op` that applies SIMD256T::op to the
// matching halves of its three arguments.
#define SIMD_WRAPPER_3(op)                                                              \
    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \
    {                                                                                   \
        const auto lo = SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]);                        \
        const auto hi = SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]);                        \
        return Float{lo, hi};                                                           \
    }
80
// Defines a unary Integer wrapper named `op` that applies SIMD256T::op to the
// lower half (v8[0]) and then the upper half (v8[1]) of its argument.
#define SIMD_IWRAPPER_1(op)                                 \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
    {                                                       \
        const auto lo = SIMD256T::op(a.v8[0]);              \
        const auto hi = SIMD256T::op(a.v8[1]);              \
        return Integer{lo, hi};                             \
    }
89
// Defines a binary Integer wrapper named `op` that applies SIMD256T::op to the
// matching halves (v8[0] with v8[0], v8[1] with v8[1]) of its two arguments.
#define SIMD_IWRAPPER_2(op)                                                   \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    {                                                                         \
        const auto lo = SIMD256T::op(a.v8[0], b.v8[0]);                       \
        const auto hi = SIMD256T::op(a.v8[1], b.v8[1]);                       \
        return Integer{lo, hi};                                               \
    }
98
// Defines a binary Integer wrapper named `op` taking an immediate ImmT. The low
// 8 bits of ImmT are passed to the lower-half call; the next 8 bits
// (ImmT >> TARGET_SIMD_WIDTH) are passed to the upper-half call.
#define SIMD_IWRAPPER_2I(op)                                                              \
    template <int ImmT>                                                                   \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b)             \
    {                                                                                     \
        const auto lo = SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]);             \
        const auto hi =                                                                   \
            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]);  \
        return Integer{lo, hi};                                                           \
    }
108
// Defines a binary Integer wrapper named `op` taking an immediate ImmT that is
// passed unchanged to both the lower-half and the upper-half SIMD256T call.
#define SIMD_IWRAPPER_2I_1(op)                                                \
    template <int ImmT>                                                       \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    {                                                                         \
        const auto lo = SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]);        \
        const auto hi = SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]);        \
        return Integer{lo, hi};                                               \
    }
118
// Defines a binary Integer wrapper named `op` taking an immediate ImmT. The low
// 4 bits of ImmT are passed to the lower-half call; the next 4 bits (ImmT >> 4)
// are passed to the upper-half call.
#define SIMD_IWRAPPER_2I_2(op)                                                        \
    template <int ImmT>                                                               \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b)         \
    {                                                                                 \
        const auto lo = SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]);          \
        const auto hi = SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]);   \
        return Integer{lo, hi};                                                       \
    }
128
// Defines a ternary Integer wrapper named `op` that applies SIMD256T::op to the
// matching halves of its three arguments.
#define SIMD_IWRAPPER_3(op)                                                                     \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \
    {                                                                                           \
        const auto lo = SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]);                                \
        const auto hi = SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]);                                \
        return Integer{lo, hi};                                                                 \
    }
137
138//-----------------------------------------------------------------------
139// Single precision floating point arithmetic operations
140//-----------------------------------------------------------------------
// Each line expands one of the SIMD_WRAPPER_* macros defined above into a
// static wrapper that forwards to the same-named SIMD256T operation on the
// lower (v8[0]) and upper (v8[1]) halves.
SIMD_WRAPPER_2(add_ps);   // return a + b
SIMD_WRAPPER_2(div_ps);   // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);   // return a * b
SIMD_WRAPPER_1(rcp_ps);   // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps);   // return a - b
151
152template <RoundMode RMT>
153static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
154{
155    return Float{
156        SIMD256T::template round_ps<RMT>(a.v8[0]),
157        SIMD256T::template round_ps<RMT>(a.v8[1]),
158    };
159}
160
161static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
162{
163    return round_ps<RoundMode::CEIL_NOEXC>(a);
164}
165static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
166{
167    return round_ps<RoundMode::FLOOR_NOEXC>(a);
168}
169
170//-----------------------------------------------------------------------
171// Integer (various width) arithmetic operations
172//-----------------------------------------------------------------------
// Each line expands one of the SIMD_IWRAPPER_* macros defined above into a
// static wrapper that forwards to the same-named SIMD256T operation on the
// lower (v8[0]) and upper (v8[1]) halves.
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
// NOTE(review): if SIMD256T::mul_epi32 follows _mm256_mul_epi32, it multiplies
// only the low int32 of each 64-bit element and yields int64 products - confirm.
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)

// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
191
192//-----------------------------------------------------------------------
193// Logical operations
194//-----------------------------------------------------------------------
// Bitwise operations, generated per half by the wrapper macros above. The _ps
// variants take and return Float; the _si variants take and return Integer.
SIMD_WRAPPER_2(and_ps);     // return a & b       (float treated as int)
SIMD_IWRAPPER_2(and_si);    // return a & b       (int)
SIMD_WRAPPER_2(andnot_ps);  // return (~a) & b    (float treated as int)
SIMD_IWRAPPER_2(andnot_si); // return (~a) & b    (int)
SIMD_WRAPPER_2(or_ps);      // return a | b       (float treated as int)
SIMD_IWRAPPER_2(or_si);     // return a | b       (int)
SIMD_WRAPPER_2(xor_ps);     // return a ^ b       (float treated as int)
SIMD_IWRAPPER_2(xor_si);    // return a ^ b       (int)
203
204//-----------------------------------------------------------------------
205// Shift operations
206//-----------------------------------------------------------------------
207template <int ImmT>
208static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const& a) // return a << ImmT
209{
210    return Integer{
211        SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
212        SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
213    };
214}
215
// Per-lane variable left shift; each half is forwarded to SIMD256T::sllv_epi32.
SIMD_IWRAPPER_2(sllv_epi32); // return a << b      (uint32)
217
218template <int ImmT>
219static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const& a) // return a >> ImmT   (int32)
220{
221    return Integer{
222        SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
223        SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
224    };
225}
226
227template <int ImmT>
228static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const& a) // return a >> ImmT   (uint32)
229{
230    return Integer{
231        SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
232        SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
233    };
234}
235
236template <int ImmT>                                          // for each 128-bit lane:
237static SIMDINLINE Integer SIMDCALL srli_si(Integer const& a) //  return a >> (ImmT*8) (uint)
238{
239    return Integer{
240        SIMD256T::template srli_si<ImmT>(a.v8[0]),
241        SIMD256T::template srli_si<ImmT>(a.v8[1]),
242    };
243}
244template <int ImmT>
245static SIMDINLINE Float SIMDCALL
246                        srlisi_ps(Float const& a) // same as srli_si, but with Float cast to int
247{
248    return Float{
249        SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
250        SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
251    };
252}
253
// Per-lane variable right shift; each half is forwarded to SIMD256T::srlv_epi32.
SIMD_IWRAPPER_2(srlv_epi32); // return a >> b      (uint32)
255
256//-----------------------------------------------------------------------
257// Conversion operations
258//-----------------------------------------------------------------------
259static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a)
260{
261    return Float{
262        SIMD256T::castpd_ps(a.v8[0]),
263        SIMD256T::castpd_ps(a.v8[1]),
264    };
265}
266
267static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a)
268{
269    return Integer{
270        SIMD256T::castps_si(a.v8[0]),
271        SIMD256T::castps_si(a.v8[1]),
272    };
273}
274
275static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a)
276{
277    return Double{
278        SIMD256T::castsi_pd(a.v8[0]),
279        SIMD256T::castsi_pd(a.v8[1]),
280    };
281}
282
283static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a)
284{
285    return Double{
286        SIMD256T::castps_pd(a.v8[0]),
287        SIMD256T::castps_pd(a.v8[1]),
288    };
289}
290
291static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a)
292{
293    return Float{
294        SIMD256T::castsi_ps(a.v8[0]),
295        SIMD256T::castsi_ps(a.v8[1]),
296    };
297}
298
299static SIMDINLINE Float SIMDCALL
300                        cvtepi32_ps(Integer const& a) // return (float)a    (int32 --> float)
301{
302    return Float{
303        SIMD256T::cvtepi32_ps(a.v8[0]),
304        SIMD256T::cvtepi32_ps(a.v8[1]),
305    };
306}
307
308static SIMDINLINE Integer SIMDCALL
309                          cvtepu8_epi16(SIMD256Impl::Integer const& a) // return (int16)a    (uint8 --> int16)
310{
311    return Integer{
312        SIMD256T::cvtepu8_epi16(a.v4[0]),
313        SIMD256T::cvtepu8_epi16(a.v4[1]),
314    };
315}
316
317static SIMDINLINE Integer SIMDCALL
318                          cvtepu8_epi32(SIMD256Impl::Integer const& a) // return (int32)a    (uint8 --> int32)
319{
320    return Integer{
321        SIMD256T::cvtepu8_epi32(a.v4[0]),
322        SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
323    };
324}
325
326static SIMDINLINE Integer SIMDCALL
327                          cvtepu16_epi32(SIMD256Impl::Integer const& a) // return (int32)a    (uint16 --> int32)
328{
329    return Integer{
330        SIMD256T::cvtepu16_epi32(a.v4[0]),
331        SIMD256T::cvtepu16_epi32(a.v4[1]),
332    };
333}
334
335static SIMDINLINE Integer SIMDCALL
336                          cvtepu16_epi64(SIMD256Impl::Integer const& a) // return (int64)a    (uint16 --> int64)
337{
338    return Integer{
339        SIMD256T::cvtepu16_epi64(a.v4[0]),
340        SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
341    };
342}
343
344static SIMDINLINE Integer SIMDCALL
345                          cvtepu32_epi64(SIMD256Impl::Integer const& a) // return (int64)a    (uint32 --> int64)
346{
347    return Integer{
348        SIMD256T::cvtepu32_epi64(a.v4[0]),
349        SIMD256T::cvtepu32_epi64(a.v4[1]),
350    };
351}
352
353static SIMDINLINE Integer SIMDCALL
354                          cvtps_epi32(Float const& a) // return (int32)a    (float --> int32)
355{
356    return Integer{
357        SIMD256T::cvtps_epi32(a.v8[0]),
358        SIMD256T::cvtps_epi32(a.v8[1]),
359    };
360}
361
362static SIMDINLINE Integer SIMDCALL
363                          cvttps_epi32(Float const& a) // return (int32)a    (rnd_to_zero(float) --> int32)
364{
365    return Integer{
366        SIMD256T::cvtps_epi32(a.v8[0]),
367        SIMD256T::cvtps_epi32(a.v8[1]),
368    };
369}
370
371//-----------------------------------------------------------------------
372// Comparison operations
373//-----------------------------------------------------------------------
374template <CompareType CmpTypeT>
375static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b
376{
377    return Float{
378        SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
379        SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
380    };
381}
382static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b)
383{
384    return cmp_ps<CompareType::LT_OQ>(a, b);
385}
386static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b)
387{
388    return cmp_ps<CompareType::GT_OQ>(a, b);
389}
390static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b)
391{
392    return cmp_ps<CompareType::NEQ_OQ>(a, b);
393}
394static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b)
395{
396    return cmp_ps<CompareType::EQ_OQ>(a, b);
397}
398static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b)
399{
400    return cmp_ps<CompareType::GE_OQ>(a, b);
401}
402static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b)
403{
404    return cmp_ps<CompareType::LE_OQ>(a, b);
405}
406
407template <CompareType CmpTypeT>
408static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const& a, Float const& b)
409{
410    return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
411}
412
// Integer comparisons, generated per half by SIMD_IWRAPPER_2; the element
// width each one operates on is given in its trailing comment.
SIMD_IWRAPPER_2(cmpeq_epi8);  // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8);  // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
422
423static SIMDINLINE bool SIMDCALL
424                       testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
425{
426    return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) & SIMD256T::testz_ps(a.v8[1], b.v8[1]));
427}
428
429static SIMDINLINE bool SIMDCALL
430                       testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
431{
432    return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) & SIMD256T::testz_si(a.v8[1], b.v8[1]));
433}
434
435//-----------------------------------------------------------------------
436// Blend / shuffle / permute operations
437//-----------------------------------------------------------------------
// For the two immediate blends, the *_2I macros pass the low 8 bits of ImmT to
// the lower-half call and bits 8-15 to the upper-half call.
SIMD_WRAPPER_2I(blend_ps);     // return ImmT ? b : a  (float)
SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a  (int32)
SIMD_WRAPPER_3(blendv_ps);     // return mask ? b : a  (float)
441static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
442                                                Integer const& b,
443                                                Float const&   mask) // return mask ? b : a (int)
444{
445    return Integer{
446        SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
447        SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
448    };
449}
450
451static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
452                                                Integer const& b,
453                                                Integer const& mask) // return mask ? b : a (int)
454{
455    return Integer{
456        SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
457        SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
458    };
459}
460
461static SIMDINLINE Float SIMDCALL
462                        broadcast_ss(float const* p) // return *p (all elements in vector get same value)
463{
464    float f = *p;
465    return Float{
466        SIMD256T::set1_ps(f),
467        SIMD256T::set1_ps(f),
468    };
469}
470
471template <int imm>
472static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const& a)
473{
474    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
475    return a.v8[imm];
476}
477
478template <int imm>
479static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const& a)
480{
481    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
482    return a.v8[imm];
483}
484
485template <int imm>
486static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const& a)
487{
488    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
489    return a.v8[imm];
490}
491
492template <int imm>
493static SIMDINLINE Float SIMDCALL insert_ps(Float const& a, SIMD256Impl::Float const& b)
494{
495    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
496    Float r   = a;
497    r.v8[imm] = b;
498    return r;
499}
500
501template <int imm>
502static SIMDINLINE Double SIMDCALL insert_pd(Double const& a, SIMD256Impl::Double const& b)
503{
504    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
505    Double r  = a;
506    r.v8[imm] = b;
507    return r;
508}
509
510template <int imm>
511static SIMDINLINE Integer SIMDCALL insert_si(Integer const& a, SIMD256Impl::Integer const& b)
512{
513    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
514    Integer r = a;
515    r.v8[imm] = b;
516    return r;
517}
518
// Pack operations, generated by SIMD_IWRAPPER_2: each half of the result comes
// from the same-named SIMD256T call on the matching halves of a and b.
SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
523
524template <int ImmT>
525static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
526{
527    return Float{
528        SIMD256T::template permute_ps<ImmT>(a.v8[0]),
529        SIMD256T::template permute_ps<ImmT>(a.v8[1]),
530    };
531}
532
533static SIMDINLINE Integer SIMDCALL permute_epi32(
534    Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
535{
536    return castps_si(permute_ps(castsi_ps(a), swiz));
537}
538
539static SIMDINLINE Float SIMDCALL
540                        permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
541{
542    const auto mask = SIMD256T::set1_epi32(7);
543
544    auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], mask));
545    auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], mask));
546
547    auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask));
548    auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask));
549
550    return Float{
551        SIMD256T::blendv_ps(
552            lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))),
553        SIMD256T::blendv_ps(
554            hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))),
555    };
556}
557
558// All of the 512-bit permute2f128_XX intrinsics do the following:
559//
560//      SELECT4(src, control) {
561//          CASE(control[1:0])
562//              0 : tmp[127:0] : = src[127:0]
563//              1 : tmp[127:0] : = src[255:128]
564//              2 : tmp[127:0] : = src[383:256]
565//              3 : tmp[127:0] : = src[511:384]
566//              ESAC
567//              RETURN tmp[127:0]
568//      }
569//
570//      dst[127:0]   : = SELECT4(a[511:0], imm8[1:0])
571//      dst[255:128] : = SELECT4(a[511:0], imm8[3:2])
572//      dst[383:256] : = SELECT4(b[511:0], imm8[5:4])
573//      dst[511:384] : = SELECT4(b[511:0], imm8[7:6])
574//      dst[MAX:512] : = 0
575//
576// Since the 256-bit AVX instructions use a 4-bit control field (instead
577// of 2-bit for AVX512), we need to expand the control bits sent to the
578// AVX instructions for emulation.
579//
580template <int shuf>
581static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const& a, Float const& b)
582{
583    return Float{
584        SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
585                                                                                        a.v8[1]),
586        SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
587                                                                                        b.v8[1]),
588    };
589}
590
591template <int shuf>
592static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const& a, Double const& b)
593{
594    return Double{
595        SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
596                                                                                        a.v8[1]),
597        SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
598                                                                                        b.v8[1]),
599    };
600}
601
602template <int shuf>
603static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const& a, Integer const& b)
604{
605    return Integer{
606        SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
607                                                                                        a.v8[1]),
608        SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
609                                                                                        b.v8[1]),
610    };
611}
612
// Shuffle and unpack wrappers, generated per half by the macros above.
// The *_2I_1 shuffles pass the same ImmT to both halves; shuffle_epi64
// (*_2I_2) passes the low 4 bits of ImmT to the lower half and the next 4 bits
// to the upper half.
SIMD_IWRAPPER_2I_1(shuffle_epi32);
SIMD_IWRAPPER_2I_2(shuffle_epi64);
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_WRAPPER_2I_1(shuffle_pd);
SIMD_WRAPPER_2I_1(shuffle_ps);
SIMD_IWRAPPER_2(unpackhi_epi16);
SIMD_IWRAPPER_2(unpackhi_epi32);
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_WRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IWRAPPER_2(unpacklo_epi32);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_WRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
630
631//-----------------------------------------------------------------------
632// Load / store operations
633//-----------------------------------------------------------------------
634template <ScaleFactor ScaleT = ScaleFactor::SF_1>
635static SIMDINLINE Float SIMDCALL
636                        i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
637{
638    return Float{
639        SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
640        SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
641    };
642}
643
644template <ScaleFactor ScaleT = ScaleFactor::SF_1>
645static SIMDINLINE Float SIMDCALL
646                        sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
647{
648    return Float{
649        SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[0]),
650        SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[1]),
651    };
652}
653
654static SIMDINLINE Float SIMDCALL
655                        load1_ps(float const* p) // return *p    (broadcast 1 value to all elements)
656{
657    return broadcast_ss(p);
658}
659
660static SIMDINLINE Float SIMDCALL
661                        load_ps(float const* p) // return *p    (loads SIMD width elements from memory)
662{
663    return Float{SIMD256T::load_ps(p), SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)};
664}
665
666static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
667{
668    return Integer{
669        SIMD256T::load_si(&p->v8[0]),
670        SIMD256T::load_si(&p->v8[1]),
671    };
672}
673
674static SIMDINLINE Float SIMDCALL
675                        loadu_ps(float const* p) // return *p    (same as load_ps but allows for unaligned mem)
676{
677    return Float{SIMD256T::loadu_ps(p), SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)};
678}
679
680static SIMDINLINE Integer SIMDCALL
681                          loadu_si(Integer const* p) // return *p    (same as load_si but allows for unaligned mem)
682{
683    return Integer{
684        SIMD256T::loadu_si(&p->v8[0]),
685        SIMD256T::loadu_si(&p->v8[1]),
686    };
687}
688
689// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
690template <ScaleFactor ScaleT = ScaleFactor::SF_1>
691static SIMDINLINE Float SIMDCALL
692                        mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
693{
694    return Float{
695        SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
696        SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
697    };
698}
699
700template <ScaleFactor ScaleT = ScaleFactor::SF_1>
701static SIMDINLINE Float SIMDCALL
702                        sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
703{
704    return Float{
705        SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
706        SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
707    };
708}
709
710static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
711{
712    SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
713    SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
714}
715
716static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const& a)
717{
718    uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
719    mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
720
721    return mask;
722}
723
724static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a)
725{
726    uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
727    mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
728
729    return mask;
730}
731static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a)
732{
733    uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
734    mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
735
736    return mask;
737}
738
739static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
740{
741    return Integer{SIMD256T::set1_epi32(i), SIMD256T::set1_epi32(i)};
742}
743
744static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
745{
746    return Integer{SIMD256T::set1_epi8(i), SIMD256T::set1_epi8(i)};
747}
748
749static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
750{
751    return Float{SIMD256T::set1_ps(f), SIMD256T::set1_ps(f)};
752}
753
754static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
755{
756    return Float{SIMD256T::setzero_ps(), SIMD256T::setzero_ps()};
757}
758
759static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
760{
761    return Integer{SIMD256T::setzero_si(), SIMD256T::setzero_si()};
762}
763
764static SIMDINLINE void SIMDCALL
765                       store_ps(float* p, Float const& a) // *p = a   (stores all elements contiguously in memory)
766{
767    SIMD256T::store_ps(p, a.v8[0]);
768    SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
769}
770
771static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a
772{
773    SIMD256T::store_si(&p->v8[0], a.v8[0]);
774    SIMD256T::store_si(&p->v8[1], a.v8[1]);
775}
776
777static SIMDINLINE void SIMDCALL
778                       stream_ps(float* p, Float const& a) // *p = a   (same as store_ps, but doesn't keep memory in cache)
779{
780    SIMD256T::stream_ps(p, a.v8[0]);
781    SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
782}
783
784static SIMDINLINE Integer SIMDCALL set_epi32(int i15,
785                                             int i14,
786                                             int i13,
787                                             int i12,
788                                             int i11,
789                                             int i10,
790                                             int i9,
791                                             int i8,
792                                             int i7,
793                                             int i6,
794                                             int i5,
795                                             int i4,
796                                             int i3,
797                                             int i2,
798                                             int i1,
799                                             int i0)
800{
801    return Integer{SIMD256T::set_epi32(i7, i6, i5, i4, i3, i2, i1, i0),
802                   SIMD256T::set_epi32(i15, i14, i13, i12, i11, i10, i9, i8)};
803}
804
805static SIMDINLINE Integer SIMDCALL
806                          set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
807{
808    return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
809}
810
811static SIMDINLINE Float SIMDCALL set_ps(float i15,
812                                        float i14,
813                                        float i13,
814                                        float i12,
815                                        float i11,
816                                        float i10,
817                                        float i9,
818                                        float i8,
819                                        float i7,
820                                        float i6,
821                                        float i5,
822                                        float i4,
823                                        float i3,
824                                        float i2,
825                                        float i1,
826                                        float i0)
827{
828    return Float{SIMD256T::set_ps(i7, i6, i5, i4, i3, i2, i1, i0),
829                 SIMD256T::set_ps(i15, i14, i13, i12, i11, i10, i9, i8)};
830}
831
832static SIMDINLINE Float SIMDCALL
833                        set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
834{
835    return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
836}
837
838static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
839{
840    return Float{SIMD256T::vmask_ps(mask), SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)};
841}
842
843#undef SIMD_WRAPPER_1
844#undef SIMD_WRAPPER_2
845#undef SIMD_WRAPPER_2I
846#undef SIMD_WRAPPER_2I_1
847#undef SIMD_WRAPPER_3
848#undef SIMD_IWRAPPER_1
849#undef SIMD_IWRAPPER_2
850#undef SIMD_IWRAPPER_2I
851#undef SIMD_IWRAPPER_2I_1
852#undef SIMD_IWRAPPER_3
853