1 /****************************************************************************
2  * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  ****************************************************************************/
23 #pragma once
24 #if 0
25 //===========================================================================
26 // Placeholder name representing either SIMD4, SIMD256, or SIMD16 structures.
27 //===========================================================================
28 struct SIMD256 // or SIMD4 or SIMD16
29 {
30     //=======================================================================
31     // SIMD Types
32     //
33     // These typedefs are examples. The SIMD256 and SIMD16 implementations will
34     // use different base types with this same naming.
35     using Float     = __m256;  // Packed single-precision float vector
36     using Double    = __m256d; // Packed double-precision float vector
37     using Integer   = __m256i; // Packed integer vector (mutable element widths)
38     using Mask      = uint8_t; // Integer representing mask bits
39 
40     //=======================================================================
41     // Standard interface
42     // (available in both SIMD256 and SIMD16 widths)
43     //=======================================================================
44 
45     //-----------------------------------------------------------------------
46     // Single precision floating point arithmetic operations
47     //-----------------------------------------------------------------------
48     static Float    add_ps(Float a, Float b);               // return a + b
49     static Float    div_ps(Float a, Float b);               // return a / b
50     static Float    fmadd_ps(Float a, Float b, Float c);    // return (a * b) + c
51     static Float    fmsub_ps(Float a, Float b, Float c);    // return (a * b) - c
52     static Float    max_ps(Float a, Float b);               // return (a > b) ? a : b
53     static Float    min_ps(Float a, Float b);               // return (a < b) ? a : b
54     static Float    mul_ps(Float a, Float b);               // return a * b
55     static Float    rcp_ps(Float a);                        // return 1.0f / a
56     static Float    rsqrt_ps(Float a);                      // return 1.0f / sqrt(a)
57     static Float    sub_ps(Float a, Float b);               // return a - b
58 
59     enum class RoundMode
60     {
61         TO_NEAREST_INT  = 0x00, // Round to nearest integer == TRUNCATE(value + (signof(value))0.5)
62         TO_NEG_INF      = 0x01, // Round to negative infinity
63         TO_POS_INF      = 0x02, // Round to positive infinity
64         TO_ZERO         = 0x03, // Round to 0 a.k.a. truncate
65         CUR_DIRECTION   = 0x04, // Round in direction set in MXCSR register
66 
67         RAISE_EXC       = 0x00, // Raise exception on overflow
68         NO_EXC          = 0x08, // Suppress exceptions
69 
70         NINT            = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(RAISE_EXC),
71         NINT_NOEXC      = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(NO_EXC),
72         FLOOR           = static_cast<int>(TO_NEG_INF)      | static_cast<int>(RAISE_EXC),
73         FLOOR_NOEXC     = static_cast<int>(TO_NEG_INF)      | static_cast<int>(NO_EXC),
74         CEIL            = static_cast<int>(TO_POS_INF)      | static_cast<int>(RAISE_EXC),
75         CEIL_NOEXC      = static_cast<int>(TO_POS_INF)      | static_cast<int>(NO_EXC),
76         TRUNC           = static_cast<int>(TO_ZERO)         | static_cast<int>(RAISE_EXC),
77         TRUNC_NOEXC     = static_cast<int>(TO_ZERO)         | static_cast<int>(NO_EXC),
78         RINT            = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(RAISE_EXC),
79         NEARBYINT       = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(NO_EXC),
80     };
81 
82     // return round_func(a)
83     //
84     // round_func is chosen on the RMT template parameter.  See the documentation
85     // for the RoundMode enumeration above.
86     template <RoundMode RMT>
87     static Float    round_ps(Float a);                  // return round(a)
88 
89 
90     //-----------------------------------------------------------------------
91     // Integer (various width) arithmetic operations
92     //-----------------------------------------------------------------------
93     static Integer  abs_epi32(Integer a);               // return absolute_value(a) (int32)
94     static Integer  add_epi32(Integer a, Integer b);    // return a + b (int32)
95     static Integer  add_epi8(Integer a, Integer b);     // return a + b (int8)
96     static Integer  adds_epu8(Integer a, Integer b);    // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
97     static Integer  max_epi32(Integer a, Integer b);    // return (a > b) ? a : b (int32)
98     static Integer  max_epu32(Integer a, Integer b);    // return (a > b) ? a : b (uint32)
99     static Integer  min_epi32(Integer a, Integer b);    // return (a < b) ? a : b (int32)
100     static Integer  min_epu32(Integer a, Integer b);    // return (a < b) ? a : b (uint32)
101     static Integer  mul_epi32(Integer a, Integer b);    // return a * b (int32)
102 
103     // return (a * b) & 0xFFFFFFFF
104     //
105     // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
106     // and store the low 32 bits of the intermediate integers in dst.
107     static Float    mullo_epi32(Integer a, Integer b);
108 
109     static Integer  sub_epi32(Integer a, Integer b);    // return a - b (int32)
110     static Integer  sub_epi64(Integer a, Integer b);    // return a - b (int64)
111     static Integer  subs_epu8(Integer a, Integer b);    // return (b > a) ? 0 : (a - b) (uint8)
112 
113     //-----------------------------------------------------------------------
114     // Logical operations
115     //-----------------------------------------------------------------------
116     static Float    and_ps(Float a, Float b);           // return a & b       (float treated as int)
117     static Integer  and_si(Integer a, Integer b);       // return a & b       (int)
118     static Float    andnot_ps(Float a, Float b);        // return (~a) & b    (float treated as int)
119     static Integer  andnot_si(Integer a, Integer b);    // return (~a) & b    (int)
120     static Float    or_ps(Float a, Float b);            // return a | b       (float treated as int)
121     static Float    or_si(Integer a, Integer b);        // return a | b       (int)
122     static Float    xor_ps(Float a, Float b);           // return a ^ b       (float treated as int)
123     static Integer  xor_si(Integer a, Integer b);       // return a ^ b       (int)
124 
125     //-----------------------------------------------------------------------
126     // Shift operations
127     //-----------------------------------------------------------------------
128     template<int ImmT>
129     static Integer  slli_epi32(Integer a);              // return a << ImmT
130     static Integer  sllv_epi32(Integer a, Integer b);   // return a << b
131     template<int ImmT>
132     static Integer  srai_epi32(Integer a);              // return a >> ImmT   (int32)
133     template<int ImmT>
134     static Integer  srli_epi32(Integer a);              // return a >> ImmT   (uint32)
135     template<int ImmT>                                  // for each 128-bit lane:
136     static Integer  srli_si(Integer a);                 //  return a >> (ImmT*8) (uint)
137     template<int ImmT>
138     static Float    srlisi_ps(Float a);                 // same as srli_si, but with Float cast to int
139     static Integer  srlv_epi32(Integer a, Integer b);   // return a >> b      (uint32)
140 
141     //-----------------------------------------------------------------------
142     // Conversion operations
143     //-----------------------------------------------------------------------
144     static Float    castpd_ps(Double a);                // return *(Float*)(&a)
145     static Integer  castps_si(Float a);                 // return *(Integer*)(&a)
146     static Double   castsi_pd(Integer a);               // return *(Double*)(&a)
147     static Double   castps_pd(Float a);                 // return *(Double*)(&a)
148     static Float    castsi_ps(Integer a);               // return *(Float*)(&a)
149     static Float    cvtepi32_ps(Integer a);             // return (float)a    (int32 --> float)
150     static Integer  cvtepu8_epi16(Integer a);           // return (int16)a    (uint8 --> int16)
151     static Integer  cvtepu8_epi32(Integer a);           // return (int32)a    (uint8 --> int32)
152     static Integer  cvtepu16_epi32(Integer a);          // return (int32)a    (uint16 --> int32)
153     static Integer  cvtepu16_epi64(Integer a);          // return (int64)a    (uint16 --> int64)
154     static Integer  cvtepu32_epi64(Integer a);          // return (int64)a    (uint32 --> int64)
155     static Integer  cvtps_epi32(Float a);               // return (int32)a    (float --> int32)
156     static Integer  cvttps_epi32(Float a);              // return (int32)a    (rnd_to_zero(float) --> int32)
157 
158     //-----------------------------------------------------------------------
159     // Comparison operations
160     //-----------------------------------------------------------------------
161 
162     // Comparison types used with cmp_ps:
163     //   - ordered comparisons are always false if either operand is NaN
164     //   - unordered comparisons are always true if either operand is NaN
165     //   - signaling comparisons raise an exception if either operand is NaN
166     //   - non-signaling comparisons will never raise an exception
167     //
168     // Ordered:     return (a != NaN) && (b != NaN) && (a cmp b)
169     // Unordered:   return (a == NaN) || (b == NaN) || (a cmp b)
170     enum class CompareType
171     {
172         EQ_OQ      = 0x00, // Equal (ordered, nonsignaling)
173         LT_OS      = 0x01, // Less-than (ordered, signaling)
174         LE_OS      = 0x02, // Less-than-or-equal (ordered, signaling)
175         UNORD_Q    = 0x03, // Unordered (nonsignaling)
176         NEQ_UQ     = 0x04, // Not-equal (unordered, nonsignaling)
177         NLT_US     = 0x05, // Not-less-than (unordered, signaling)
178         NLE_US     = 0x06, // Not-less-than-or-equal (unordered, signaling)
179         ORD_Q      = 0x07, // Ordered (nonsignaling)
180         EQ_UQ      = 0x08, // Equal (unordered, non-signaling)
181         NGE_US     = 0x09, // Not-greater-than-or-equal (unordered, signaling)
182         NGT_US     = 0x0A, // Not-greater-than (unordered, signaling)
183         FALSE_OQ   = 0x0B, // False (ordered, nonsignaling)
184         NEQ_OQ     = 0x0C, // Not-equal (ordered, non-signaling)
185         GE_OS      = 0x0D, // Greater-than-or-equal (ordered, signaling)
186         GT_OS      = 0x0E, // Greater-than (ordered, signaling)
187         TRUE_UQ    = 0x0F, // True (unordered, non-signaling)
188         EQ_OS      = 0x10, // Equal (ordered, signaling)
189         LT_OQ      = 0x11, // Less-than (ordered, nonsignaling)
190         LE_OQ      = 0x12, // Less-than-or-equal (ordered, nonsignaling)
191         UNORD_S    = 0x13, // Unordered (signaling)
192         NEQ_US     = 0x14, // Not-equal (unordered, signaling)
193         NLT_UQ     = 0x15, // Not-less-than (unordered, nonsignaling)
194         NLE_UQ     = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
195         ORD_S      = 0x17, // Ordered (signaling)
196         EQ_US      = 0x18, // Equal (unordered, signaling)
197         NGE_UQ     = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
198         NGT_UQ     = 0x1A, // Not-greater-than (unordered, nonsignaling)
199         FALSE_OS   = 0x1B, // False (ordered, signaling)
200         NEQ_OS     = 0x1C, // Not-equal (ordered, signaling)
201         GE_OQ      = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
202         GT_OQ      = 0x1E, // Greater-than (ordered, nonsignaling)
203         TRUE_US    = 0x1F, // True (unordered, signaling)
204     };
205 
206     // return a (CmpTypeT) b (float)
207     //
208     // See documentation for CompareType above for valid values for CmpTypeT.
209     template<CompareType CmpTypeT>
210     static Float    cmp_ps(Float a, Float b);           // return a (CmtTypeT) b (see above)
211     static Float    cmpgt_ps(Float a, Float b);         // return cmp_ps<CompareType::GT_OQ>(a, b)
212     static Float    cmple_ps(Float a, Float b);         // return cmp_ps<CompareType::LE_OQ>(a, b)
213     static Float    cmplt_ps(Float a, Float b);         // return cmp_ps<CompareType::LT_OQ>(a, b)
214     static Float    cmpneq_ps(Float a, Float b);        // return cmp_ps<CompareType::NEQ_OQ>(a, b)
215     static Float    cmpeq_ps(Float a, Float b);         // return cmp_ps<CompareType::EQ_OQ>(a, b)
216     static Float    cmpge_ps(Float a, Float b);         // return cmp_ps<CompareType::GE_OQ>(a, b)
217     static Integer  cmpeq_epi8(Integer a, Integer b);   // return a == b (int8)
218     static Integer  cmpeq_epi16(Integer a, Integer b);  // return a == b (int16)
219     static Integer  cmpeq_epi32(Integer a, Integer b);  // return a == b (int32)
220     static Integer  cmpeq_epi64(Integer a, Integer b);  // return a == b (int64)
221     static Integer  cmpgt_epi8(Integer a, Integer b);   // return a > b (int8)
222     static Integer  cmpgt_epi16(Integer a, Integer b);  // return a > b (int16)
223     static Integer  cmpgt_epi32(Integer a, Integer b);  // return a > b (int32)
224     static Integer  cmpgt_epi64(Integer a, Integer b);  // return a > b (int64)
225     static Integer  cmplt_epi32(Integer a, Integer b);  // return a < b (int32)
226     static bool     testz_ps(Float a, Float b);         // return all_lanes_zero(a & b) ? 1 : 0 (float)
227     static bool     testz_si(Integer a, Integer b);     // return all_lanes_zero(a & b) ? 1 : 0 (int)
228 
229     //-----------------------------------------------------------------------
230     // Blend / shuffle / permute operations
231     //-----------------------------------------------------------------------
232     template<int ImmT>
233     static Float    blend_ps(Float a, Float b);                     // return ImmT ? b : a  (float)
234     static Integer  blendv_epi32(Integer a, Integer b, Float mask); // return mask ? b : a (int)
235     static Float    blendv_ps(Float a, Float b, Float mask);        // return mask ? b : a (float)
236     static Float    broadcast_ss(float const *p);                   // return *p (all elements in vector get same value)
237     static Integer  packs_epi16(Integer a, Integer b);              // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
238     static Integer  packs_epi32(Integer a, Integer b);              // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
239     static Integer  packus_epi16(Integer a, Integer b);             // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
240     static Integer  packus_epi32(Integer a, Integer b);             // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
241     static Float    permute_epi32(Integer a, Integer swiz);         // return a[swiz[i]] for each 32-bit lane i (int32)
242     static Float    permute_ps(Float a, Integer swiz);              // return a[swiz[i]] for each 32-bit lane i (float)
243     template<int SwizT>
244     static Integer  shuffle_epi32(Integer a, Integer b);
245     template<int SwizT>
246     static Integer  shuffle_epi64(Integer a, Integer b);
247     static Integer  shuffle_epi8(Integer a, Integer b);
248     template<int SwizT>
249     static Float    shuffle_pd(Double a, Double b);
250     template<int SwizT>
251     static Float    shuffle_ps(Float a, Float b);
252     static Integer  unpackhi_epi16(Integer a, Integer b);
253     static Integer  unpackhi_epi32(Integer a, Integer b);
254     static Integer  unpackhi_epi64(Integer a, Integer b);
255     static Integer  unpackhi_epi8(Integer a, Integer b);
256     static Float    unpackhi_pd(Double a, Double b);
257     static Float    unpackhi_ps(Float a, Float b);
258     static Integer  unpacklo_epi16(Integer a, Integer b);
259     static Integer  unpacklo_epi32(Integer a, Integer b);
260     static Integer  unpacklo_epi64(Integer a, Integer b);
261     static Integer  unpacklo_epi8(Integer a, Integer b);
262     static Float    unpacklo_pd(Double a, Double b);
263     static Float    unpacklo_ps(Float a, Float b);
264 
265     //-----------------------------------------------------------------------
266     // Load / store operations
267     //-----------------------------------------------------------------------
268     enum class ScaleFactor
269     {
270         SF_1,   // No scaling
271         SF_2,   // Scale offset by 2
272         SF_4,   // Scale offset by 4
273         SF_8,   // Scale offset by 8
274     };
275 
276     template<ScaleFactor ScaleT = ScaleFactor::SF_1>
277     static Float    i32gather_ps(float const* p, Integer idx);  // return *(float*)(((int8*)p) + (idx * ScaleT))
278     static Float    load1_ps(float const *p);                   // return *p    (broadcast 1 value to all elements)
279     static Float    load_ps(float const *p);                    // return *p    (loads SIMD width elements from memory)
280     static Integer  load_si(Integer const *p);                  // return *p
281     static Float    loadu_ps(float const *p);                   // return *p    (same as load_ps but allows for unaligned mem)
282     static Integer  loadu_si(Integer const *p);                 // return *p    (same as load_si but allows for unaligned mem)
283 
284     // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
285     template<int ScaleT>
286     static Float    mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask);
287 
288     static void     maskstore_ps(float *p, Integer mask, Float src);
289     static int      movemask_epi8(Integer a);
290     static int      movemask_pd(Double a);
291     static int      movemask_ps(Float a);
292     static Integer  set1_epi32(int i);                          // return i (all elements are same value)
293     static Integer  set1_epi8(char i);                          // return i (all elements are same value)
294     static Float    set1_ps(float f);                           // return f (all elements are same value)
295     static Float    setzero_ps();                               // return 0 (float)
296     static Integer  setzero_si();                               // return 0 (integer)
297     static void     store_ps(float *p, Float a);                // *p = a   (stores all elements contiguously in memory)
298     static void     store_si(Integer *p, Integer a);            // *p = a
299     static void     stream_ps(float *p, Float a);               // *p = a   (same as store_ps, but doesn't keep memory in cache)
300 
301     //=======================================================================
302     // Legacy interface (available only in SIMD256 width)
303     //=======================================================================
304 
305     static Float    broadcast_ps(__m128 const *p);
306     template<int ImmT>
307     static __m128d  extractf128_pd(Double a);
308     template<int ImmT>
309     static __m128   extractf128_ps(Float a);
310     template<int ImmT>
311     static __m128i  extractf128_si(Integer a);
312     template<int ImmT>
313     static Double   insertf128_pd(Double a, __m128d b);
314     template<int ImmT>
315     static Float    insertf128_ps(Float a, __m128 b);
316     template<int ImmT>
317     static Integer  insertf128_si(Integer a, __m128i b);
318     static Integer  loadu2_si(__m128 const* phi, __m128 const* plo);
319     template<int ImmT>
320     static Double   permute2f128_pd(Double a, Double b);
321     template<int ImmT>
322     static Float    permute2f128_ps(Float a, Float b);
323     template<int ImmT>
324     static Integer  permute2f128_si(Integer a, Integer b);
325     static Integer  set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0);
326     static void     storeu2_si(__m128i *phi, __m128i *plo, Integer src);
327 
328     //=======================================================================
329     // Advanced masking interface (currently available only in SIMD16 width)
330     //=======================================================================
331 };
332 #endif // #if 0
333