/*  This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_SHUFFLE_H_
#define VC_SSE_SHUFFLE_H_

#include "intrinsics.h"
#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
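    // Lane selectors for the shuffle/permute/blend templates below: X0..X7 name an
    // element of the first operand, Y0..Y7 one of the second. Their numeric values
    // (X0..X7 == 0..7, Y0..Y7 == 8..15) are what the immediate-mask arithmetic in
    // this header relies on.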
    enum VecPos {
        X0, X1, X2, X3, X4, X5, X6, X7,
        Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7,
        Const0
    };

namespace Mem
{
        // shuffle<X1, X2, Y0, Y2>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x1 x2 y0 y2]
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
            return _mm_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
        }
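
        // Each destination lane occupies two bits of the shufps immediate, which is
        // why the selectors are scaled by 1, 4, 16 and 64 when the mask is built.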

        // shuffle<X1, Y0>([x0 x1], [y0 y1]) = [x1 y0]
        template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
            static_assert(Dst0 >= X0 && Dst1 >= Y0, "Incorrect_Range");
            static_assert(Dst0 <= X1 && Dst1 <= Y1, "Incorrect_Range");
            return _mm_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2);
        }
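
        // _mm_shuffle_pd uses one selector bit per destination lane: bit 0 picks the
        // element of x, bit 1 the element of y.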

        // shuffle<X1, X2, Y0, Y2>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x1 x2 y0 y2]
        template <VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3>
        Vc_INTRINSIC Vc_CONST __m128i shuffle(__m128i x, __m128i y)
        {
            return _mm_castps_si128(shuffle<Dst0, Dst1, Dst2, Dst3>(_mm_castsi128_ps(x),
                                                                    _mm_castsi128_ps(y)));
        }
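
        // SSE has no two-operand 32-bit integer shuffle, so the __m128i overload
        // reuses shufps through bitwise casts; the casts emit no instructions, though
        // crossing the int/float domains may cost a bypass delay on some CPUs.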

        // blend<X0, Y1>([x0 x1], [y0 y1]) = [x0 y1]
        template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
            static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
            static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
            return Vc::SseIntrinsics::blend_pd<(Dst0 / Y0) + (Dst1 / Y0) * 2>(x, y);
        }
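
        // Dividing a selector by a Y value maps X* (0..7) to 0 and Y* (8..15) to 1,
        // producing one blend-mask bit per destination lane.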

        // blend<X0, Y1, X2, Y3>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x0 y1 x2 y3]
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
            static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
            static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
            static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
            static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
            return Vc::SseIntrinsics::blend_ps<(Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
                                               (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8>(x, y);
        }

        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
        static Vc_ALWAYS_INLINE __m128i Vc_CONST blend(__m128i x, __m128i y) {
            static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
            static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
            static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
            static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
            static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range");
            static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range");
            static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range");
            static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range");
            return Vc::SseIntrinsics::blend_epi16<
                (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 +
                (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 + (Dst6 / Y6) * 64 +
                (Dst7 / Y7) * 128>(x, y);
        }
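
        // blend_pd/blend_ps/blend_epi16 come from intrinsics.h; they presumably map
        // to the SSE4.1 blendpd/blendps/pblendw instructions when those are enabled
        // and to a mask-and-or fallback otherwise.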

        // permute<X1, X2, X0, X2>([x0 x1 x2 x3]) = [x1 x2 x0 x2]
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
            return _mm_shuffle_ps(x, x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }
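
        // permute<X1, X0>([x0 x1]) = [x1 x0]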
        template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE Vc_CONST __m128d permute(__m128d x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range");
            return _mm_shuffle_pd(x, x, Dst0 + Dst1 * 2);
        }

        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
            return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }
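
        // permuteLo<X1, X0, X3, X2>([x0 x1 x2 x3 x4 x5 x6 x7]) = [x1 x0 x3 x2 x4 x5 x6 x7]
        // (16-bit lanes; the high half passes through unchanged)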
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteLo(__m128i x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
            return _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }
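
        // permuteHi<X5, X4, X7, X6>([x0 x1 x2 x3 x4 x5 x6 x7]) = [x0 x1 x2 x3 x5 x4 x7 x6]
        // (16-bit lanes; the low half passes through unchanged)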
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteHi(__m128i x) {
            static_assert(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, "Incorrect_Range");
            static_assert(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, "Incorrect_Range");
            return _mm_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64);
        }
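
        // Full eight-lane 16-bit permute, restricted to moves within each 64-bit
        // half; the identity tests let the compiler drop whichever pshuflw/pshufhw
        // would be a no-op (all template arguments are compile-time constants).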
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
        static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
            static_assert(Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4, "Incorrect_Range");
            static_assert(Dst4 <= X7 && Dst5 <= X7 && Dst6 <= X7 && Dst7 <= X7, "Incorrect_Range");
            if (Dst0 != X0 || Dst1 != X1 || Dst2 != X2 || Dst3 != X3) {
                x = _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
            }
            if (Dst4 != X4 || Dst5 != X5 || Dst6 != X6 || Dst7 != X7) {
                x = _mm_shufflehi_epi16(x, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
            }
            return x;
        }
}  // namespace Mem

    // The shuffles and permutes above take their selectors in memory order (element 0
    // first); the ones below use register order (highest element first, as shown in
    // the Intel instruction manuals):
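    // e.g. Reg::shuffle<Y2, Y0, X2, X1>(x, y) computes the same result as
    // Mem::shuffle<X1, X2, Y0, Y2>(x, y).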
namespace Reg
{
        // shuffle<Y2, Y0, X2, X1>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1]
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
            return Mem::shuffle<Dst0, Dst1, Dst2, Dst3>(x, y);
        }

        // shuffle<Y0, X1>([x1 x0], [y1 y0]) = [y0 x1]
        template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
            return Mem::shuffle<Dst0, Dst1>(x, y);
        }

        // permute<X3, X0, X2, X1>([x3 x2 x1 x0]) = [x3 x0 x2 x1]
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
            return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }

        // shuffle<Y2, Y0, X2, X1>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1]
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST shuffle(__m128i x, __m128i y) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
            return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x), _mm_castsi128_ps(y), Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64));
        }

        // blend<Y1, X0>([x1 x0], [y1 y0]) = [y1 x0]
        template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
            return Mem::blend<Dst0, Dst1>(x, y);
        }
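
        // blend<X3, Y2, X1, Y0>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [x3 y2 x1 y0]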
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
            return Mem::blend<Dst0, Dst1, Dst2, Dst3>(x, y);
        }
}  // namespace Reg
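
// Usage sketch (illustrative values; both calls express the same shuffle):
//   __m128 a = _mm_setr_ps(0.f, 1.f, 2.f, 3.f);     // memory order [0 1 2 3]
//   __m128 b = _mm_setr_ps(4.f, 5.f, 6.f, 7.f);     // memory order [4 5 6 7]
//   __m128 r = Mem::shuffle<X1, X2, Y0, Y2>(a, b);  // [1 2 4 6]
//   __m128 s = Reg::shuffle<Y2, Y0, X2, X1>(a, b);  // identical result, register order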
}  // namespace Vc

#endif // VC_SSE_SHUFFLE_H_