/* This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 26 }}}*/ 27 28 #ifndef VC_SSE_SHUFFLE_H_ 29 #define VC_SSE_SHUFFLE_H_ 30 31 #include "intrinsics.h" 32 #include "macros.h" 33 34 namespace Vc_VERSIONED_NAMESPACE 35 { 36 enum VecPos { 37 X0, X1, X2, X3, X4, X5, X6, X7, 38 Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7, 39 Const0 40 }; 41 42 namespace Mem 43 { 44 // shuffle<X1, X2, Y0, Y2>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x1 x2 y0 y2] shuffle(__m128 x,__m128 y)45 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) { 46 static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range"); 47 static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range"); 48 return _mm_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64); 49 } 50 51 // shuffle<X1, Y0>([x0 x1], [y0 y1]) = [x1 y0] shuffle(__m128d x,__m128d y)52 template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) { 53 static_assert(Dst0 >= X0 && Dst1 >= Y0, "Incorrect_Range"); 54 static_assert(Dst0 <= X1 && Dst1 <= Y1, "Incorrect_Range"); 55 return _mm_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2); 56 } 57 58 // shuffle<X1, X2, Y0, Y2>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x1 x2 y0 y2] 59 template <VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> shuffle(__m128i x,__m128i y)60 Vc_INTRINSIC Vc_CONST __m128i shuffle(__m128i x, __m128i y) 61 { 62 return _mm_castps_si128(shuffle<Dst0, Dst1, Dst2, Dst3>(_mm_castsi128_ps(x), 63 _mm_castsi128_ps(y))); 64 } 65 66 // blend<X0, Y1>([x0 x1], [y0, y1]) = [x0 y1] blend(__m128d x,__m128d y)67 template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) { 68 static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range"); 69 static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range"); 70 return Vc::SseIntrinsics::blend_pd<(Dst0 / Y0) + (Dst1 / Y0) * 2>(x, y); 71 } 72 73 // blend<X0, Y1>([x0 x1], [y0, y1]) = [x0 y1] blend(__m128 
x,__m128 y)74 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) { 75 static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range"); 76 static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range"); 77 static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range"); 78 static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range"); 79 return Vc::SseIntrinsics::blend_ps<(Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + 80 (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8>(x, y); 81 } 82 83 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7> blend(__m128i x,__m128i y)84 static Vc_ALWAYS_INLINE __m128i Vc_CONST blend(__m128i x, __m128i y) { 85 static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range"); 86 static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range"); 87 static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range"); 88 static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range"); 89 static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range"); 90 static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range"); 91 static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range"); 92 static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range"); 93 return Vc::SseIntrinsics::blend_epi16< 94 (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 + 95 (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 + (Dst6 / Y6) * 64 + 96 (Dst7 / Y7) * 128>(x, y); 97 } 98 99 // permute<X1, X2, Y0, Y2>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x1 x2 y0 y2] permute(__m128 x)100 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) { 101 static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); 102 static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); 103 return _mm_shuffle_ps(x, x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); 104 } 105 permute(__m128d x)106 template<VecPos Dst0, VecPos Dst1> 
static Vc_ALWAYS_INLINE Vc_CONST __m128d permute(__m128d x) { 107 static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range"); 108 static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range"); 109 return _mm_shuffle_pd(x, x, Dst0 + Dst1 * 4); 110 } 111 permute(__m128i x)112 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) { 113 static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); 114 static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); 115 return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); 116 } 117 permuteLo(__m128i x)118 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteLo(__m128i x) { 119 static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); 120 static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); 121 return _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); 122 } 123 permuteHi(__m128i x)124 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteHi(__m128i x) { 125 static_assert(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, "Incorrect_Range"); 126 static_assert(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, "Incorrect_Range"); 127 return _mm_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64); 128 } 129 130 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7> permute(__m128i x)131 static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) { 132 static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); 133 static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); 134 static_assert(Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4, 
"Incorrect_Range"); 135 static_assert(Dst4 <= X7 && Dst5 <= X7 && Dst6 <= X7 && Dst7 <= X7, "Incorrect_Range"); 136 if (Dst0 != X0 || Dst1 != X1 || Dst2 != X2 || Dst3 != X3) { 137 x = _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); 138 } 139 if (Dst4 != X4 || Dst5 != X5 || Dst6 != X6 || Dst7 != X7) { 140 x = _mm_shufflehi_epi16(x, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64); 141 } 142 return x; 143 } 144 } // namespace Mem 145 146 // The shuffles and permutes above use memory ordering. The ones below use register ordering: 147 namespace Reg 148 { 149 // shuffle<Y2, Y0, X2, X1>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1] shuffle(__m128 x,__m128 y)150 template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) { 151 return Mem::shuffle<Dst0, Dst1, Dst2, Dst3>(x, y); 152 } 153 154 // shuffle<Y0, X1>([x1 x0], [y1 y0]) = [y0 x1] shuffle(__m128d x,__m128d y)155 template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) { 156 return Mem::shuffle<Dst0, Dst1>(x, y); 157 } 158 159 // shuffle<X3, X0, X2, X1>([x3 x2 x1 x0]) = [x3 x0 x2 x1] permute(__m128i x)160 template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) { 161 static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); 162 static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); 163 return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); 164 } 165 166 // shuffle<Y2, Y0, X2, X1>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1] shuffle(__m128i x,__m128i y)167 template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST shuffle(__m128i x, __m128i y) { 168 static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range"); 169 static_assert(Dst0 <= X3 && Dst1 <= X3 && 
Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range"); 170 return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x), _mm_castsi128_ps(y), Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64)); 171 } 172 173 // blend<Y1, X0>([x1 x0], [y1, y0]) = [x1 y0] blend(__m128d x,__m128d y)174 template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) { 175 return Mem::blend<Dst0, Dst1>(x, y); 176 } 177 blend(__m128 x,__m128 y)178 template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) { 179 return Mem::blend<Dst0, Dst1, Dst2, Dst3>(x, y); 180 } 181 } // namespace Reg 182 } // namespace Vc 183 184 #endif // VC_SSE_SHUFFLE_H_ 185