1 /* This file is part of the Vc library. {{{
2 Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
6 * Redistributions of source code must retain the above copyright
7 notice, this list of conditions and the following disclaimer.
8 * Redistributions in binary form must reproduce the above copyright
9 notice, this list of conditions and the following disclaimer in the
10 documentation and/or other materials provided with the distribution.
11 * Neither the names of contributing organizations nor the
12 names of its contributors may be used to endorse or promote products
13 derived from this software without specific prior written permission.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
19 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26 }}}*/
27
28 #include <Vc/avx/vector.h>
29 #include <Vc/avx/debug.h>
30 #include <Vc/avx/macros.h>
31
32 namespace Vc_VERSIONED_NAMESPACE
33 {
34 namespace Detail
35 {
36 #ifdef Vc_IMPL_AVX2
template <>
Vc_CONST AVX2::short_v sorted<CurrentImplementation::current()>(AVX2::short_v x_)
{
    // Sorts the 16 signed 16-bit elements of x_ into ascending order.
    //
    // The 256-bit input is split into its two 128-bit halves so that elements
    // i and i + 8 occupy the same lane; every _mm_min_epi16/_mm_max_epi16 pair
    // then performs eight compare-exchange steps of the sorting network in
    // parallel.  The unpack/permute/blend operations between the min/max pairs
    // reroute lanes so each stage compares the correct element pairs.  The
    // diagram below traces the lane wiring through the network stages:
    //
    // ab cd ef gh ij kl mn op
    // ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑
    // ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮
    // ⎮ ╳ ⎮ ⎮ ╳ ⎮ ⎮ ╳ ⎮ ⎮ ╳ ⎮
    // ⎮⎛ ⎞⎮ ⎮⎛ ⎞⎮ ⎮⎛ ⎞⎮ ⎮⎛ ⎞⎮
    // <> <> <> <> <> <> <> <>
    // ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑
    // ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮
    // ⎮ o ⎮ ⎮ o ⎮ ⎮ o ⎮ ⎮ o ⎮
    // ⎮↓ ↑⎮ ⎮↓ ↑⎮ ⎮↓ ↑⎮ ⎮↓ ↑⎮
    // 01 23 01 23 01 23 01 23
    // ⎮⎮ ⎮⎮ ╳ ⎮⎮ ⎮⎮ ╳
    // 01 23 32 10 01 23 32 10
    // ⎮⎝ ⎮⎝ ⎠⎮ ⎠⎮ ⎮⎝ ⎮⎝ ⎠⎮ ⎠⎮
    // ⎮ ╲⎮ ╳ ⎮╱ ⎮ ⎮ ╲⎮ ╳ ⎮╱ ⎮
    // ⎮ ╲╱ ╲╱ ⎮ ⎮ ╲╱ ╲╱ ⎮
    // ⎮ ╱╲ ╱╲ ⎮ ⎮ ╱╲ ╱╲ ⎮
    // ⎮ ╱⎮ ╳ ⎮╲ ⎮ ⎮ ╱⎮ ╳ ⎮╲ ⎮
    // ⎮⎛ ⎮⎛ ⎞⎮ ⎞⎮ ⎮⎛ ⎮⎛ ⎞⎮ ⎞⎮
    // <> <> <> <> <> <> <> <>
    // ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑
    // ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮
    // ⎮ ╳ ⎮ ⎮ ╳ ⎮ ⎮ ╳ ⎮ ⎮ ╳ ⎮
    // ⎮⎛ ⎞⎮ ⎮⎛ ⎞⎮ ⎮⎛ ⎞⎮ ⎮⎛ ⎞⎮
    // <> <> <> <> <> <> <> <>
    // ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑
    // ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮
    // ⎮ o ⎮ ⎮ o ⎮ ⎮ o ⎮ ⎮ o ⎮
    // ⎮↓ ↑⎮ ⎮↓ ↑⎮ ⎮↓ ↑⎮ ⎮↓ ↑⎮
    // 01 23 01 23 01 23 01 23

    // sort pairs (one min/max)
    // x = elements 0..7, y = elements 8..15; lane i of (l, h) is the sorted
    // pair (element i, element i+8).
    auto x = AVX::lo128(x_.data());
    auto y = AVX::hi128(x_.data());
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    auto l = _mm_min_epi16(x, y);
    auto h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);

    // merge left & right quads (two min/max)
    x = _mm_unpacklo_epi16(l, h);
    y = _mm_unpackhi_epi16(h, l);
    Vc_DEBUG << "8x2 sorted xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epi16(x, y);
    h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    // blend interleaves lanes from l and h, the permute swaps adjacent pairs
    // within the low/high halves to line up the next compare-exchange stage
    x = Mem::permuteLo<X1, X0, X3, X2>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(l, h));
    y = Mem::permuteHi<X5, X4, X7, X6>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(h, l));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epi16(x, y);
    h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);

    // merge quads into octs (three min/max)
    x = _mm_unpacklo_epi16(h, l);
    y = _mm_unpackhi_epi16(l, h);
    Vc_DEBUG << "4x4 sorted xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epi16(x, y);
    h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    x = Mem::permuteLo<X2, X3, X0, X1>(Mem::blend<X0, X1, Y2, Y3, X4, X5, Y6, Y7>(h, l));
    y = Mem::permuteHi<X6, X7, X4, X5>(Mem::blend<X0, X1, Y2, Y3, X4, X5, Y6, Y7>(l, h));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epi16(x, y);
    h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    x = Mem::permuteHi<X5, X4, X7, X6>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(l, h));
    y = Mem::permuteLo<X1, X0, X3, X2>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(h, l));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epi16(x, y);
    h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h) << " done?";

    // merge octs into hexa (four min/max)
    x = _mm_unpacklo_epi16(l, h);
    y = _mm_unpackhi_epi16(h, l);
    Vc_DEBUG << "2x8 sorted xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epi16(x, y);
    h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    x = _mm_unpacklo_epi64(l, h);
    y = _mm_unpackhi_epi64(l, h);
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epi16(x, y);
    h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    // the ps casts reuse the 32-bit float blend/permute to move pairs of
    // 16-bit lanes at once
    x = _mm_castps_si128(Mem::permute<X1, X0, X3, X2>(Mem::blend<X0, Y1, X2, Y3>(_mm_castsi128_ps(h), _mm_castsi128_ps(l))));
    y = _mm_castps_si128(Mem::blend<X0, Y1, X2, Y3>(_mm_castsi128_ps(l), _mm_castsi128_ps(h)));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epi16(x, y);
    h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    x = Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(l, h);
    y = Mem::permuteLo<X1, X0, X3, X2>(
        Mem::permuteHi<X5, X4, X7, X6>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(h, l)));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epi16(x, y);
    h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    // final interleave produces the fully sorted sequence; reassemble the
    // two 128-bit halves into one 256-bit vector
    x = _mm_unpacklo_epi16(l, h);
    y = _mm_unpackhi_epi16(l, h);
    return AVX::concat(x, y);
}
143
template <>
Vc_CONST AVX2::ushort_v sorted<CurrentImplementation::current()>(AVX2::ushort_v x_)
{
    // Sorts the 16 unsigned 16-bit elements of x_ into ascending order.
    //
    // Unsigned counterpart of the short_v specialization in this file: the
    // lane wiring (unpack/permute/blend masks) is identical, only the
    // compare-exchange primitives use the unsigned _mm_min_epu16/_mm_max_epu16
    // intrinsics.  See the network diagram in the short_v overload.

    // sort pairs (one min/max)
    // x = elements 0..7, y = elements 8..15; lane i of (l, h) is the sorted
    // pair (element i, element i+8).
    auto x = AVX::lo128(x_.data());
    auto y = AVX::hi128(x_.data());
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    auto l = _mm_min_epu16(x, y);
    auto h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);

    // merge left & right quads (two min/max)
    x = _mm_unpacklo_epi16(l, h);
    y = _mm_unpackhi_epi16(h, l);
    Vc_DEBUG << "8x2 sorted xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epu16(x, y);
    h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    x = Mem::permuteLo<X1, X0, X3, X2>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(l, h));
    y = Mem::permuteHi<X5, X4, X7, X6>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(h, l));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epu16(x, y);
    h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);

    // merge quads into octs (three min/max)
    x = _mm_unpacklo_epi16(h, l);
    y = _mm_unpackhi_epi16(l, h);
    Vc_DEBUG << "4x4 sorted xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epu16(x, y);
    h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    x = Mem::permuteLo<X2, X3, X0, X1>(Mem::blend<X0, X1, Y2, Y3, X4, X5, Y6, Y7>(h, l));
    y = Mem::permuteHi<X6, X7, X4, X5>(Mem::blend<X0, X1, Y2, Y3, X4, X5, Y6, Y7>(l, h));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epu16(x, y);
    h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    x = Mem::permuteHi<X5, X4, X7, X6>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(l, h));
    y = Mem::permuteLo<X1, X0, X3, X2>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(h, l));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epu16(x, y);
    h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h) << " done?";

    // merge octs into hexa (four min/max)
    x = _mm_unpacklo_epi16(l, h);
    y = _mm_unpackhi_epi16(h, l);
    Vc_DEBUG << "2x8 sorted xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epu16(x, y);
    h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    x = _mm_unpacklo_epi64(l, h);
    y = _mm_unpackhi_epi64(l, h);
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epu16(x, y);
    h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    // the ps casts reuse the 32-bit float blend/permute to move pairs of
    // 16-bit lanes at once
    x = _mm_castps_si128(Mem::permute<X1, X0, X3, X2>(Mem::blend<X0, Y1, X2, Y3>(_mm_castsi128_ps(h), _mm_castsi128_ps(l))));
    y = _mm_castps_si128(Mem::blend<X0, Y1, X2, Y3>(_mm_castsi128_ps(l), _mm_castsi128_ps(h)));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epu16(x, y);
    h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    x = Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(l, h);
    y = Mem::permuteLo<X1, X0, X3, X2>(
        Mem::permuteHi<X5, X4, X7, X6>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(h, l)));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epu16(x, y);
    h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    // final interleave produces the fully sorted sequence; reassemble the
    // two 128-bit halves into one 256-bit vector
    x = _mm_unpacklo_epi16(l, h);
    y = _mm_unpackhi_epi16(l, h);
    return AVX::concat(x, y);
}
219
template <> Vc_CONST AVX2::int_v sorted<CurrentImplementation::current()>(AVX2::int_v x_)
{
    // Sorts the 8 signed 32-bit elements of x_ into ascending order.
    //
    // First the two 128-bit halves are combined into two fully sorted quads
    // (a sorted descending-to-index-0, b the opposite direction — see the
    // inequality comments below), then the quads are merged with a three-stage
    // compare-exchange network.  The ↓/↑ comments track the min/max of the
    // named input elements per lane (element names a..h, low to high).
    using namespace AVX;
    const __m256i hgfedcba = x_.data();
    const __m128i hgfe = hi128(hgfedcba);
    const __m128i dcba = lo128(hgfedcba);
    __m128i l = _mm_min_epi32(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea
    __m128i h = _mm_max_epi32(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea

    __m128i x = _mm_unpacklo_epi32(l, h); // ↑fb ↓fb ↑ea ↓ea
    __m128i y = _mm_unpackhi_epi32(l, h); // ↑hd ↓hd ↑gc ↓gc

    l = _mm_min_epi32(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca
    h = _mm_max_epi32(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc)

    x = _mm_min_epi32(l, Reg::permute<X2, X2, X0, X0>(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca)
    y = _mm_max_epi32(h, Reg::permute<X3, X3, X1, X1>(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca)

    __m128i b = Reg::shuffle<Y0, Y1, X0, X1>(y, x); // b3 <= b2 <= b1 <= b0
    __m128i a = _mm_unpackhi_epi64(x, y); // a3 >= a2 >= a1 >= a0

    // Fast paths: if the value ranges of the two sorted quads do not overlap,
    // the merge network below can be skipped entirely and the quads are simply
    // concatenated in the right order.
    // _mm_extract_epi32 may return an unsigned int, breaking these comparisons.
    if (Vc_IS_UNLIKELY(static_cast<int>(_mm_extract_epi32(x, 2)) >= static_cast<int>(_mm_extract_epi32(y, 1)))) {
        return concat(Reg::permute<X0, X1, X2, X3>(b), a);
    } else if (Vc_IS_UNLIKELY(static_cast<int>(_mm_extract_epi32(x, 0)) >= static_cast<int>(_mm_extract_epi32(y, 3)))) {
        return concat(a, Reg::permute<X0, X1, X2, X3>(b));
    }

    // merge the two quads (a sorted one way, b the other → bitonic sequence)
    l = _mm_min_epi32(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
    h = _mm_max_epi32(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0

    a = _mm_unpacklo_epi32(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0
    b = _mm_unpackhi_epi32(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2
    l = _mm_min_epi32(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2
    h = _mm_max_epi32(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2)

    a = _mm_unpacklo_epi32(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2
    b = _mm_unpackhi_epi32(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3
    l = _mm_min_epi32(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3
    h = _mm_max_epi32(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3)

    // interleave the final min/max results into the sorted 256-bit vector
    return concat(_mm_unpacklo_epi32(l, h), _mm_unpackhi_epi32(l, h));
}
264
template <>
Vc_CONST AVX2::uint_v sorted<CurrentImplementation::current()>(AVX2::uint_v x_)
{
    // Sorts the 8 unsigned 32-bit elements of x_ into ascending order.
    //
    // Unsigned counterpart of the int_v specialization: identical lane wiring,
    // but unsigned _mm_min_epu32/_mm_max_epu32 compare-exchanges and
    // extract_epu32 for the fast-path scalar comparisons (which must be
    // unsigned here).  The ↓/↑ comments track per-lane min/max of the named
    // input elements (a..h, low to high).
    using namespace AVX;
    const __m256i hgfedcba = x_.data();
    const __m128i hgfe = hi128(hgfedcba);
    const __m128i dcba = lo128(hgfedcba);
    __m128i l = _mm_min_epu32(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea
    __m128i h = _mm_max_epu32(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea

    __m128i x = _mm_unpacklo_epi32(l, h); // ↑fb ↓fb ↑ea ↓ea
    __m128i y = _mm_unpackhi_epi32(l, h); // ↑hd ↓hd ↑gc ↓gc

    l = _mm_min_epu32(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca
    h = _mm_max_epu32(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc)

    x = _mm_min_epu32(l, Reg::permute<X2, X2, X0, X0>(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca)
    y = _mm_max_epu32(h, Reg::permute<X3, X3, X1, X1>(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca)

    __m128i b = Reg::shuffle<Y0, Y1, X0, X1>(y, x); // b3 <= b2 <= b1 <= b0
    __m128i a = _mm_unpackhi_epi64(x, y); // a3 >= a2 >= a1 >= a0

    // Fast paths: if the value ranges of the two sorted quads do not overlap,
    // skip the merge network and concatenate the quads in the right order.
    if (Vc_IS_UNLIKELY(extract_epu32<2>(x) >= extract_epu32<1>(y))) {
        return concat(Reg::permute<X0, X1, X2, X3>(b), a);
    } else if (Vc_IS_UNLIKELY(extract_epu32<0>(x) >= extract_epu32<3>(y))) {
        return concat(a, Reg::permute<X0, X1, X2, X3>(b));
    }

    // merge the two quads (a sorted one way, b the other → bitonic sequence)
    l = _mm_min_epu32(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
    h = _mm_max_epu32(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0

    a = _mm_unpacklo_epi32(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0
    b = _mm_unpackhi_epi32(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2
    l = _mm_min_epu32(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2
    h = _mm_max_epu32(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2)

    a = _mm_unpacklo_epi32(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2
    b = _mm_unpackhi_epi32(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3
    l = _mm_min_epu32(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3
    h = _mm_max_epu32(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3)

    // interleave the final min/max results into the sorted 256-bit vector
    return concat(_mm_unpacklo_epi32(l, h), _mm_unpackhi_epi32(l, h));
}
309 #endif // AVX2
310
template <>
Vc_CONST AVX2::float_v sorted<CurrentImplementation::current()>(AVX2::float_v x_)
{
    // Sorts the 8 single-precision elements of x_ into ascending order.
    //
    // Same structure as the int_v/uint_v specializations: build two
    // oppositely sorted quads from the 128-bit halves, then merge them with a
    // three-stage compare-exchange network.  NOTE(review): unlike the integer
    // variants there is no early-exit for non-overlapping quads here —
    // presumably intentional (the scalar extract + branch may not pay off for
    // floats), but confirm.  The ↓/↑ comments track per-lane min/max of the
    // named input elements (a..h, low to high).
    __m256 hgfedcba = x_.data();
    const __m128 hgfe = AVX::hi128(hgfedcba);
    const __m128 dcba = AVX::lo128(hgfedcba);
    __m128 l = _mm_min_ps(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea
    __m128 h = _mm_max_ps(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea

    __m128 x = _mm_unpacklo_ps(l, h); // ↑fb ↓fb ↑ea ↓ea
    __m128 y = _mm_unpackhi_ps(l, h); // ↑hd ↓hd ↑gc ↓gc

    l = _mm_min_ps(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca
    h = _mm_max_ps(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc)

    x = _mm_min_ps(l, Reg::permute<X2, X2, X0, X0>(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca)
    y = _mm_max_ps(h, Reg::permute<X3, X3, X1, X1>(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca)

    // pd unpack moves 64-bit pairs, assembling the two oppositely sorted quads
    __m128 a = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(x), _mm_castps_pd(y))); // a3 >= a2 >= a1 >= a0
    __m128 b = Reg::shuffle<Y0, Y1, X0, X1>(y, x); // b3 <= b2 <= b1 <= b0

    // merge the two quads (a sorted one way, b the other → bitonic sequence)
    l = _mm_min_ps(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
    h = _mm_max_ps(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0

    a = _mm_unpacklo_ps(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0
    b = _mm_unpackhi_ps(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2
    l = _mm_min_ps(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2
    h = _mm_max_ps(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2)

    a = _mm_unpacklo_ps(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2
    b = _mm_unpackhi_ps(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3
    l = _mm_min_ps(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2),↑(↓a1b1,↓a3b3)) ↓a0b3
    h = _mm_max_ps(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2),↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3)

    // interleave the final min/max results into the sorted 256-bit vector
    return AVX::concat(_mm_unpacklo_ps(l, h), _mm_unpackhi_ps(l, h));
}
348
349 #if 0
350 template<> void SortHelper<double>::sort(__m256d &Vc_RESTRICT x, __m256d &Vc_RESTRICT y)
351 {
352 __m256d l = _mm256_min_pd(x, y); // ↓x3y3 ↓x2y2 ↓x1y1 ↓x0y0
353 __m256d h = _mm256_max_pd(x, y); // ↑x3y3 ↑x2y2 ↑x1y1 ↑x0y0
354 x = _mm256_unpacklo_pd(l, h); // ↑x2y2 ↓x2y2 ↑x0y0 ↓x0y0
355 y = _mm256_unpackhi_pd(l, h); // ↑x3y3 ↓x3y3 ↑x1y1 ↓x1y1
356 l = _mm256_min_pd(x, y); // ↓(↑x2y2,↑x3y3) ↓x3x2y3y2 ↓(↑x0y0,↑x1y1) ↓x1x0y1y0
357 h = _mm256_max_pd(x, y); // ↑x3x2y3y2 ↑(↓x2y2,↓x3y3) ↑x1x0y1y0 ↑(↓x0y0,↓x1y1)
358 x = _mm256_unpacklo_pd(l, h); // ↑(↓x2y2,↓x3y3) ↓x3x2y3y2 ↑(↓x0y0,↓x1y1) ↓x1x0y1y0
359 y = _mm256_unpackhi_pd(h, l); // ↓(↑x2y2,↑x3y3) ↑x3x2y3y2 ↓(↑x0y0,↑x1y1) ↑x1x0y1y0
360 l = _mm256_min_pd(x, y); // ↓(↑(↓x2y2,↓x3y3) ↓(↑x2y2,↑x3y3)) ↓x3x2y3y2 ↓(↑(↓x0y0,↓x1y1) ↓(↑x0y0,↑x1y1)) ↓x1x0y1y0
361 h = _mm256_max_pd(x, y); // ↑(↑(↓x2y2,↓x3y3) ↓(↑x2y2,↑x3y3)) ↑x3x2y3y2 ↑(↑(↓x0y0,↓x1y1) ↓(↑x0y0,↑x1y1)) ↑x1x0y1y0
362 __m256d a = Reg::permute<X2, X3, X1, X0>(Reg::permute128<X0, X1>(h, h)); // h0 h1 h3 h2
363 __m256d b = Reg::permute<X2, X3, X1, X0>(l); // l2 l3 l1 l0
364
365 // a3 >= a2 >= b1 >= b0
366 // b3 <= b2 <= a1 <= a0
367
368 // merge
369 l = _mm256_min_pd(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
370 h = _mm256_min_pd(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0
371
372 x = _mm256_unpacklo_pd(l, h); // ↑a2b2 ↓a2b2 ↑a0b0 ↓a0b0
373 y = _mm256_unpackhi_pd(l, h); // ↑a3b3 ↓a3b3 ↑a1b1 ↓a1b1
374 l = _mm256_min_pd(x, y); // ↓(↑a2b2,↑a3b3) ↓a2b3 ↓(↑a0b0,↑a1b1) ↓a1b0
375 h = _mm256_min_pd(x, y); // ↑a3b2 ↑(↓a2b2,↓a3b3) ↑a0b1 ↑(↓a0b0,↓a1b1)
376
377 x = Reg::permute128<Y0, X0>(l, h); // ↑a0b1 ↑(↓a0b0,↓a1b1) ↓(↑a0b0,↑a1b1) ↓a1b0
378 y = Reg::permute128<Y1, X1>(l, h); // ↑a3b2 ↑(↓a2b2,↓a3b3) ↓(↑a2b2,↑a3b3) ↓a2b3
379 l = _mm256_min_pd(x, y); // ↓(↑a0b1,↑a3b2) ↓(↑(↓a0b0,↓a1b1) ↑(↓a2b2,↓a3b3)) ↓(↑a0b0,↑a1b1,↑a2b2,↑a3b3) ↓b0b3
380 h = _mm256_min_pd(x, y); // ↑a0a3 ↑(↓a0b0,↓a1b1,↓a2b2,↓a3b3) ↑(↓(↑a0b0,↑a1b1) ↓(↑a2b2,↑a3b3)) ↑(↓a1b0,↓a2b3)
381
382 x = _mm256_unpacklo_pd(l, h); // h2 l2 h0 l0
383 y = _mm256_unpackhi_pd(l, h); // h3 l3 h1 l1
384 }
385 #endif
template <>
Vc_CONST AVX2::double_v sorted<CurrentImplementation::current()>(AVX2::double_v x_)
{
    // Sorts the 4 double-precision elements of x_ into ascending order.
    //
    // Only the small network between the two large commented-out regions is
    // active code: split into 128-bit halves, two rounds of pairwise min/max
    // with an interleave in between, then a final permute+min/max resolves the
    // middle pair.  The commented-out sections below are preserved design
    // notes exploring alternative 4-element networks.
    __m256d dcba = x_.data();
    /*
     * to find the second largest number find
     * max(min(max(ab),max(cd)), min(max(ad),max(bc)))
     * or
     * max(max(min(ab),min(cd)), min(max(ab),max(cd)))
     *
    const __m256d adcb = avx_cast<__m256d>(AVX::concat(_mm_alignr_epi8(avx_cast<__m128i>(dc), avx_cast<__m128i>(ba), 8), _mm_alignr_epi8(avx_cast<__m128i>(ba), avx_cast<__m128i>(dc), 8)));
    const __m256d l = _mm256_min_pd(dcba, adcb); // min(ad cd bc ab)
    const __m256d h = _mm256_max_pd(dcba, adcb); // max(ad cd bc ab)
    // max(h3, h1)
    // max(min(h0,h2), min(h3,h1))
    // min(max(l0,l2), max(l3,l1))
    // min(l3, l1)

    const __m256d ll = _mm256_min_pd(h, Reg::permute128<X0, X1>(h, h)); // min(h3h1 h2h0 h1h3 h0h2)
    //const __m256d hh = _mm256_max_pd(h3 ll1_3 l1 l0, h1 ll0_2 l3 l2);
    const __m256d hh = _mm256_max_pd(
        Reg::permute128<X1, Y0>(_mm256_unpackhi_pd(ll, h), l),
        Reg::permute128<X0, Y1>(_mm256_blend_pd(h ll, 0x1), l));
    _mm256_min_pd(hh0, hh1
    */

    //////////////////////////////////////////////////////////////////////////////////
    // max(max(ac), max(bd))
    // max(max(min(ac),min(bd)), min(max(ac),max(bd)))
    // min(max(min(ac),min(bd)), min(max(ac),max(bd)))
    // min(min(ac), min(bd))
    __m128d l = _mm_min_pd(AVX::lo128(dcba), AVX::hi128(dcba)); // min(bd) min(ac)
    __m128d h = _mm_max_pd(AVX::lo128(dcba), AVX::hi128(dcba)); // max(bd) max(ac)
    // interleave so each lane holds one (min, max) candidate pair
    __m128d h0_l0 = _mm_unpacklo_pd(l, h);
    __m128d h1_l1 = _mm_unpackhi_pd(l, h);
    l = _mm_min_pd(h0_l0, h1_l1);
    h = _mm_max_pd(h0_l0, h1_l1);
    // low half: overall min and runner-up; high half: runner-up-max and overall max
    return AVX::concat(
        _mm_min_pd(l, Reg::permute<X0, X0>(h)),
        _mm_max_pd(h, Reg::permute<X1, X1>(l))
        );
    // latency estimate of the chosen variant:
    // extract: 1 cycle
    // min/max: 4 cycles
    // unpacklo/hi: 2 cycles
    // min/max: 4 cycles
    // permute: 1 cycle
    // min/max: 4 cycles
    // insert: 1 cycle
    // ----------------------
    // total: 17 cycles

    /*
    __m256d cdab = Reg::permute<X2, X3, X0, X1>(dcba);
    __m256d l = _mm256_min_pd(dcba, cdab);
    __m256d h = _mm256_max_pd(dcba, cdab);
    __m256d maxmin_ba = Reg::permute128<X0, Y0>(l, h);
    __m256d maxmin_dc = Reg::permute128<X1, Y1>(l, h);

    l = _mm256_min_pd(maxmin_ba, maxmin_dc);
    h = _mm256_max_pd(maxmin_ba, maxmin_dc);

    return _mm256_blend_pd(h, l, 0x55);
    */

    /*
    // a b c d
    // b a d c
    // sort pairs
    __m256d y, l, h;
    __m128d l2, h2;
    y = shuffle<X1, Y0, X3, Y2>(x, x);
    l = _mm256_min_pd(x, y); // min[ab ab cd cd]
    h = _mm256_max_pd(x, y); // max[ab ab cd cd]

    // 1 of 2 is at [0]
    // 1 of 4 is at [1]
    // 1 of 4 is at [2]
    // 1 of 2 is at [3]

    // don't be fooled by unpack here. It works differently for AVX pd than for SSE ps
    x = _mm256_unpacklo_pd(l, h); // l_ab h_ab l_cd h_cd
    l2 = _mm_min_pd(AVX::lo128(x), AVX::hi128(x)); // l_abcd l(h_ab hcd)
    h2 = _mm_max_pd(AVX::lo128(x), AVX::hi128(x)); // h(l_ab l_cd) h_abcd

    // either it is:
    return AVX::concat(l2, h2);
    // or:
    // AVX::concat(_mm_unpacklo_pd(l2, h2), _mm_unpackhi_pd(l2, h2));

    // I'd like to have four useful compares
    const __m128d dc = AVX::hi128(dcba);
    const __m128d ba = AVX::lo128(dcba);
    const __m256d adcb = avx_cast<__m256d>(AVX::concat(_mm_alignr_epi8(avx_cast<__m128i>(dc), avx_cast<__m128i>(ba), 8), _mm_alignr_epi8(avx_cast<__m128i>(ba), avx_cast<__m128i>(dc), 8)));

    const int extraCmp = _mm_movemask_pd(_mm_cmpgt_pd(dc, ba));
    // 0x0: d <= b && c <= a
    // 0x1: d <= b && c > a
    // 0x2: d > b && c <= a
    // 0x3: d > b && c > a

    switch (_mm256_movemask_pd(_mm256_cmpgt_pd(dcba, adcb))) {
    // impossible: 0x0, 0xf
    case 0x1: // a <= b && b <= c && c <= d && d > a
        // abcd
        return Reg::permute<X2, X3, X0, X1>(Reg::permute<X0, X1>(dcba, dcba));
    case 0x2: // a <= b && b <= c && c > d && d <= a
        // dabc
        return Reg::permute<X2, X3, X0, X1>(adcb);
    case 0x3: // a <= b && b <= c && c > d && d > a
        // a[bd]c
        if (extraCmp & 2) {
            // abdc
            return Reg::permute<X2, X3, X1, X0>(Reg::permute<X0, X1>(dcba, dcba));
        } else {
            // adbc
            return Reg::permute<X3, X2, X0, X1>(adcb);
        }
    case 0x4: // a <= b && b > c && c <= d && d <= a
        // cdab;
        return Reg::permute<X2, X3, X0, X1>(dcba);
    case 0x5: // a <= b && b > c && c <= d && d > a
        // [ac] < [bd]
        switch (extraCmp) {
        case 0x0: // d <= b && c <= a
            // cadb
            return shuffle<>(dcba, bcda);
        case 0x1: // d <= b && c > a
        case 0x2: // d > b && c <= a
        case 0x3: // d > b && c > a
        }
    case 0x6: // a <= b && b > c && c > d && d <= a
        // d[ac]b
    case 0x7: // a <= b && b > c && c > d && d > a
        // adcb;
        return permute<X1, X0, X3, X2>(permute128<X1, X0>(bcda, bcda));
    case 0x8: // a > b && b <= c && c <= d && d <= a
        return bcda;
    case 0x9: // a > b && b <= c && c <= d && d > a
        // b[ac]d;
    case 0xa: // a > b && b <= c && c > d && d <= a
        // [ac] > [bd]
    case 0xb: // a > b && b <= c && c > d && d > a
        // badc;
        return permute128<X1, X0>(dcba);
    case 0xc: // a > b && b > c && c <= d && d <= a
        // c[bd]a;
    case 0xd: // a > b && b > c && c <= d && d > a
        // cbad;
        return permute<X1, X0, X3, X2>(bcda);
    case 0xe: // a > b && b > c && c > d && d <= a
        return dcba;
    }
    */
}
540
541 } // namespace Detail
542 } // namespace Vc
543
544 // vim: foldmethod=marker
545