1 /*  This file is part of the Vc library. {{{
2 Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>
3 
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
6     * Redistributions of source code must retain the above copyright
7       notice, this list of conditions and the following disclaimer.
8     * Redistributions in binary form must reproduce the above copyright
9       notice, this list of conditions and the following disclaimer in the
10       documentation and/or other materials provided with the distribution.
11     * Neither the names of contributing organizations nor the
12       names of its contributors may be used to endorse or promote products
13       derived from this software without specific prior written permission.
14 
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
19 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 
26 }}}*/
27 
28 #include <Vc/avx/vector.h>
29 #include <Vc/avx/debug.h>
30 #include <Vc/avx/macros.h>
31 
32 namespace Vc_VERSIONED_NAMESPACE
33 {
34 namespace Detail
35 {
36 #ifdef Vc_IMPL_AVX2
// In-register sorting network: returns the 16 signed 16-bit lanes of an
// AVX2 short vector sorted into ascending order.  The network operates on
// the two 128-bit halves of the input at once; each _mm_min_epi16 /
// _mm_max_epi16 pair is one layer of eight parallel compare/exchange
// steps, and the unpack/permute/blend instructions between layers route
// lanes according to the wiring sketched in the ASCII diagram below.
template <>
Vc_CONST AVX2::short_v sorted<CurrentImplementation::current()>(AVX2::short_v x_)
{
    // ab cd ef gh ij kl mn op
    // ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑
    // ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮
    // ⎮ ╳ ⎮ ⎮ ╳ ⎮ ⎮ ╳ ⎮ ⎮ ╳ ⎮
    // ⎮⎛ ⎞⎮ ⎮⎛ ⎞⎮ ⎮⎛ ⎞⎮ ⎮⎛ ⎞⎮
    // <> <> <> <> <> <> <> <>
    // ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑
    // ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮
    // ⎮ o ⎮ ⎮ o ⎮ ⎮ o ⎮ ⎮ o ⎮
    // ⎮↓ ↑⎮ ⎮↓ ↑⎮ ⎮↓ ↑⎮ ⎮↓ ↑⎮
    // 01 23 01 23 01 23 01 23
    // ⎮⎮ ⎮⎮   ╳   ⎮⎮ ⎮⎮   ╳
    // 01 23 32 10 01 23 32 10
    // ⎮⎝ ⎮⎝ ⎠⎮ ⎠⎮ ⎮⎝ ⎮⎝ ⎠⎮ ⎠⎮
    // ⎮ ╲⎮ ╳ ⎮╱ ⎮ ⎮ ╲⎮ ╳ ⎮╱ ⎮
    // ⎮  ╲╱ ╲╱  ⎮ ⎮  ╲╱ ╲╱  ⎮
    // ⎮  ╱╲ ╱╲  ⎮ ⎮  ╱╲ ╱╲  ⎮
    // ⎮ ╱⎮ ╳ ⎮╲ ⎮ ⎮ ╱⎮ ╳ ⎮╲ ⎮
    // ⎮⎛ ⎮⎛ ⎞⎮ ⎞⎮ ⎮⎛ ⎮⎛ ⎞⎮ ⎞⎮
    // <> <> <> <> <> <> <> <>
    // ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑
    // ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮
    // ⎮ ╳ ⎮ ⎮ ╳ ⎮ ⎮ ╳ ⎮ ⎮ ╳ ⎮
    // ⎮⎛ ⎞⎮ ⎮⎛ ⎞⎮ ⎮⎛ ⎞⎮ ⎮⎛ ⎞⎮
    // <> <> <> <> <> <> <> <>
    // ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑ ↓↑
    // ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮ ⎮⎝ ⎠⎮
    // ⎮ o ⎮ ⎮ o ⎮ ⎮ o ⎮ ⎮ o ⎮
    // ⎮↓ ↑⎮ ⎮↓ ↑⎮ ⎮↓ ↑⎮ ⎮↓ ↑⎮
    // 01 23 01 23 01 23 01 23

    // sort pairs (one min/max)
    // Split the input into its low and high 128-bit halves; lane i of the
    // low half is compared against lane i of the high half.
    auto x = AVX::lo128(x_.data());
    auto y = AVX::hi128(x_.data());
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    auto l = _mm_min_epi16(x, y);
    auto h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);

    // merge left & right quads (two min/max)
    x = _mm_unpacklo_epi16(l, h);
    y = _mm_unpackhi_epi16(h, l);
    Vc_DEBUG << "8x2 sorted xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epi16(x, y);
    h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    // blend interleaves l/h lanes, the permuteLo/permuteHi swap adjacent
    // pairs so the next min/max layer compares the right partners
    x = Mem::permuteLo<X1, X0, X3, X2>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(l, h));
    y = Mem::permuteHi<X5, X4, X7, X6>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(h, l));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epi16(x, y);
    h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);

    // merge quads into octs (three min/max)
    x = _mm_unpacklo_epi16(h, l);
    y = _mm_unpackhi_epi16(l, h);
    Vc_DEBUG << "4x4 sorted xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epi16(x, y);
    h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    x = Mem::permuteLo<X2, X3, X0, X1>(Mem::blend<X0, X1, Y2, Y3, X4, X5, Y6, Y7>(h, l));
    y = Mem::permuteHi<X6, X7, X4, X5>(Mem::blend<X0, X1, Y2, Y3, X4, X5, Y6, Y7>(l, h));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epi16(x, y);
    h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    x = Mem::permuteHi<X5, X4, X7, X6>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(l, h));
    y = Mem::permuteLo<X1, X0, X3, X2>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(h, l));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epi16(x, y);
    h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h) << " done?";

    // merge octs into hexa (four min/max)
    x = _mm_unpacklo_epi16(l, h);
    y = _mm_unpackhi_epi16(h, l);
    Vc_DEBUG << "2x8 sorted xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epi16(x, y);
    h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    x = _mm_unpacklo_epi64(l, h);
    y = _mm_unpackhi_epi64(l, h);
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epi16(x, y);
    h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    // the float casts exist only because Mem::permute/blend for 32-bit
    // lanes are declared on __m128; the bit pattern is untouched
    x = _mm_castps_si128(Mem::permute<X1, X0, X3, X2>(Mem::blend<X0, Y1, X2, Y3>(_mm_castsi128_ps(h), _mm_castsi128_ps(l))));
    y = _mm_castps_si128(Mem::blend<X0, Y1, X2, Y3>(_mm_castsi128_ps(l), _mm_castsi128_ps(h)));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epi16(x, y);
    h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    x = Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(l, h);
    y = Mem::permuteLo<X1, X0, X3, X2>(
        Mem::permuteHi<X5, X4, X7, X6>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(h, l)));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epi16(x, y);
    h = _mm_max_epi16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    // final interleave yields the ascending result across both halves
    x = _mm_unpacklo_epi16(l, h);
    y = _mm_unpackhi_epi16(l, h);
    return AVX::concat(x, y);
}
143 
// Unsigned twin of the short_v network above: sorts the 16 unsigned
// 16-bit lanes ascending.  The lane routing (unpack/permute/blend
// wiring) is identical; only the compare/exchange primitives differ —
// _mm_min_epu16/_mm_max_epu16 perform unsigned comparisons.
// (The debug statements display the lanes as short; the values are the
// same bit patterns.)
template <>
Vc_CONST AVX2::ushort_v sorted<CurrentImplementation::current()>(AVX2::ushort_v x_)
{
    // sort pairs (one min/max)
    // compare lane i of the low 128-bit half against lane i of the high half
    auto x = AVX::lo128(x_.data());
    auto y = AVX::hi128(x_.data());
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    auto l = _mm_min_epu16(x, y);
    auto h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);

    // merge left & right quads (two min/max)
    x = _mm_unpacklo_epi16(l, h);
    y = _mm_unpackhi_epi16(h, l);
    Vc_DEBUG << "8x2 sorted xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epu16(x, y);
    h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    x = Mem::permuteLo<X1, X0, X3, X2>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(l, h));
    y = Mem::permuteHi<X5, X4, X7, X6>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(h, l));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epu16(x, y);
    h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);

    // merge quads into octs (three min/max)
    x = _mm_unpacklo_epi16(h, l);
    y = _mm_unpackhi_epi16(l, h);
    Vc_DEBUG << "4x4 sorted xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epu16(x, y);
    h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    x = Mem::permuteLo<X2, X3, X0, X1>(Mem::blend<X0, X1, Y2, Y3, X4, X5, Y6, Y7>(h, l));
    y = Mem::permuteHi<X6, X7, X4, X5>(Mem::blend<X0, X1, Y2, Y3, X4, X5, Y6, Y7>(l, h));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epu16(x, y);
    h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    x = Mem::permuteHi<X5, X4, X7, X6>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(l, h));
    y = Mem::permuteLo<X1, X0, X3, X2>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(h, l));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epu16(x, y);
    h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h) << " done?";

    // merge octs into hexa (four min/max)
    x = _mm_unpacklo_epi16(l, h);
    y = _mm_unpackhi_epi16(h, l);
    Vc_DEBUG << "2x8 sorted xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epu16(x, y);
    h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    x = _mm_unpacklo_epi64(l, h);
    y = _mm_unpackhi_epi64(l, h);
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epu16(x, y);
    h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    // float casts only select the 32-bit-lane permute/blend overloads;
    // the integer bit pattern is unchanged
    x = _mm_castps_si128(Mem::permute<X1, X0, X3, X2>(Mem::blend<X0, Y1, X2, Y3>(_mm_castsi128_ps(h), _mm_castsi128_ps(l))));
    y = _mm_castps_si128(Mem::blend<X0, Y1, X2, Y3>(_mm_castsi128_ps(l), _mm_castsi128_ps(h)));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epu16(x, y);
    h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    x = Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(l, h);
    y = Mem::permuteLo<X1, X0, X3, X2>(
        Mem::permuteHi<X5, X4, X7, X6>(Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(h, l)));
    Vc_DEBUG << "xy: " << AVX::addType<short>(x) << AVX::addType<short>(y);
    l = _mm_min_epu16(x, y);
    h = _mm_max_epu16(x, y);
    Vc_DEBUG << "lh: " << AVX::addType<short>(l) << AVX::addType<short>(h);
    // final interleave produces the ascending result
    x = _mm_unpacklo_epi16(l, h);
    y = _mm_unpackhi_epi16(l, h);
    return AVX::concat(x, y);
}
219 
// Sorts the 8 signed 32-bit lanes of an AVX2 int vector ascending.
// Strategy: sort the two 128-bit halves into one descending run `a` and
// one ascending run `b` (four min/max layers), then bitonically merge
// the two runs (three more min/max layers).  Two early-out branches skip
// the merge when the runs already don't overlap.
template <> Vc_CONST AVX2::int_v sorted<CurrentImplementation::current()>(AVX2::int_v x_)
{
    using namespace AVX;
    const __m256i hgfedcba = x_.data();
    const __m128i hgfe = hi128(hgfedcba);
    const __m128i dcba = lo128(hgfedcba);
    __m128i l = _mm_min_epi32(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea
    __m128i h = _mm_max_epi32(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea

    __m128i x = _mm_unpacklo_epi32(l, h); // ↑fb ↓fb ↑ea ↓ea
    __m128i y = _mm_unpackhi_epi32(l, h); // ↑hd ↓hd ↑gc ↓gc

    l = _mm_min_epi32(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca
    h = _mm_max_epi32(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc)

    x = _mm_min_epi32(l, Reg::permute<X2, X2, X0, X0>(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca)
    y = _mm_max_epi32(h, Reg::permute<X3, X3, X1, X1>(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca)

    __m128i b = Reg::shuffle<Y0, Y1, X0, X1>(y, x); // b3 <= b2 <= b1 <= b0
    __m128i a = _mm_unpackhi_epi64(x, y);           // a3 >= a2 >= a1 >= a0

    // Early out: if one run is entirely >= the other, just concatenate
    // (reversing b to ascending order) — no merge network needed.
    // _mm_extract_epi32 may return an unsigned int, breaking these comparisons.
    if (Vc_IS_UNLIKELY(static_cast<int>(_mm_extract_epi32(x, 2)) >= static_cast<int>(_mm_extract_epi32(y, 1)))) {
        return concat(Reg::permute<X0, X1, X2, X3>(b), a);
    } else if (Vc_IS_UNLIKELY(static_cast<int>(_mm_extract_epi32(x, 0)) >= static_cast<int>(_mm_extract_epi32(y, 3)))) {
        return concat(a, Reg::permute<X0, X1, X2, X3>(b));
    }

    // merge
    l = _mm_min_epi32(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
    h = _mm_max_epi32(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0

    a = _mm_unpacklo_epi32(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0
    b = _mm_unpackhi_epi32(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2
    l = _mm_min_epi32(a, b);      // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2
    h = _mm_max_epi32(a, b);      // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2)

    a = _mm_unpacklo_epi32(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2
    b = _mm_unpackhi_epi32(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3
    l = _mm_min_epi32(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3
    h = _mm_max_epi32(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3)

    return concat(_mm_unpacklo_epi32(l, h), _mm_unpackhi_epi32(l, h));
}
264 
// Unsigned twin of the int_v sort above: identical lane routing, but the
// compare/exchange steps use _mm_min_epu32/_mm_max_epu32 and the
// early-out comparisons use the unsigned extract helper instead of
// sign-corrected _mm_extract_epi32.
template <>
Vc_CONST AVX2::uint_v sorted<CurrentImplementation::current()>(AVX2::uint_v x_)
{
    using namespace AVX;
    const __m256i hgfedcba = x_.data();
    const __m128i hgfe = hi128(hgfedcba);
    const __m128i dcba = lo128(hgfedcba);
    __m128i l = _mm_min_epu32(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea
    __m128i h = _mm_max_epu32(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea

    __m128i x = _mm_unpacklo_epi32(l, h); // ↑fb ↓fb ↑ea ↓ea
    __m128i y = _mm_unpackhi_epi32(l, h); // ↑hd ↓hd ↑gc ↓gc

    l = _mm_min_epu32(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca
    h = _mm_max_epu32(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc)

    x = _mm_min_epu32(l, Reg::permute<X2, X2, X0, X0>(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca)
    y = _mm_max_epu32(h, Reg::permute<X3, X3, X1, X1>(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca)

    __m128i b = Reg::shuffle<Y0, Y1, X0, X1>(y, x); // b3 <= b2 <= b1 <= b0
    __m128i a = _mm_unpackhi_epi64(x, y);           // a3 >= a2 >= a1 >= a0

    // Early out: when the descending run a and ascending run b do not
    // overlap, concatenation (with b reversed) is already fully sorted.
    if (Vc_IS_UNLIKELY(extract_epu32<2>(x) >= extract_epu32<1>(y))) {
        return concat(Reg::permute<X0, X1, X2, X3>(b), a);
    } else if (Vc_IS_UNLIKELY(extract_epu32<0>(x) >= extract_epu32<3>(y))) {
        return concat(a, Reg::permute<X0, X1, X2, X3>(b));
    }

    // merge
    l = _mm_min_epu32(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
    h = _mm_max_epu32(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0

    a = _mm_unpacklo_epi32(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0
    b = _mm_unpackhi_epi32(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2
    l = _mm_min_epu32(a, b);      // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2
    h = _mm_max_epu32(a, b);      // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2)

    a = _mm_unpacklo_epi32(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2
    b = _mm_unpackhi_epi32(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3
    l = _mm_min_epu32(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3
    h = _mm_max_epu32(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3)

    return concat(_mm_unpacklo_epi32(l, h), _mm_unpackhi_epi32(l, h));
}
309 #endif  // AVX2
310 
// Sorts the 8 single-precision lanes of an AVX float vector ascending.
// Same structure as the int_v/uint_v networks: build a descending run
// `a` and an ascending run `b` from the two 128-bit halves, then merge
// bitonically.  There is no early-out here; the merge always runs.
// NOTE(review): ordering in the presence of NaN lanes follows
// _mm_min_ps/_mm_max_ps operand semantics, not a total order — confirm
// callers never pass NaNs if strict ordering is required.
template <>
Vc_CONST AVX2::float_v sorted<CurrentImplementation::current()>(AVX2::float_v x_)
{
    __m256 hgfedcba = x_.data();
    const __m128 hgfe = AVX::hi128(hgfedcba);
    const __m128 dcba = AVX::lo128(hgfedcba);
    __m128 l = _mm_min_ps(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea
    __m128 h = _mm_max_ps(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea

    __m128 x = _mm_unpacklo_ps(l, h); // ↑fb ↓fb ↑ea ↓ea
    __m128 y = _mm_unpackhi_ps(l, h); // ↑hd ↓hd ↑gc ↓gc

    l = _mm_min_ps(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca
    h = _mm_max_ps(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc)

    x = _mm_min_ps(l, Reg::permute<X2, X2, X0, X0>(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca)
    y = _mm_max_ps(h, Reg::permute<X3, X3, X1, X1>(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca)

    __m128 a = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(x), _mm_castps_pd(y))); // a3 >= a2 >= a1 >= a0
    __m128 b = Reg::shuffle<Y0, Y1, X0, X1>(y, x); // b3 <= b2 <= b1 <= b0

    // merge
    l = _mm_min_ps(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
    h = _mm_max_ps(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0

    a = _mm_unpacklo_ps(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0
    b = _mm_unpackhi_ps(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2
    l = _mm_min_ps(a, b);      // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2
    h = _mm_max_ps(a, b);      // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2)

    a = _mm_unpacklo_ps(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2
    b = _mm_unpackhi_ps(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3
    l = _mm_min_ps(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2),↑(↓a1b1,↓a3b3)) ↓a0b3
    h = _mm_max_ps(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2),↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3)

    return AVX::concat(_mm_unpacklo_ps(l, h), _mm_unpackhi_ps(l, h));
}
348 
#if 0  // disabled draft (never compiled). NOTE(review): several lines in this draft use _mm256_min_pd where the adjacent ↑/max comments indicate _mm256_max_pd was intended — audit before ever enabling.
350 template<> void SortHelper<double>::sort(__m256d &Vc_RESTRICT x, __m256d &Vc_RESTRICT y)
351 {
352     __m256d l = _mm256_min_pd(x, y); // ↓x3y3 ↓x2y2 ↓x1y1 ↓x0y0
353     __m256d h = _mm256_max_pd(x, y); // ↑x3y3 ↑x2y2 ↑x1y1 ↑x0y0
354     x = _mm256_unpacklo_pd(l, h); // ↑x2y2 ↓x2y2 ↑x0y0 ↓x0y0
355     y = _mm256_unpackhi_pd(l, h); // ↑x3y3 ↓x3y3 ↑x1y1 ↓x1y1
356     l = _mm256_min_pd(x, y); // ↓(↑x2y2,↑x3y3) ↓x3x2y3y2 ↓(↑x0y0,↑x1y1) ↓x1x0y1y0
357     h = _mm256_max_pd(x, y); // ↑x3x2y3y2 ↑(↓x2y2,↓x3y3) ↑x1x0y1y0 ↑(↓x0y0,↓x1y1)
358     x = _mm256_unpacklo_pd(l, h); // ↑(↓x2y2,↓x3y3) ↓x3x2y3y2 ↑(↓x0y0,↓x1y1) ↓x1x0y1y0
359     y = _mm256_unpackhi_pd(h, l); // ↓(↑x2y2,↑x3y3) ↑x3x2y3y2 ↓(↑x0y0,↑x1y1) ↑x1x0y1y0
360     l = _mm256_min_pd(x, y); // ↓(↑(↓x2y2,↓x3y3) ↓(↑x2y2,↑x3y3)) ↓x3x2y3y2 ↓(↑(↓x0y0,↓x1y1) ↓(↑x0y0,↑x1y1)) ↓x1x0y1y0
361     h = _mm256_max_pd(x, y); // ↑(↑(↓x2y2,↓x3y3) ↓(↑x2y2,↑x3y3)) ↑x3x2y3y2 ↑(↑(↓x0y0,↓x1y1) ↓(↑x0y0,↑x1y1)) ↑x1x0y1y0
362     __m256d a = Reg::permute<X2, X3, X1, X0>(Reg::permute128<X0, X1>(h, h)); // h0 h1 h3 h2
363     __m256d b = Reg::permute<X2, X3, X1, X0>(l);                             // l2 l3 l1 l0
364 
365     // a3 >= a2 >= b1 >= b0
366     // b3 <= b2 <= a1 <= a0
367 
368     // merge
369     l = _mm256_min_pd(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
370     h = _mm256_min_pd(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0
371 
372     x = _mm256_unpacklo_pd(l, h); // ↑a2b2 ↓a2b2 ↑a0b0 ↓a0b0
373     y = _mm256_unpackhi_pd(l, h); // ↑a3b3 ↓a3b3 ↑a1b1 ↓a1b1
374     l = _mm256_min_pd(x, y);      // ↓(↑a2b2,↑a3b3) ↓a2b3 ↓(↑a0b0,↑a1b1) ↓a1b0
375     h = _mm256_min_pd(x, y);      // ↑a3b2 ↑(↓a2b2,↓a3b3) ↑a0b1 ↑(↓a0b0,↓a1b1)
376 
377     x = Reg::permute128<Y0, X0>(l, h); // ↑a0b1 ↑(↓a0b0,↓a1b1) ↓(↑a0b0,↑a1b1) ↓a1b0
378     y = Reg::permute128<Y1, X1>(l, h); // ↑a3b2 ↑(↓a2b2,↓a3b3) ↓(↑a2b2,↑a3b3) ↓a2b3
379     l = _mm256_min_pd(x, y);      // ↓(↑a0b1,↑a3b2) ↓(↑(↓a0b0,↓a1b1) ↑(↓a2b2,↓a3b3)) ↓(↑a0b0,↑a1b1,↑a2b2,↑a3b3) ↓b0b3
380     h = _mm256_min_pd(x, y);      // ↑a0a3 ↑(↓a0b0,↓a1b1,↓a2b2,↓a3b3) ↑(↓(↑a0b0,↑a1b1) ↓(↑a2b2,↑a3b3)) ↑(↓a1b0,↓a2b3)
381 
382     x = _mm256_unpacklo_pd(l, h); // h2 l2 h0 l0
383     y = _mm256_unpackhi_pd(l, h); // h3 l3 h1 l1
384 }
385 #endif
// Sorts the 4 double-precision lanes of an AVX double vector ascending.
// The live code is the small three-layer min/max network below; the
// large /* ... */ blocks are retained design explorations of alternative
// networks that were never enabled.
// NOTE(review): as with the float variant, NaN lanes follow
// _mm_min_pd/_mm_max_pd operand semantics rather than a total order.
template <>
Vc_CONST AVX2::double_v sorted<CurrentImplementation::current()>(AVX2::double_v x_)
{
    __m256d dcba = x_.data();
    /*
     * to find the second largest number find
     * max(min(max(ab),max(cd)), min(max(ad),max(bc)))
     *  or
     * max(max(min(ab),min(cd)), min(max(ab),max(cd)))
     *
    const __m256d adcb = avx_cast<__m256d>(AVX::concat(_mm_alignr_epi8(avx_cast<__m128i>(dc), avx_cast<__m128i>(ba), 8), _mm_alignr_epi8(avx_cast<__m128i>(ba), avx_cast<__m128i>(dc), 8)));
    const __m256d l = _mm256_min_pd(dcba, adcb); // min(ad cd bc ab)
    const __m256d h = _mm256_max_pd(dcba, adcb); // max(ad cd bc ab)
    // max(h3, h1)
    // max(min(h0,h2), min(h3,h1))
    // min(max(l0,l2), max(l3,l1))
    // min(l3, l1)

    const __m256d ll = _mm256_min_pd(h, Reg::permute128<X0, X1>(h, h)); // min(h3h1 h2h0 h1h3 h0h2)
    //const __m256d hh = _mm256_max_pd(h3 ll1_3 l1 l0, h1 ll0_2 l3 l2);
    const __m256d hh = _mm256_max_pd(
            Reg::permute128<X1, Y0>(_mm256_unpackhi_pd(ll, h), l),
            Reg::permute128<X0, Y1>(_mm256_blend_pd(h ll, 0x1), l));
    _mm256_min_pd(hh0, hh1
     */

    //////////////////////////////////////////////////////////////////////////////////
    // max(max(ac), max(bd))
    // max(max(min(ac),min(bd)), min(max(ac),max(bd)))
    // min(max(min(ac),min(bd)), min(max(ac),max(bd)))
    // min(min(ac), min(bd))
    __m128d l = _mm_min_pd(AVX::lo128(dcba), AVX::hi128(dcba)); // min(bd) min(ac)
    __m128d h = _mm_max_pd(AVX::lo128(dcba), AVX::hi128(dcba)); // max(bd) max(ac)
    // pair up element 0 of l/h (the a/c comparison) and element 1 (the b/d one)
    __m128d h0_l0 = _mm_unpacklo_pd(l, h);
    __m128d h1_l1 = _mm_unpackhi_pd(l, h);
    l = _mm_min_pd(h0_l0, h1_l1); // (global min, min of the two maxima)
    h = _mm_max_pd(h0_l0, h1_l1); // (max of the two minima, global max)
    // low half:  (minimum, 2nd smallest); high half: (3rd smallest, maximum)
    return AVX::concat(
        _mm_min_pd(l, Reg::permute<X0, X0>(h)),
        _mm_max_pd(h, Reg::permute<X1, X1>(l))
            );
    // extract: 1 cycle
    // min/max: 4 cycles
    // unpacklo/hi: 2 cycles
    // min/max: 4 cycles
    // permute: 1 cycle
    // min/max: 4 cycles
    // insert:  1 cycle
    // ----------------------
    // total:   17 cycles

    /*
    __m256d cdab = Reg::permute<X2, X3, X0, X1>(dcba);
    __m256d l = _mm256_min_pd(dcba, cdab);
    __m256d h = _mm256_max_pd(dcba, cdab);
    __m256d maxmin_ba = Reg::permute128<X0, Y0>(l, h);
    __m256d maxmin_dc = Reg::permute128<X1, Y1>(l, h);

    l = _mm256_min_pd(maxmin_ba, maxmin_dc);
    h = _mm256_max_pd(maxmin_ba, maxmin_dc);

    return _mm256_blend_pd(h, l, 0x55);
    */

    /*
    // a b c d
    // b a d c
    // sort pairs
    __m256d y, l, h;
    __m128d l2, h2;
    y = shuffle<X1, Y0, X3, Y2>(x, x);
    l = _mm256_min_pd(x, y); // min[ab ab cd cd]
    h = _mm256_max_pd(x, y); // max[ab ab cd cd]

    // 1 of 2 is at [0]
    // 1 of 4 is at [1]
    // 1 of 4 is at [2]
    // 1 of 2 is at [3]

    // don't be fooled by unpack here. It works differently for AVX pd than for SSE ps
    x = _mm256_unpacklo_pd(l, h); // l_ab h_ab l_cd h_cd
    l2 = _mm_min_pd(AVX::lo128(x), AVX::hi128(x)); // l_abcd l(h_ab hcd)
    h2 = _mm_max_pd(AVX::lo128(x), AVX::hi128(x)); // h(l_ab l_cd) h_abcd

    // either it is:
    return AVX::concat(l2, h2);
    // or:
    // AVX::concat(_mm_unpacklo_pd(l2, h2), _mm_unpackhi_pd(l2, h2));

    // I'd like to have four useful compares
    const __m128d dc = AVX::hi128(dcba);
    const __m128d ba = AVX::lo128(dcba);
    const __m256d adcb = avx_cast<__m256d>(AVX::concat(_mm_alignr_epi8(avx_cast<__m128i>(dc), avx_cast<__m128i>(ba), 8), _mm_alignr_epi8(avx_cast<__m128i>(ba), avx_cast<__m128i>(dc), 8)));

    const int extraCmp = _mm_movemask_pd(_mm_cmpgt_pd(dc, ba));
    // 0x0: d <= b && c <= a
    // 0x1: d <= b && c >  a
    // 0x2: d >  b && c <= a
    // 0x3: d >  b && c >  a

    switch (_mm256_movemask_pd(_mm256_cmpgt_pd(dcba, adcb))) {
    // impossible: 0x0, 0xf
    case 0x1: // a <= b && b <= c && c <= d && d >  a
        // abcd
        return Reg::permute<X2, X3, X0, X1>(Reg::permute<X0, X1>(dcba, dcba));
    case 0x2: // a <= b && b <= c && c >  d && d <= a
        // dabc
        return Reg::permute<X2, X3, X0, X1>(adcb);
    case 0x3: // a <= b && b <= c && c >  d && d >  a
        // a[bd]c
        if (extraCmp & 2) {
            // abdc
            return Reg::permute<X2, X3, X1, X0>(Reg::permute<X0, X1>(dcba, dcba));
        } else {
            // adbc
            return Reg::permute<X3, X2, X0, X1>(adcb);
        }
    case 0x4: // a <= b && b >  c && c <= d && d <= a
        // cdab;
        return Reg::permute<X2, X3, X0, X1>(dcba);
    case 0x5: // a <= b && b >  c && c <= d && d >  a
        // [ac] < [bd]
        switch (extraCmp) {
        case 0x0: // d <= b && c <= a
            // cadb
            return shuffle<>(dcba, bcda);
        case 0x1: // d <= b && c >  a
        case 0x2: // d >  b && c <= a
        case 0x3: // d >  b && c >  a
        }
    case 0x6: // a <= b && b >  c && c >  d && d <= a
        // d[ac]b
    case 0x7: // a <= b && b >  c && c >  d && d >  a
        // adcb;
        return permute<X1, X0, X3, X2>(permute128<X1, X0>(bcda, bcda));
    case 0x8: // a >  b && b <= c && c <= d && d <= a
        return bcda;
    case 0x9: // a >  b && b <= c && c <= d && d >  a
        // b[ac]d;
    case 0xa: // a >  b && b <= c && c >  d && d <= a
        // [ac] > [bd]
    case 0xb: // a >  b && b <= c && c >  d && d >  a
        // badc;
        return permute128<X1, X0>(dcba);
    case 0xc: // a >  b && b >  c && c <= d && d <= a
        // c[bd]a;
    case 0xd: // a >  b && b >  c && c <= d && d >  a
        // cbad;
        return permute<X1, X0, X3, X2>(bcda);
    case 0xe: // a >  b && b >  c && c >  d && d <= a
        return dcba;
    }
    */
}
540 
541 }  // namespace Detail
542 }  // namespace Vc
543 
544 // vim: foldmethod=marker
545