1 /*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>
2 
3     Distributed under the Boost Software License, Version 1.0.
4         (See accompanying file LICENSE_1_0.txt or copy at
5             http://www.boost.org/LICENSE_1_0.txt)
6 */
7 
8 #ifndef LIBSIMDPP_SIMDPP_CORE_TRANSPOSE_H
9 #define LIBSIMDPP_SIMDPP_CORE_TRANSPOSE_H
10 
11 #ifndef LIBSIMDPP_SIMD_H
12     #error "This file must be included through simd.h"
13 #endif
14 
15 #include <simdpp/types.h>
16 #include <simdpp/core/make_shuffle_bytes_mask.h>
17 #include <simdpp/core/bit_and.h>
18 #include <simdpp/core/shuffle2.h>
19 #include <simdpp/detail/insn/transpose.h>
20 #include <simdpp/detail/neon/shuffle.h>
21 #include <simdpp/detail/null/transpose.h>
22 
23 namespace simdpp {
24 namespace SIMDPP_ARCH_NAMESPACE {
25 
26 /** Transposes four 2x2 16-bit matrices within two int16x8 vectors
27 
28     Mask or expression vectors are not supported.
29 
30     @code
31     r0 = [ a0_0; a1_0 ; ... ; a0_6; a1_6 ]
32     r1 = [ a0_1; a1_1 ; ... ; a0_7; a0_7 ]
33     @endcode
34 
35     @par 128-bit version:
36     @icost{SSE2-AVX2, 4}
37     @icost{ALTIVEC, 2-4}
38 
39     @par 256-bit version:
40     The lower and higher 128-bit halves are processed as if 128-bit instruction
41     was applied to each of them separately.
42 
43     @icost{SSE2-AVX, 8}
44     @icost{AVX2, 4}
45     @icost{NEON, 2}
46     @icost{ALTIVEC, 4-6}
47 */
48 template<unsigned N, class V> SIMDPP_INL
transpose2(any_int16<N,V> & a0,any_int16<N,V> & a1)49 void transpose2(any_int16<N,V>& a0, any_int16<N,V>& a1)
50 {
51     static_assert(!is_mask<V>::value, "Mask vectors are not supported");
52     static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
53     uint16<N> qa0 = a0.wrapped();
54     uint16<N> qa1 = a1.wrapped();
55     detail::insn::i_transpose2(qa0, qa1);
56     a0.wrapped() = qa0;
57     a1.wrapped() = qa1;
58 }
59 
60 /** Transposes two 2x2 32-bit matrices within two int32x4 vectors
61 
62     @code
63     r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]
64     r1 = [ a0_1; a1_1 ; a1_3; a0_3 ]
65     @endcode
66 
67     @par 128-bit version:
68     @icost{SSE2-AVX2, 4}
69     @icost{ALTIVEC, 2-4}
70 
71     @par 256-bit version:
72     The lower and higher 128-bit halves are processed as if 128-bit instruction
73     was applied to each of them separately.
74 
75     @icost{SSE2-AVX, 8}
76     @icost{AVX2, 4}
77     @icost{NEON, 2}
78     @icost{ALTIVEC, 4-6}
79 */
80 template<unsigned N, class V> SIMDPP_INL
transpose2(any_int32<N,V> & a0,any_int32<N,V> & a1)81 void transpose2(any_int32<N,V>& a0, any_int32<N,V>& a1)
82 {
83     static_assert(!is_mask<V>::value, "Mask vectors are not supported");
84     static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
85     uint32<N> qa0 = a0.wrapped();
86     uint32<N> qa1 = a1.wrapped();
87     detail::insn::i_transpose2(qa0, qa1);
88     a0.wrapped() = qa0;
89     a1.wrapped() = qa1;
90 }
91 
92 /** Transposes a 2x2 64-bit matrix within two int64x2 vectors
93 
94     @code
95     r0 = [ a0_0; a1_0 ]
96     r1 = [ a0_1; a1_1 ]
97     @endcode
98 
99     @par 128-bit version:
100     @icost{SSE2-AVX2, 2}
101     @icost{ALTIVEC, 2-4}
102 
103     @par 256-bit version:
104     The lower and higher 128-bit halves are processed as if 128-bit instruction
105     was applied to each of them separately.
106 
107     @icost{SSE2-AVX, 4}
108     @icost{AVX2, 2}
109     @icost{NEON, 2}
110     @icost{ALTIVEC, 4-6}
111 */
112 template<unsigned N, class V> SIMDPP_INL
transpose2(any_int64<N,V> & a0,any_int64<N,V> & a1)113 void transpose2(any_int64<N,V>& a0, any_int64<N,V>& a1)
114 {
115     static_assert(!is_mask<V>::value, "Mask vectors are not supported");
116     static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
117     uint64<N> qa0 = a0.wrapped();
118     uint64<N> qa1 = a1.wrapped();
119     detail::insn::i_transpose2(qa0, qa1);
120     a0.wrapped() = qa0;
121     a1.wrapped() = qa1;
122 }
123 
124 /** Transposes two 2x2 32-bit matrices within two float32x4 vectors
125 
126     @code
127     r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]
128     r1 = [ a0_1; a1_1 ; a0_3; a0_3 ]
129     @endcode
130 
131     @par 128-bit version:
132     @icost{SSE2-AVX2, 4}
133     @icost{ALTIVEC, 2-4}
134 
135     @par 256-bit version:
136     The lower and higher 128-bit halves are processed as if 128-bit instruction
137     was applied to each of them separately.
138 
139     @icost{SSE2-SSE4.1, 8}
140     @icost{AVX-AVX2, 4}
141     @icost{ALTIVEC, 4-6}
142     @icost{NEON, 2}
143 */
144 template<unsigned N> SIMDPP_INL
transpose2(float32<N> & a0,float32<N> & a1)145 void transpose2(float32<N>& a0, float32<N>& a1)
146 {
147     detail::insn::i_transpose2(a0, a1);
148 }
149 
150 /** Transposes a 2x2 64-bit matrix within two int64x2 vectors
151 
152     @code
153     r0 = [ a0_0; a1_0 ]
154     r1 = [ a0_1; a1_1 ]
155     @endcode
156 
157     @par 128-bit version:
158     @icost{SSE2-AVX2, 2}
159     @novec{NEON, ALTIVEC}
160 
161     @par 256-bit version:
162     The lower and higher 128-bit halves are processed as if 128-bit instruction
163     was applied to each of them separately.
164 
165     @icost{SSE2-SSE4.1, 4}
166     @icost{AVX-AVX2, 2}
167     @novec{NEON, ALTIVEC}
168 */
169 template<unsigned N> SIMDPP_INL
transpose2(float64<N> & a0,float64<N> & a1)170 void transpose2(float64<N>& a0, float64<N>& a1)
171 {
172     detail::insn::i_transpose2(a0, a1);
173 }
174 
175 /** Transposes four 4x4 8-bit matrix within four int8x16 vectors
176 
177     Mask or expression vectors are not supported.
178 
179     @code
180     r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ...]
181     r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ...]
182     r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ...]
183     r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ...]
184     @endcode
185 
186     @par 128-bit version:
187     @icost{SSE2-AVX2, 16}
188     @icost{NEON, 4}
189     @icost{ALTIVEC, 8-12}
190 
191     @par 256-bit version:
192     The lower and higher 128-bit halves are processed as if 128-bit instruction
193     was applied to each of them separately.
194 
195     @icost{SSE2-AVX, 32}
196     @icost{AVX2, 16}
197     @icost{NEON, 8}
198     @icost{ALTIVEC, 16-20}
199 */
200 template<unsigned N, class V> SIMDPP_INL
transpose4(any_int8<N,V> & a0,any_int8<N,V> & a1,any_int8<N,V> & a2,any_int8<N,V> & a3)201 void transpose4(any_int8<N,V>& a0, any_int8<N,V>& a1,
202                 any_int8<N,V>& a2, any_int8<N,V>& a3)
203 {
204     static_assert(!is_mask<V>::value, "Mask vectors are not supported");
205     static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
206     uint8<N> qa0, qa1, qa2, qa3;
207     qa0 = a0.wrapped();  qa1 = a1.wrapped();  qa2 = a2.wrapped();  qa3 = a3.wrapped();
208     detail::insn::i_transpose4(qa0, qa1, qa2, qa3);
209     a0.wrapped() = qa0;  a1.wrapped() = qa1;  a2.wrapped() = qa2;  a3.wrapped() = qa3;
210 }
211 
212 /** Transposes two 4x4 16-bit matrices within four int16x8 vectors
213 
214     Mask or expression vectors are not supported.
215 
216     @code
217     r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ]
218     r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ]
219     r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ]
220     r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ]
221     @endcode
222 
223     @par 128-bit version:
224     @icost{SSE2-AVX2, 12}
225     @icost{NEON, 4}
226     @icost{ALTIVEC, 8-12}
227 
228     @par 256-bit version:
229     The lower and higher 128-bit halves are processed as if 128-bit instruction
230     was applied to each of them separately.
231 
232     @icost{SSE2-AVX, 24}
233     @icost{AVX2, 12}
234     @icost{NEON, 8}
235     @icost{ALTIVEC, 16-20}
236 */
237 template<unsigned N, class V> SIMDPP_INL
transpose4(any_int16<N,V> & a0,any_int16<N,V> & a1,any_int16<N,V> & a2,any_int16<N,V> & a3)238 void transpose4(any_int16<N,V>& a0, any_int16<N,V>& a1,
239                 any_int16<N,V>& a2, any_int16<N,V>& a3)
240 {
241     static_assert(!is_mask<V>::value, "Mask vectors are not supported");
242     static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
243     uint16<N> qa0, qa1, qa2, qa3;
244     qa0 = a0.wrapped();  qa1 = a1.wrapped();  qa2 = a2.wrapped();  qa3 = a3.wrapped();
245     detail::insn::i_transpose4(qa0, qa1, qa2, qa3);
246     a0.wrapped() = qa0;  a1.wrapped() = qa1;  a2.wrapped() = qa2;  a3.wrapped() = qa3;
247 }
248 
249 /** Transposes a 4x4 32-bit matrix within four int32x4 vectors
250 
251     Mask or expression vectors are not supported.
252 
253     @code
254     r0 = [ a0_0; a1_0; a2_0; a3_0 ]
255     r1 = [ a0_1; a1_1; a2_1; a3_1 ]
256     r2 = [ a0_2; a1_2; a2_2; a3_2 ]
257     r3 = [ a0_3; a1_3; a2_3; a3_3 ]
258     @endcode
259 
260     @par 128-bit version:
261     @icost{SSE2-AVX2, 12}
262     @icost{NEON, 4}
263     @icost{ALTIVEC, 8-12}
264 
265     @par 256-bit version:
266     @icost{SSE2-AVX, 24}
267     @icost{AVX2, 12}
268     @icost{NEON, 8}
269     @icost{ALTIVEC, 16-20}
270 
271     The lower and higher 128-bit halves are processed as if 128-bit instruction
272     was applied to each of them separately.
273 */
274 template<unsigned N, class V> SIMDPP_INL
transpose4(any_int32<N,V> & a0,any_int32<N,V> & a1,any_int32<N,V> & a2,any_int32<N,V> & a3)275 void transpose4(any_int32<N,V>& a0, any_int32<N,V>& a1,
276                 any_int32<N,V>& a2, any_int32<N,V>& a3)
277 {
278     static_assert(!is_mask<V>::value, "Mask vectors are not supported");
279     static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
280     uint32<N> qa0, qa1, qa2, qa3;
281     qa0 = a0.wrapped();  qa1 = a1.wrapped();  qa2 = a2.wrapped();  qa3 = a3.wrapped();
282     detail::insn::i_transpose4(qa0, qa1, qa2, qa3);
283     a0.wrapped() = qa0;  a1.wrapped() = qa1;  a2.wrapped() = qa2;  a3.wrapped() = qa3;
284 }
285 
286 /** Transposes 4x4 32-bit matrix within four float32x4 vectors
287 
288     @code
289     r0 = [ a0_0; a1_0; a2_0; a3_0 ]
290     r1 = [ a0_1; a1_1; a2_1; a3_1 ]
291     r2 = [ a0_2; a1_2; a2_2; a3_2 ]
292     r3 = [ a0_3; a1_3; a2_3; a3_3 ]
293     @endcode
294 
295     @par 128-bit version:
296     @icost{SSE2-AVX2, 12}
297     @icost{NEON, 4}
298     @icost{ALTIVEC, 8-12}
299 
300     @par 256-bit version:
301     @icost{SSE2-SSE4.1, 24}
302     @icost{AVX-AVX2, 12}
303     @icost{NEON, 8}
304     @icost{ALTIVEC, 16-20}
305 
306     The lower and higher 128-bit halves are processed as if 128-bit instruction
307     was applied to each of them separately.
308 */
309 template<unsigned N> SIMDPP_INL
transpose4(float32<N> & a0,float32<N> & a1,float32<N> & a2,float32<N> & a3)310 void transpose4(float32<N>& a0, float32<N>& a1,
311                 float32<N>& a2, float32<N>& a3)
312 {
313     detail::insn::i_transpose4(a0, a1, a2, a3);
314 }
315 
316 } // namespace SIMDPP_ARCH_NAMESPACE
317 } // namespace simdpp
318 
319 #endif
320 
321