1 /*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>
2 
3     Distributed under the Boost Software License, Version 1.0.
4         (See accompanying file LICENSE_1_0.txt or copy at
5             http://www.boost.org/LICENSE_1_0.txt)
6 */
7 
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_SPLIT_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_SPLIT_H
10 
11 #ifndef LIBSIMDPP_SIMD_H
12     #error "This file must be included through simd.h"
13 #endif
14 
15 #include <simdpp/types.h>
16 
17 namespace simdpp {
18 namespace SIMDPP_ARCH_NAMESPACE {
19 namespace detail {
20 namespace insn {
21 
22 
#if SIMDPP_USE_AVX2
// Splits a 256-bit uint8<32> vector into two 128-bit uint8<16> halves.
//   a  - vector to split
//   r1 - receives the lower 128 bits of a
//   r2 - receives the upper 128 bits of a
static SIMDPP_INL void i_split(const uint8<32>& a, uint8<16>& r1, uint8<16>& r2)
{
    const __m256i whole = a.native();
    // The lower half is a plain reinterpretation of the register.
    r1 = _mm256_castsi256_si128(whole);
    // Index 1 selects the upper 128-bit lane.
    r2 = _mm256_extracti128_si256(whole, 1);
}
#endif
31 
#if SIMDPP_USE_AVX512BW
// Splits a 512-bit uint8<64> vector into two 256-bit uint8<32> halves.
//   a  - vector to split
//   r1 - receives the lower 256 bits of a
//   r2 - receives the upper 256 bits of a
// Declared `static` like the other non-template i_split overloads in this
// file so that all of them share the same linkage.
static SIMDPP_INL
void i_split(const uint8<64>& a, uint8<32>& r1, uint8<32>& r2)
{
    // The lower half is a plain reinterpretation of the register.
    r1 = _mm512_castsi512_si256(a.native());
    // Index 1 selects the upper 256 bits.
    r2 = _mm512_extracti64x4_epi64(a.native(), 1);
}
#endif
39 
40 // -----------------------------------------------------------------------------
41 
#if SIMDPP_USE_AVX2
// Splits a 256-bit uint16<16> vector into two 128-bit uint16<8> halves.
//   a  - vector to split
//   r1 - receives the lower 128 bits of a
//   r2 - receives the upper 128 bits of a
static SIMDPP_INL void i_split(const uint16<16>& a, uint16<8>& r1, uint16<8>& r2)
{
    const __m256i whole = a.native();
    // The lower half is a plain reinterpretation of the register.
    r1 = _mm256_castsi256_si128(whole);
    // Index 1 selects the upper 128-bit lane.
    r2 = _mm256_extracti128_si256(whole, 1);
}
#endif
50 
#if SIMDPP_USE_AVX512BW
// Splits a 512-bit uint16<32> vector into two 256-bit uint16<16> halves.
//   a  - vector to split
//   r1 - receives the lower 256 bits of a
//   r2 - receives the upper 256 bits of a
// Declared `static` like the other non-template i_split overloads in this
// file so that all of them share the same linkage.
static SIMDPP_INL
void i_split(const uint16<32>& a, uint16<16>& r1, uint16<16>& r2)
{
    // The lower half is a plain reinterpretation of the register.
    r1 = _mm512_castsi512_si256(a.native());
    // Index 1 selects the upper 256 bits.
    r2 = _mm512_extracti64x4_epi64(a.native(), 1);
}
#endif
58 
59 // -----------------------------------------------------------------------------
60 
#if SIMDPP_USE_AVX2
// Splits a 256-bit uint32<8> vector into two 128-bit uint32<4> halves.
//   a  - vector to split
//   r1 - receives the lower 128 bits of a
//   r2 - receives the upper 128 bits of a
static SIMDPP_INL void i_split(const uint32<8>& a, uint32<4>& r1, uint32<4>& r2)
{
    const __m256i whole = a.native();
    // The lower half is a plain reinterpretation of the register.
    r1 = _mm256_castsi256_si128(whole);
    // Index 1 selects the upper 128-bit lane.
    r2 = _mm256_extracti128_si256(whole, 1);
}
#endif
69 
#if SIMDPP_USE_AVX512F
// Splits a 512-bit uint32<16> vector into two 256-bit uint32<8> halves.
//   a  - vector to split
//   r1 - receives the lower 256 bits of a
//   r2 - receives the upper 256 bits of a
static SIMDPP_INL void i_split(const uint32<16>& a, uint32<8>& r1, uint32<8>& r2)
{
    const __m512i whole = a.native();
    // The lower half is a plain reinterpretation of the register.
    r1 = _mm512_castsi512_si256(whole);
    // Index 1 selects the upper 256 bits.
    r2 = _mm512_extracti64x4_epi64(whole, 1);
}
#endif
78 
79 // -----------------------------------------------------------------------------
80 
#if SIMDPP_USE_AVX2
// Splits a 256-bit uint64<4> vector into two 128-bit uint64<2> halves.
//   a  - vector to split
//   r1 - receives the lower 128 bits of a
//   r2 - receives the upper 128 bits of a
static SIMDPP_INL void i_split(const uint64<4>& a, uint64<2>& r1, uint64<2>& r2)
{
    const __m256i whole = a.native();
    // The lower half is a plain reinterpretation of the register.
    r1 = _mm256_castsi256_si128(whole);
    // Index 1 selects the upper 128-bit lane.
    r2 = _mm256_extracti128_si256(whole, 1);
}
#endif
89 
#if SIMDPP_USE_AVX512F
// Splits a 512-bit uint64<8> vector into two 256-bit uint64<4> halves.
//   a  - vector to split
//   r1 - receives the lower 256 bits of a
//   r2 - receives the upper 256 bits of a
static SIMDPP_INL void i_split(const uint64<8>& a, uint64<4>& r1, uint64<4>& r2)
{
    const __m512i whole = a.native();
    // The lower half is a plain reinterpretation of the register.
    r1 = _mm512_castsi512_si256(whole);
    // Index 1 selects the upper 256 bits.
    r2 = _mm512_extracti64x4_epi64(whole, 1);
}
#endif
98 
99 // -----------------------------------------------------------------------------
100 
#if SIMDPP_USE_AVX
// Splits a 256-bit float32<8> vector into two 128-bit float32<4> halves.
//   a  - vector to split
//   r1 - receives the lower 128 bits of a
//   r2 - receives the upper 128 bits of a
static SIMDPP_INL void i_split(const float32<8>& a, float32<4>& r1, float32<4>& r2)
{
    const __m256 whole = a.native();
    // The lower half is a plain reinterpretation of the register.
    r1 = _mm256_castps256_ps128(whole);
    // Index 1 selects the upper 128-bit lane.
    r2 = _mm256_extractf128_ps(whole, 1);
}
#endif
109 
#if SIMDPP_USE_AVX512F
// Splits a 512-bit float32<16> vector into two 256-bit float32<8> halves.
//   a  - vector to split
//   r1 - receives the lower 256 bits of a
//   r2 - receives the upper 256 bits of a
static SIMDPP_INL void i_split(const float32<16>& a, float32<8>& r1, float32<8>& r2)
{
    const __m512 whole = a.native();

    // The lower half is a plain reinterpretation of the register.
    r1 = _mm512_castps512_ps256(whole);

    // The upper half is fetched through the float64 extract: view the register
    // as doubles, pull out the upper 256 bits, then view the result as floats
    // again. The casts only reinterpret bits.
    // NOTE(review): presumably done this way because the float32 256-bit
    // extract is not part of AVX512F -- confirm.
    const __m512d whole_pd = _mm512_castps_pd(whole);
    const __m256d upper_pd = _mm512_extractf64x4_pd(whole_pd, 1);
    r2 = _mm256_castpd_ps(upper_pd);
}
#endif
118 
119 // -----------------------------------------------------------------------------
120 
#if SIMDPP_USE_AVX
// Splits a 256-bit float64<4> vector into two 128-bit float64<2> halves.
//   a  - vector to split
//   r1 - receives the lower 128 bits of a
//   r2 - receives the upper 128 bits of a
static SIMDPP_INL void i_split(const float64<4>& a, float64<2>& r1, float64<2>& r2)
{
    const __m256d whole = a.native();
    // The lower half is a plain reinterpretation of the register.
    r1 = _mm256_castpd256_pd128(whole);
    // Index 1 selects the upper 128-bit lane.
    r2 = _mm256_extractf128_pd(whole, 1);
}
#endif
129 
#if SIMDPP_USE_AVX512F
// Splits a 512-bit float64<8> vector into two 256-bit float64<4> halves.
//   a  - vector to split
//   r1 - receives the lower 256 bits of a
//   r2 - receives the upper 256 bits of a
static SIMDPP_INL void i_split(const float64<8>& a, float64<4>& r1, float64<4>& r2)
{
    const __m512d whole = a.native();

    // Both halves use the extract intrinsic. The lower half is deliberately
    // NOT obtained with
    //     r1 = _mm512_castpd512_pd256(a.native());
    // because of a GCC bug; extracting index 0 is the workaround.
    r1 = _mm512_extractf64x4_pd(whole, 0);
    r2 = _mm512_extractf64x4_pd(whole, 1);
}
#endif
139 
140 // -----------------------------------------------------------------------------
141 // generic version -- picked up if none of the above matches the arguments
142 
143 template<class V, class H> SIMDPP_INL
i_split(const V & a,H & r1,H & r2)144 void i_split(const V& a, H& r1, H& r2)
145 {
146     unsigned h = H::vec_length;
147     for (unsigned i = 0; i < h; ++i) { r1.vec(i) = a.vec(i); }
148     for (unsigned i = 0; i < h; ++i) { r2.vec(i) = a.vec(i+h); }
149 }
150 
151 
152 } // namespace insn
153 } // namespace detail
154 } // namespace SIMDPP_ARCH_NAMESPACE
155 } // namespace simdpp
156 
157 #endif
158