/*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/
7
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_PACKED3_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_PACKED3_H
10
11 #ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13 #endif
14
15 #include <simdpp/types.h>
16 #include <simdpp/detail/align.h>
17 #include <simdpp/detail/insn/mem_unpack.h>
18 #include <simdpp/core/load.h>
19 #include <simdpp/core/transpose.h>
20 #include <simdpp/detail/null/memory.h>
21
22 namespace simdpp {
23 namespace SIMDPP_ARCH_NAMESPACE {
24 namespace detail {
25 namespace insn {
26
27
28 // collect some boilerplate
29 template<class V> SIMDPP_INL
30 void v128_load_packed3(V& a, V& b, V& c, const char* p);
31 template<class V> SIMDPP_INL
32 void v256_load_packed3(V& a, V& b, V& c, const char* p);
33 template<class V> SIMDPP_INL
34 void v512_load_packed3(V& a, V& b, V& c, const char* p);
35
36 // -----------------------------------------------------------------------------
37
38 static SIMDPP_INL
i_load_packed3(uint8x16 & a,uint8x16 & b,uint8x16 & c,const char * p)39 void i_load_packed3(uint8x16& a, uint8x16& b, uint8x16& c, const char* p)
40 {
41 p = detail::assume_aligned(p, 16);
42 #if SIMDPP_USE_NULL
43 detail::null::load_packed3(a, b, c, p);
44 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
45 v128_load_packed3(a, b, c, p);
46 #elif SIMDPP_USE_NEON
47 auto r = vld3q_u8(reinterpret_cast<const uint8_t*>(p));
48 a = r.val[0];
49 b = r.val[1];
50 c = r.val[2];
51 #endif
52 }
53
#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_load_packed3(uint8x32& a, uint8x32& b, uint8x32& c, const char* p)
{
    // 256-bit vectors use the generic three-load + in-register unpack path.
    v256_load_packed3(a, b, c, p);
}
#endif
61
#if SIMDPP_USE_AVX512BW
// 512-bit vectors use the generic three-load + in-register unpack path.
SIMDPP_INL void i_load_packed3(uint8<64>& a, uint8<64>& b, uint8<64>& c, const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif
68
69 // -----------------------------------------------------------------------------
70
71 static SIMDPP_INL
i_load_packed3(uint16x8 & a,uint16x8 & b,uint16x8 & c,const char * p)72 void i_load_packed3(uint16x8& a, uint16x8& b, uint16x8& c,
73 const char* p)
74 {
75 p = detail::assume_aligned(p, 16);
76 #if SIMDPP_USE_NULL
77 detail::null::load_packed3(a, b, c, p);
78 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
79 v128_load_packed3(a, b, c, p);
80 #elif SIMDPP_USE_NEON
81 auto r = vld3q_u16(reinterpret_cast<const uint16_t*>(p));
82 a = r.val[0];
83 b = r.val[1];
84 c = r.val[2];
85 #endif
86 }
87
#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_load_packed3(uint16x16& a, uint16x16& b, uint16x16& c,
                    const char* p)
{
    // 256-bit vectors use the generic three-load + in-register unpack path.
    v256_load_packed3(a, b, c, p);
}
#endif
96
#if SIMDPP_USE_AVX512BW
// 512-bit vectors use the generic three-load + in-register unpack path.
SIMDPP_INL void i_load_packed3(uint16<32>& a, uint16<32>& b, uint16<32>& c,
                               const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif
104
105 // -----------------------------------------------------------------------------
106
107 static SIMDPP_INL
i_load_packed3(uint32x4 & a,uint32x4 & b,uint32x4 & c,const char * p)108 void i_load_packed3(uint32x4& a, uint32x4& b, uint32x4&c, const char* p)
109 {
110 p = detail::assume_aligned(p, 16);
111 #if SIMDPP_USE_NULL
112 detail::null::load_packed3(a, b, c, p);
113 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
114 v128_load_packed3(a, b, c, p);
115 #elif SIMDPP_USE_NEON
116 auto r = vld3q_u32(reinterpret_cast<const uint32_t*>(p));
117 a = r.val[0];
118 b = r.val[1];
119 c = r.val[2];
120 #endif
121 }
122
#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_load_packed3(uint32x8& a, uint32x8& b, uint32x8& c, const char* p)
{
    // 256-bit vectors use the generic three-load + in-register unpack path.
    v256_load_packed3(a, b, c, p);
}
#endif
130
#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_load_packed3(uint32<16>& a, uint32<16>& b, uint32<16>& c, const char* p)
{
    // 512-bit vectors use the generic three-load + in-register unpack path.
    v512_load_packed3(a, b, c, p);
}
#endif
138
139 // -----------------------------------------------------------------------------
140
141 static SIMDPP_INL
i_load_packed3(uint64x2 & a,uint64x2 & b,uint64x2 & c,const char * p)142 void i_load_packed3(uint64x2& a, uint64x2& b, uint64x2& c, const char* p)
143 {
144 p = detail::assume_aligned(p, 16);
145 #if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
146 v128_load_packed3(a, b, c, p);
147 #elif SIMDPP_USE_NEON64
148 auto r = vld3q_u64(reinterpret_cast<const uint64_t*>(p));
149 a = r.val[0];
150 b = r.val[1];
151 c = r.val[2];
152 #elif SIMDPP_USE_NEON32
153 uint64x2 a0, b0, c0;
154 a0 = load(p);
155 b0 = load(p+16);
156 c0 = load(p+32);
157
158 int64x1_t al, bl, cl, ah, bh, ch;
159 al = vget_low_u64(a0.native());
160 ah = vget_high_u64(a0.native());
161 bl = vget_low_u64(b0.native());
162 bh = vget_high_u64(b0.native());
163 cl = vget_low_u64(c0.native());
164 ch = vget_high_u64(c0.native());
165 a = vcombine_u64(al, bh);
166 b = vcombine_u64(ah, cl);
167 c = vcombine_u64(bl, ch);
168 #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
169 detail::null::load_packed3(a, b, c, p);
170 #endif
171 }
172
#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_load_packed3(uint64x4& a, uint64x4& b, uint64x4& c, const char* p)
{
    // 256-bit vectors use the generic three-load + in-register unpack path.
    v256_load_packed3(a, b, c, p);
}
#endif
180
#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_load_packed3(uint64<8>& a, uint64<8>& b, uint64<8>& c,
                    const char* p)
{
    // 512-bit vectors use the generic three-load + in-register unpack path.
    v512_load_packed3(a, b, c, p);
}
#endif
189
190 // -----------------------------------------------------------------------------
191
192 static SIMDPP_INL
i_load_packed3(float32x4 & a,float32x4 & b,float32x4 & c,const char * p)193 void i_load_packed3(float32x4& a, float32x4& b, float32x4& c, const char* p)
194 {
195 p = detail::assume_aligned(p, 16);
196 #if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
197 detail::null::load_packed3(a, b, c, p);
198 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
199 v128_load_packed3(a, b, c, p);
200 #elif SIMDPP_USE_NEON
201 auto r = vld3q_f32(reinterpret_cast<const float*>(p));
202 a = r.val[0];
203 b = r.val[1];
204 c = r.val[2];
205 #endif
206 }
207
#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_load_packed3(float32x8& a, float32x8& b, float32x8& c, const char* p)
{
    // 256-bit vectors use the generic three-load + in-register unpack path.
    v256_load_packed3(a, b, c, p);
}
#endif
215
#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_load_packed3(float32<16>& a, float32<16>& b, float32<16>& c,
                    const char* p)
{
    // 512-bit vectors use the generic three-load + in-register unpack path.
    v512_load_packed3(a, b, c, p);
}
#endif
224
225 // -----------------------------------------------------------------------------
226
227 static SIMDPP_INL
i_load_packed3(float64x2 & a,float64x2 & b,float64x2 & c,const char * p)228 void i_load_packed3(float64x2& a, float64x2& b, float64x2& c, const char* p)
229 {
230 p = detail::assume_aligned(p, 16);
231 #if SIMDPP_USE_NEON64
232 auto r = vld3q_f64(reinterpret_cast<const double*>(p));
233 a = r.val[0];
234 b = r.val[1];
235 c = r.val[2];
236 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
237 v128_load_packed3(a, b, c, p);
238 #elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
239 detail::null::load_packed3(a, b, c, p);
240 #endif
241 }
242
#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_load_packed3(float64x4& a, float64x4& b, float64x4& c,
                    const char* p)
{
    // 256-bit vectors use the generic three-load + in-register unpack path.
    v256_load_packed3(a, b, c, p);
}
#endif
251
#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_load_packed3(float64<8>& a, float64<8>& b, float64<8>& c,
                    const char* p)
{
    // 512-bit vectors use the generic three-load + in-register unpack path.
    v512_load_packed3(a, b, c, p);
}
#endif
260
261 // -----------------------------------------------------------------------------
262
263 template<class V> SIMDPP_INL
v128_load_packed3(V & a,V & b,V & c,const char * p)264 void v128_load_packed3(V& a, V& b, V& c, const char* p)
265 {
266 p = detail::assume_aligned(p, 16);
267 a = load(p);
268 b = load(p + 16);
269 c = load(p + 32);
270 mem_unpack3(a, b, c);
271 }
272
273 template<class V> SIMDPP_INL
v256_load_packed3(V & a,V & b,V & c,const char * p)274 void v256_load_packed3(V& a, V& b, V& c, const char* p)
275 {
276 p = detail::assume_aligned(p, 32);
277 a = load(p);
278 b = load(p + 32);
279 c = load(p + 64);
280 mem_unpack3(a, b, c);
281 }
282
283 template<class V> SIMDPP_INL
v512_load_packed3(V & a,V & b,V & c,const char * p)284 void v512_load_packed3(V& a, V& b, V& c, const char* p)
285 {
286 p = detail::assume_aligned(p, 64);
287 a = load(p);
288 b = load(p + 64);
289 c = load(p + 128);
290 mem_unpack3(a, b, c);
291 }
292
293
294 template<class V> SIMDPP_INL
i_load_packed3(V & a,V & b,V & c,const char * p)295 void i_load_packed3(V& a, V& b, V& c, const char* p)
296 {
297 const unsigned veclen = V::base_vector_type::length_bytes;
298
299 p = detail::assume_aligned(p, veclen);
300 for (unsigned i = 0; i < V::vec_length; ++i) {
301 i_load_packed3(a.vec(i), b.vec(i), c.vec(i), p);
302 p += veclen*3;
303 }
304 }
305
306 } // namespace insn
307 } // namespace detail
308 } // namespace SIMDPP_ARCH_NAMESPACE
309 } // namespace simdpp
310
311 #endif
312
313