/*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_PACKED3_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_PACKED3_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/align.h>
#include <simdpp/detail/insn/mem_unpack.h>
#include <simdpp/core/load.h>
#include <simdpp/core/transpose.h>
#include <simdpp/detail/null/memory.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

// Forward declarations of the generic helpers that collect the boilerplate
// shared by the overloads below; the definitions are at the end of this file.
template<class V> SIMDPP_INL
void v128_load_packed3(V& a, V& b, V& c, const char* p);
template<class V> SIMDPP_INL
void v256_load_packed3(V& a, V& b, V& c, const char* p);
template<class V> SIMDPP_INL
void v512_load_packed3(V& a, V& b, V& c, const char* p);

// -----------------------------------------------------------------------------

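// Each i_load_packed3 overload below reads three vectors' worth of
// interleaved {a, b, c} triplets starting at p and de-interleaves them into
// a, b and c. The implementation is selected per instruction set: the NULL
// backend de-interleaves with scalar code, NEON uses the dedicated vld3q_*
// instructions, and the remaining targets load three contiguous vectors and
// shuffle them in registers (see the v*_load_packed3 helpers below).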
static SIMDPP_INL
void i_load_packed3(uint8x16& a, uint8x16& b, uint8x16& c, const char* p)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::load_packed3(a, b, c, p);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_load_packed3(a, b, c, p);
#elif SIMDPP_USE_NEON
    auto r = vld3q_u8(reinterpret_cast<const uint8_t*>(p));
    a = r.val[0];
    b = r.val[1];
    c = r.val[2];
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_load_packed3(uint8x32& a, uint8x32& b, uint8x32& c, const char* p)
{
    v256_load_packed3(a, b, c, p);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL void i_load_packed3(uint8<64>& a, uint8<64>& b, uint8<64>& c, const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_load_packed3(uint16x8& a, uint16x8& b, uint16x8& c,
                    const char* p)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::load_packed3(a, b, c, p);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_load_packed3(a, b, c, p);
#elif SIMDPP_USE_NEON
    auto r = vld3q_u16(reinterpret_cast<const uint16_t*>(p));
    a = r.val[0];
    b = r.val[1];
    c = r.val[2];
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_load_packed3(uint16x16& a, uint16x16& b, uint16x16& c,
                    const char* p)
{
    v256_load_packed3(a, b, c, p);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL void i_load_packed3(uint16<32>& a, uint16<32>& b, uint16<32>& c,
                               const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_load_packed3(uint32x4& a, uint32x4& b, uint32x4& c, const char* p)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::load_packed3(a, b, c, p);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_load_packed3(a, b, c, p);
#elif SIMDPP_USE_NEON
    auto r = vld3q_u32(reinterpret_cast<const uint32_t*>(p));
    a = r.val[0];
    b = r.val[1];
    c = r.val[2];
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_load_packed3(uint32x8& a, uint32x8& b, uint32x8& c, const char* p)
{
    v256_load_packed3(a, b, c, p);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_load_packed3(uint32<16>& a, uint32<16>& b, uint32<16>& c, const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_load_packed3(uint64x2& a, uint64x2& b, uint64x2& c, const char* p)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
    v128_load_packed3(a, b, c, p);
#elif SIMDPP_USE_NEON64
    auto r = vld3q_u64(reinterpret_cast<const uint64_t*>(p));
    a = r.val[0];
    b = r.val[1];
    c = r.val[2];
#elif SIMDPP_USE_NEON32
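    // 32-bit NEON has no vld3q_u64, so load three contiguous vectors and
    // de-interleave the 64-bit lanes by hand: the six consecutive elements
    // a0,b0,c0,a1,b1,c1 are regrouped into {a0,a1}, {b0,b1} and {c0,c1}.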
    uint64x2 a0, b0, c0;
    a0 = load(p);
    b0 = load(p+16);
    c0 = load(p+32);

    uint64x1_t al, bl, cl, ah, bh, ch;
    al = vget_low_u64(a0.native());
    ah = vget_high_u64(a0.native());
    bl = vget_low_u64(b0.native());
    bh = vget_high_u64(b0.native());
    cl = vget_low_u64(c0.native());
    ch = vget_high_u64(c0.native());
    a = vcombine_u64(al, bh);
    b = vcombine_u64(ah, cl);
    c = vcombine_u64(bl, ch);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    detail::null::load_packed3(a, b, c, p);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_load_packed3(uint64x4& a, uint64x4& b, uint64x4& c, const char* p)
{
    v256_load_packed3(a, b, c, p);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_load_packed3(uint64<8>& a, uint64<8>& b, uint64<8>& c,
                    const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_load_packed3(float32x4& a, float32x4& b, float32x4& c, const char* p)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    detail::null::load_packed3(a, b, c, p);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_load_packed3(a, b, c, p);
#elif SIMDPP_USE_NEON
    auto r = vld3q_f32(reinterpret_cast<const float*>(p));
    a = r.val[0];
    b = r.val[1];
    c = r.val[2];
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_load_packed3(float32x8& a, float32x8& b, float32x8& c, const char* p)
{
    v256_load_packed3(a, b, c, p);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_load_packed3(float32<16>& a, float32<16>& b, float32<16>& c,
                    const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_load_packed3(float64x2& a, float64x2& b, float64x2& c, const char* p)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NEON64
    auto r = vld3q_f64(reinterpret_cast<const double*>(p));
    a = r.val[0];
    b = r.val[1];
    c = r.val[2];
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    v128_load_packed3(a, b, c, p);
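    // The remaining targets (NULL backend, 32-bit NEON, AltiVec without
    // VSX 2.06) have no native double-precision vectors; use the scalar path.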
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    detail::null::load_packed3(a, b, c, p);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_load_packed3(float64x4& a, float64x4& b, float64x4& c,
                    const char* p)
{
    v256_load_packed3(a, b, c, p);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_load_packed3(float64<8>& a, float64<8>& b, float64<8>& c,
                    const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif

// -----------------------------------------------------------------------------

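// Generic helpers: load three contiguous vectors of the given width and
// de-interleave them in registers via mem_unpack3().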
template<class V> SIMDPP_INL
void v128_load_packed3(V& a, V& b, V& c, const char* p)
{
    p = detail::assume_aligned(p, 16);
    a = load(p);
    b = load(p + 16);
    c = load(p + 32);
    mem_unpack3(a, b, c);
}

template<class V> SIMDPP_INL
void v256_load_packed3(V& a, V& b, V& c, const char* p)
{
    p = detail::assume_aligned(p, 32);
    a = load(p);
    b = load(p + 32);
    c = load(p + 64);
    mem_unpack3(a, b, c);
}

template<class V> SIMDPP_INL
void v512_load_packed3(V& a, V& b, V& c, const char* p)
{
    p = detail::assume_aligned(p, 64);
    a = load(p);
    b = load(p + 64);
    c = load(p + 128);
    mem_unpack3(a, b, c);
}

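// Catch-all overload for vectors wider than the native vector size: handle
// one native sub-vector of a, b and c per iteration, consuming three native
// vectors' worth of packed data at each step.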
template<class V> SIMDPP_INL
void i_load_packed3(V& a, V& b, V& c, const char* p)
{
    const unsigned veclen = V::base_vector_type::length_bytes;

    p = detail::assume_aligned(p, veclen);
    for (unsigned i = 0; i < V::vec_length; ++i) {
        i_load_packed3(a.vec(i), b.vec(i), c.vec(i), p);
        p += veclen*3;
    }
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif