1 /*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>
2 
3     Distributed under the Boost Software License, Version 1.0.
4         (See accompanying file LICENSE_1_0.txt or copy at
5             http://www.boost.org/LICENSE_1_0.txt)
6 */
7 
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_PACKED2_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_PACKED2_H
10 
11 #ifndef LIBSIMDPP_SIMD_H
12     #error "This file must be included through simd.h"
13 #endif
14 
15 #include <simdpp/types.h>
16 #include <simdpp/detail/align.h>
17 #include <simdpp/detail/insn/mem_unpack.h>
18 #include <simdpp/core/load.h>
19 #include <simdpp/detail/null/memory.h>
20 
21 namespace simdpp {
22 namespace SIMDPP_ARCH_NAMESPACE {
23 namespace detail {
24 namespace insn {
25 
26 
27 // collect some boilerplate
28 template<class V> SIMDPP_INL
29 void v128_load_packed2(V& a, V& b, const char* p);
30 template<class V> SIMDPP_INL
31 void v256_load_packed2(V& a, V& b, const char* p);
32 template<class V> SIMDPP_INL
33 void v512_load_packed2(V& a, V& b, const char* p);
34 
35 // -----------------------------------------------------------------------------
36 
37 static SIMDPP_INL
i_load_packed2(uint8x16 & a,uint8x16 & b,const char * p)38 void i_load_packed2(uint8x16& a, uint8x16& b, const char* p)
39 {
40     p = detail::assume_aligned(p, 16);
41 #if SIMDPP_USE_NULL
42     detail::null::load_packed2(a, b, p);
43 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
44     v128_load_packed2(a, b, p);
45 #elif SIMDPP_USE_NEON
46     auto r = vld2q_u8(reinterpret_cast<const uint8_t*>(p));
47     a = r.val[0];
48     b = r.val[1];
49 #endif
50 }
51 
#if SIMDPP_USE_AVX2
// 256-bit overload for 8-bit integer lanes; compiled only when
// SIMDPP_USE_AVX2 is set. Delegates to the shared 256-bit helper.
static SIMDPP_INL void i_load_packed2(uint8x32& a, uint8x32& b, const char* p)
{
    v256_load_packed2<uint8x32>(a, b, p);
}
#endif // SIMDPP_USE_AVX2
59 
#if SIMDPP_USE_AVX512BW
// 512-bit overload for 8-bit integer lanes; compiled only when
// SIMDPP_USE_AVX512BW is set. Delegates to the shared 512-bit helper.
// Declared `static` so its linkage matches the other non-template
// i_load_packed2 overloads in this header.
static SIMDPP_INL
void i_load_packed2(uint8<64>& a, uint8<64>& b, const char* p)
{
    v512_load_packed2(a, b, p);
}
#endif
66 
67 // -----------------------------------------------------------------------------
68 
69 static SIMDPP_INL
i_load_packed2(uint16x8 & a,uint16x8 & b,const char * p)70 void i_load_packed2(uint16x8& a, uint16x8& b, const char* p)
71 {
72     p = detail::assume_aligned(p, 16);
73 #if SIMDPP_USE_NULL
74     detail::null::load_packed2(a, b, p);
75 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
76     v128_load_packed2(a, b, p);
77 #elif SIMDPP_USE_NEON
78     auto r = vld2q_u16(reinterpret_cast<const uint16_t*>(p));
79     a = r.val[0];
80     b = r.val[1];
81 #endif
82 }
83 
#if SIMDPP_USE_AVX2
// 256-bit overload for 16-bit integer lanes; compiled only when
// SIMDPP_USE_AVX2 is set. Delegates to the shared 256-bit helper.
static SIMDPP_INL void i_load_packed2(uint16x16& a, uint16x16& b, const char* p)
{
    v256_load_packed2<uint16x16>(a, b, p);
}
#endif // SIMDPP_USE_AVX2
91 
#if SIMDPP_USE_AVX512BW
// 512-bit overload for 16-bit integer lanes; compiled only when
// SIMDPP_USE_AVX512BW is set. Delegates to the shared 512-bit helper.
// Declared `static` so its linkage matches the other non-template
// i_load_packed2 overloads in this header.
static SIMDPP_INL
void i_load_packed2(uint16<32>& a, uint16<32>& b, const char* p)
{
    v512_load_packed2(a, b, p);
}
#endif
98 
99 // -----------------------------------------------------------------------------
100 
101 static SIMDPP_INL
i_load_packed2(uint32x4 & a,uint32x4 & b,const char * p)102 void i_load_packed2(uint32x4& a, uint32x4& b, const char* p)
103 {
104     p = detail::assume_aligned(p, 16);
105 #if SIMDPP_USE_NULL
106     detail::null::load_packed2(a, b, p);
107 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
108     v128_load_packed2(a, b, p);
109 #elif SIMDPP_USE_NEON
110     auto r = vld2q_u32(reinterpret_cast<const uint32_t*>(p));
111     a = r.val[0];
112     b = r.val[1];
113 #endif
114 }
115 
#if SIMDPP_USE_AVX2
// 256-bit overload for 32-bit integer lanes; compiled only when
// SIMDPP_USE_AVX2 is set. Delegates to the shared 256-bit helper.
static SIMDPP_INL void i_load_packed2(uint32x8& a, uint32x8& b, const char* p)
{
    v256_load_packed2<uint32x8>(a, b, p);
}
#endif // SIMDPP_USE_AVX2
123 
#if SIMDPP_USE_AVX512F
// 512-bit overload for 32-bit integer lanes; compiled only when
// SIMDPP_USE_AVX512F is set. Delegates to the shared 512-bit helper.
static SIMDPP_INL void i_load_packed2(uint32<16>& a, uint32<16>& b, const char* p)
{
    v512_load_packed2<uint32<16>>(a, b, p);
}
#endif // SIMDPP_USE_AVX512F
131 
132 // -----------------------------------------------------------------------------
133 
134 static SIMDPP_INL
i_load_packed2(uint64x2 & a,uint64x2 & b,const char * p)135 void i_load_packed2(uint64x2& a, uint64x2& b, const char* p)
136 {
137     p = detail::assume_aligned(p, 16);
138 #if SIMDPP_USE_NEON64
139     auto r = vld2q_u64(reinterpret_cast<const uint64_t*>(p));
140     a = r.val[0];
141     b = r.val[1];
142 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
143     v128_load_packed2(a, b, p);
144 #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
145     detail::null::load_packed2(a, b, p);
146 #endif
147 }
148 
#if SIMDPP_USE_AVX2
// 256-bit overload for 64-bit integer lanes; compiled only when
// SIMDPP_USE_AVX2 is set. Delegates to the shared 256-bit helper.
static SIMDPP_INL void i_load_packed2(uint64x4& a, uint64x4& b, const char* p)
{
    v256_load_packed2<uint64x4>(a, b, p);
}
#endif // SIMDPP_USE_AVX2
156 
#if SIMDPP_USE_AVX512F
// 512-bit overload for 64-bit integer lanes; compiled only when
// SIMDPP_USE_AVX512F is set. Delegates to the shared 512-bit helper.
static SIMDPP_INL void i_load_packed2(uint64<8>& a, uint64<8>& b, const char* p)
{
    v512_load_packed2<uint64<8>>(a, b, p);
}
#endif // SIMDPP_USE_AVX512F
164 
165 // -----------------------------------------------------------------------------
166 
167 static SIMDPP_INL
i_load_packed2(float32x4 & a,float32x4 & b,const char * p)168 void i_load_packed2(float32x4& a, float32x4& b, const char* p)
169 {
170     p = detail::assume_aligned(p, 16);
171 #if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
172     detail::null::load_packed2(a, b, p);
173 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
174     v128_load_packed2(a, b, p);
175 #elif SIMDPP_USE_NEON
176     auto r = vld2q_f32(reinterpret_cast<const float*>(p));
177     a = r.val[0];
178     b = r.val[1];
179 #endif
180 }
181 
#if SIMDPP_USE_AVX
// 256-bit overload for 32-bit float lanes; compiled only when
// SIMDPP_USE_AVX is set. Delegates to the shared 256-bit helper.
static SIMDPP_INL void i_load_packed2(float32x8& a, float32x8& b, const char* p)
{
    v256_load_packed2<float32x8>(a, b, p);
}
#endif // SIMDPP_USE_AVX
189 
#if SIMDPP_USE_AVX512F
// 512-bit overload for 32-bit float lanes; compiled only when
// SIMDPP_USE_AVX512F is set. Delegates to the shared 512-bit helper.
static SIMDPP_INL void i_load_packed2(float32<16>& a, float32<16>& b, const char* p)
{
    v512_load_packed2<float32<16>>(a, b, p);
}
#endif // SIMDPP_USE_AVX512F
197 
198 // -----------------------------------------------------------------------------
199 
200 static SIMDPP_INL
i_load_packed2(float64x2 & a,float64x2 & b,const char * p)201 void i_load_packed2(float64x2& a, float64x2& b, const char* p)
202 {
203     p = detail::assume_aligned(p, 16);
204 #if SIMDPP_USE_NEON64
205     auto r = vld2q_f64(reinterpret_cast<const double*>(p));
206     a = r.val[0];
207     b = r.val[1];
208 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
209     v128_load_packed2(a, b, p);
210 #elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
211     detail::null::load_packed2(a, b, p);
212 #endif
213 }
214 
#if SIMDPP_USE_AVX
// 256-bit overload for 64-bit float lanes; compiled only when
// SIMDPP_USE_AVX is set. Delegates to the shared 256-bit helper.
static SIMDPP_INL void i_load_packed2(float64x4& a, float64x4& b, const char* p)
{
    v256_load_packed2<float64x4>(a, b, p);
}
#endif // SIMDPP_USE_AVX
222 
#if SIMDPP_USE_AVX512F
// 512-bit overload for 64-bit float lanes; compiled only when
// SIMDPP_USE_AVX512F is set. Delegates to the shared 512-bit helper.
static SIMDPP_INL void i_load_packed2(float64<8>& a, float64<8>& b, const char* p)
{
    v512_load_packed2<float64<8>>(a, b, p);
}
#endif // SIMDPP_USE_AVX512F
230 
231 // -----------------------------------------------------------------------------
232 
233 template<class V> SIMDPP_INL
v128_load_packed2(V & a,V & b,const char * p)234 void v128_load_packed2(V& a, V& b, const char* p)
235 {
236     p = detail::assume_aligned(p, 16);
237     a = load(p);
238     b = load(p + 16);
239     mem_unpack2(a, b);
240 }
241 
242 template<class V> SIMDPP_INL
v256_load_packed2(V & a,V & b,const char * p)243 void v256_load_packed2(V& a, V& b, const char* p)
244 {
245     p = detail::assume_aligned(p, 32);
246     a = load(p);
247     b = load(p + 32);
248     mem_unpack2(a, b);
249 }
250 
251 template<class V> SIMDPP_INL
v512_load_packed2(V & a,V & b,const char * p)252 void v512_load_packed2(V& a, V& b, const char* p)
253 {
254     p = detail::assume_aligned(p, 64);
255     a = load(p);
256     b = load(p + 64);
257     mem_unpack2(a, b);
258 }
259 
260 template<class V> SIMDPP_INL
i_load_packed2(V & a,V & b,const char * p)261 void i_load_packed2(V& a, V& b, const char* p)
262 {
263     const unsigned veclen = V::base_vector_type::length_bytes;
264 
265     p = detail::assume_aligned(p, veclen);
266     for (unsigned i = 0; i < V::vec_length; ++i) {
267         i_load_packed2(a.vec(i), b.vec(i), p);
268         p += veclen*2;
269     }
270 }
271 
272 
273 } // namespace insn
274 } // namespace detail
275 } // namespace SIMDPP_ARCH_NAMESPACE
276 } // namespace simdpp
277 
278 #endif
279 
280